sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machine/machine_controller_noderef.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package machine 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 24 "github.com/pkg/errors" 25 corev1 "k8s.io/api/core/v1" 26 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 28 "k8s.io/klog/v2" 29 ctrl "sigs.k8s.io/controller-runtime" 30 "sigs.k8s.io/controller-runtime/pkg/client" 31 32 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 33 "sigs.k8s.io/cluster-api/api/v1beta1/index" 34 "sigs.k8s.io/cluster-api/internal/util/taints" 35 "sigs.k8s.io/cluster-api/util" 36 "sigs.k8s.io/cluster-api/util/annotations" 37 "sigs.k8s.io/cluster-api/util/conditions" 38 ) 39 40 var ( 41 // ErrNodeNotFound signals that a corev1.Node could not be found for the given provider id. 42 ErrNodeNotFound = errors.New("cannot find node with matching ProviderID") 43 ) 44 45 func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result, error) { 46 log := ctrl.LoggerFrom(ctx) 47 cluster := s.cluster 48 machine := s.machine 49 infraMachine := s.infraMachine 50 51 // Create a watch on the nodes in the Cluster. 52 if err := r.watchClusterNodes(ctx, cluster); err != nil { 53 return ctrl.Result{}, err 54 } 55 56 // Check that the Machine has a valid ProviderID. 57 if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" { 58 log.Info("Waiting for infrastructure provider to report spec.providerID", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Spec.InfrastructureRef.Namespace, machine.Spec.InfrastructureRef.Name)) 59 conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.WaitingForNodeRefReason, clusterv1.ConditionSeverityInfo, "") 60 return ctrl.Result{}, nil 61 } 62 63 remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) 64 if err != nil { 65 return ctrl.Result{}, err 66 } 67 68 // Even if Status.NodeRef exists, continue to do the following checks to make sure Node is healthy 69 node, err := r.getNode(ctx, remoteClient, *machine.Spec.ProviderID) 70 if err != nil { 71 if err == ErrNodeNotFound { 72 // While a NodeRef is set in the status, failing to get that node means the node is deleted. 73 // If Status.NodeRef is not set before, node still can be in the provisioning state. 74 if machine.Status.NodeRef != nil { 75 conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityError, "") 76 return ctrl.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace) 77 } 78 conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeProvisioningReason, clusterv1.ConditionSeverityWarning, "") 79 // No need to requeue here. Nodes emit an event that triggers reconciliation. 80 return ctrl.Result{}, nil 81 } 82 log.Error(err, "Failed to retrieve Node by ProviderID") 83 r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error()) 84 return ctrl.Result{}, err 85 } 86 87 // Set the Machine NodeRef. 88 if machine.Status.NodeRef == nil { 89 machine.Status.NodeRef = &corev1.ObjectReference{ 90 Kind: node.Kind, 91 APIVersion: node.APIVersion, 92 Name: node.Name, 93 UID: node.UID, 94 } 95 log.Info("Infrastructure provider reporting spec.providerID, Kubernetes node is now available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Spec.InfrastructureRef.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", *machine.Spec.ProviderID, "node", klog.KRef("", machine.Status.NodeRef.Name)) 96 r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name) 97 } 98 99 // Set the NodeSystemInfo. 100 machine.Status.NodeInfo = &node.Status.NodeInfo 101 102 // Compute all the annotations that CAPI is setting on nodes; 103 // CAPI only enforces some annotations and never changes or removes them. 104 nodeAnnotations := map[string]string{ 105 clusterv1.ClusterNameAnnotation: machine.Spec.ClusterName, 106 clusterv1.ClusterNamespaceAnnotation: machine.GetNamespace(), 107 clusterv1.MachineAnnotation: machine.Name, 108 } 109 if owner := metav1.GetControllerOfNoCopy(machine); owner != nil { 110 nodeAnnotations[clusterv1.OwnerKindAnnotation] = owner.Kind 111 nodeAnnotations[clusterv1.OwnerNameAnnotation] = owner.Name 112 } 113 114 // Compute labels to be propagated from Machines to nodes. 115 // NOTE: CAPI should manage only a subset of node labels, everything else should be preserved. 116 // NOTE: Once we reconcile node labels for the first time, the NodeUninitializedTaint is removed from the node. 117 nodeLabels := getManagedLabels(machine.Labels) 118 119 // Get interruptible instance status from the infrastructure provider and set the interruptible label on the node. 120 interruptible := false 121 found := false 122 if infraMachine != nil { 123 interruptible, found, err = unstructured.NestedBool(infraMachine.Object, "status", "interruptible") 124 if err != nil { 125 return ctrl.Result{}, errors.Wrapf(err, "failed to get status interruptible from infra machine %s", klog.KObj(infraMachine)) 126 } 127 // If interruptible is set and is true add the interruptible label to the node labels. 128 if found && interruptible { 129 nodeLabels[clusterv1.InterruptibleLabel] = "" 130 } 131 } 132 133 _, nodeHadInterruptibleLabel := node.Labels[clusterv1.InterruptibleLabel] 134 135 // Reconcile node taints 136 if err := r.patchNode(ctx, remoteClient, node, nodeLabels, nodeAnnotations); err != nil { 137 return ctrl.Result{}, errors.Wrapf(err, "failed to reconcile Node %s", klog.KObj(node)) 138 } 139 if !nodeHadInterruptibleLabel && interruptible { 140 // If the interruptible label is added to the node then record the event. 141 // Nb. Only record the event if the node previously did not have the label to avoid recording 142 // the event during every reconcile. 143 r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetInterruptibleNodeLabel", node.Name) 144 } 145 146 // Do the remaining node health checks, then set the node health to true if all checks pass. 147 status, message := summarizeNodeConditions(node) 148 if status == corev1.ConditionFalse { 149 conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, clusterv1.ConditionSeverityWarning, message) 150 return ctrl.Result{}, nil 151 } 152 if status == corev1.ConditionUnknown { 153 conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, message) 154 return ctrl.Result{}, nil 155 } 156 157 conditions.MarkTrue(machine, clusterv1.MachineNodeHealthyCondition) 158 return ctrl.Result{}, nil 159 } 160 161 // getManagedLabels gets a map[string]string and returns another map[string]string 162 // filtering out labels not managed by CAPI. 163 func getManagedLabels(labels map[string]string) map[string]string { 164 managedLabels := make(map[string]string) 165 for key, value := range labels { 166 dnsSubdomainOrName := strings.Split(key, "/")[0] 167 if dnsSubdomainOrName == clusterv1.NodeRoleLabelPrefix { 168 managedLabels[key] = value 169 } 170 if dnsSubdomainOrName == clusterv1.NodeRestrictionLabelDomain || strings.HasSuffix(dnsSubdomainOrName, "."+clusterv1.NodeRestrictionLabelDomain) { 171 managedLabels[key] = value 172 } 173 if dnsSubdomainOrName == clusterv1.ManagedNodeLabelDomain || strings.HasSuffix(dnsSubdomainOrName, "."+clusterv1.ManagedNodeLabelDomain) { 174 managedLabels[key] = value 175 } 176 } 177 178 return managedLabels 179 } 180 181 // summarizeNodeConditions summarizes a Node's conditions and returns the summary of condition statuses and concatenate failed condition messages: 182 // if there is at least 1 semantically-negative condition, summarized status = False; 183 // if there is at least 1 semantically-positive condition when there is 0 semantically negative condition, summarized status = True; 184 // if all conditions are unknown, summarized status = Unknown. 185 // (semantically true conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == false or Ready == true.) 186 func summarizeNodeConditions(node *corev1.Node) (corev1.ConditionStatus, string) { 187 semanticallyFalseStatus := 0 188 unknownStatus := 0 189 190 message := "" 191 for _, condition := range node.Status.Conditions { 192 switch condition.Type { 193 case corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure: 194 if condition.Status != corev1.ConditionFalse { 195 message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". " 196 if condition.Status == corev1.ConditionUnknown { 197 unknownStatus++ 198 continue 199 } 200 semanticallyFalseStatus++ 201 } 202 case corev1.NodeReady: 203 if condition.Status != corev1.ConditionTrue { 204 message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". " 205 if condition.Status == corev1.ConditionUnknown { 206 unknownStatus++ 207 continue 208 } 209 semanticallyFalseStatus++ 210 } 211 } 212 } 213 if semanticallyFalseStatus > 0 { 214 return corev1.ConditionFalse, message 215 } 216 if semanticallyFalseStatus+unknownStatus < 4 { 217 return corev1.ConditionTrue, message 218 } 219 return corev1.ConditionUnknown, message 220 } 221 222 func (r *Reconciler) getNode(ctx context.Context, c client.Reader, providerID string) (*corev1.Node, error) { 223 nodeList := corev1.NodeList{} 224 if err := c.List(ctx, &nodeList, client.MatchingFields{index.NodeProviderIDField: providerID}); err != nil { 225 return nil, err 226 } 227 if len(nodeList.Items) == 0 { 228 // If for whatever reason the index isn't registered or available, we fallback to loop over the whole list. 229 nl := corev1.NodeList{} 230 for { 231 if err := c.List(ctx, &nl, client.Continue(nl.Continue)); err != nil { 232 return nil, err 233 } 234 235 for _, node := range nl.Items { 236 if providerID == node.Spec.ProviderID { 237 return &node, nil 238 } 239 } 240 241 if nl.Continue == "" { 242 break 243 } 244 } 245 246 return nil, ErrNodeNotFound 247 } 248 249 if len(nodeList.Items) != 1 { 250 return nil, fmt.Errorf("unexpectedly found more than one Node matching the providerID %s", providerID) 251 } 252 253 return &nodeList.Items[0], nil 254 } 255 256 // PatchNode is required to workaround an issue on Node.Status.Address which is incorrectly annotated as patchStrategy=merge 257 // and this causes SSA patch to fail in case there are two addresses with the same key https://github.com/kubernetes-sigs/cluster-api/issues/8417 258 func (r *Reconciler) patchNode(ctx context.Context, remoteClient client.Client, node *corev1.Node, newLabels, newAnnotations map[string]string) error { 259 newNode := node.DeepCopy() 260 261 // Adds the annotations CAPI sets on the node. 262 hasAnnotationChanges := annotations.AddAnnotations(newNode, newAnnotations) 263 264 // Adds the labels from the Machine. 265 // NOTE: in order to handle deletion we are tracking the labels set from the Machine in an annotation. 266 // At the next reconcile we are going to use this for deleting labels previously set by the Machine, but 267 // not present anymore. Labels not set from machines should be always preserved. 268 if newNode.Labels == nil { 269 newNode.Labels = make(map[string]string) 270 } 271 hasLabelChanges := false 272 labelsFromPreviousReconcile := strings.Split(newNode.Annotations[clusterv1.LabelsFromMachineAnnotation], ",") 273 if len(labelsFromPreviousReconcile) == 1 && labelsFromPreviousReconcile[0] == "" { 274 labelsFromPreviousReconcile = []string{} 275 } 276 labelsFromCurrentReconcile := []string{} 277 for k, v := range newLabels { 278 if cur, ok := newNode.Labels[k]; !ok || cur != v { 279 newNode.Labels[k] = v 280 hasLabelChanges = true 281 } 282 labelsFromCurrentReconcile = append(labelsFromCurrentReconcile, k) 283 } 284 for _, k := range labelsFromPreviousReconcile { 285 if _, ok := newLabels[k]; !ok { 286 delete(newNode.Labels, k) 287 hasLabelChanges = true 288 } 289 } 290 annotations.AddAnnotations(newNode, map[string]string{clusterv1.LabelsFromMachineAnnotation: strings.Join(labelsFromCurrentReconcile, ",")}) 291 292 // Drop the NodeUninitializedTaint taint on the node given that we are reconciling labels. 293 hasTaintChanges := taints.RemoveNodeTaint(newNode, clusterv1.NodeUninitializedTaint) 294 295 if !hasAnnotationChanges && !hasLabelChanges && !hasTaintChanges { 296 return nil 297 } 298 299 return remoteClient.Patch(ctx, newNode, client.StrategicMergeFrom(node)) 300 }