sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinehealthcheck

import (
	"context"
	"fmt"
	"time"

	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/klog/v2"
	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
)

const (
	// Event types.

	// EventMachineMarkedUnhealthy is emitted when a machine was successfully marked as unhealthy.
	EventMachineMarkedUnhealthy string = "MachineMarkedUnhealthy"
	// EventDetectedUnhealthy is emitted in case a node associated with a
	// machine was detected unhealthy.
	EventDetectedUnhealthy string = "DetectedUnhealthy"
)

var (
	// We allow users to disable the nodeStartupTimeout by setting the duration to 0.
	disabledNodeStartupTimeout = clusterv1.ZeroDuration
)

// healthCheckTarget contains the information required to perform a health check
// on the node to determine if any remediation is required.
type healthCheckTarget struct {
	Cluster     *clusterv1.Cluster
	Machine     *clusterv1.Machine
	Node        *corev1.Node
	MHC         *clusterv1.MachineHealthCheck
	patchHelper *patch.Helper
	nodeMissing bool
}

func (t *healthCheckTarget) string() string {
	return fmt.Sprintf("%s/%s/%s/%s",
		t.MHC.GetNamespace(),
		t.MHC.GetName(),
		t.Machine.GetName(),
		t.nodeName(),
	)
}

// Get the node name if the target has a node.
func (t *healthCheckTarget) nodeName() string {
	if t.Node != nil {
		return t.Node.GetName()
	}
	return ""
}

// needsRemediation determines whether a given target needs remediation.
// The target needs remediation if any of the following are true:
//   - The Machine has failed for some reason
//   - The Machine did not get a node before `timeoutForMachineToHaveNode` elapsed
//   - The Node has gone away
//   - Any condition on the node has matched for longer than the given timeout
//
// If the target doesn't currently need remediation, it provides a duration after
// which the target should next be checked.
// The target should be requeued after this duration.
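//
// For example (illustrative numbers, not from the source): with an unhealthy
// condition timeout of 5m and a matching node condition that last transitioned
// 2m ago, needsRemediation returns (false, 3m1s): the remaining timeout plus a
// one-second pad, so the next check lands just after the deadline would expire.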
func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
	var nextCheckTimes []time.Duration
	now := time.Now()

	if t.Machine.Status.FailureReason != nil {
		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
		logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
		return true, time.Duration(0)
	}

	if t.Machine.Status.FailureMessage != nil {
		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", *t.Machine.Status.FailureMessage)
		logger.V(3).Info("Target is unhealthy", "failureMessage", t.Machine.Status.FailureMessage)
		return true, time.Duration(0)
	}

	// the node does not exist
	if t.nodeMissing {
		logger.V(3).Info("Target is unhealthy: node is missing")
		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, "")
		return true, time.Duration(0)
	}

	// Don't penalize any Machine/Node if the control plane has not been initialized.
	// The exception to this rule is the control plane machine itself, so that the
	// first control plane machine can be remediated.
	if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) {
		logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized")
		// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
		return false, 0
	}

	// Don't penalize any Machine/Node if the cluster infrastructure is not ready.
	if !conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) {
		logger.V(3).Info("Not evaluating target health because the cluster infrastructure is not ready")
		// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
		return false, 0
	}

	// the node has not been set yet
	if t.Node == nil {
		if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
			// Startup timeout is disabled so no need to go any further.
			// No node yet to check conditions, can return early here.
			return false, 0
		}
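
		// The startup timeout below is measured from whichever of the three
		// timestamps is latest (machine creation, infrastructure readiness,
		// control plane initialization), so a machine created before the
		// cluster could produce nodes is not penalized for that waiting time.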
		controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition)
		clusterInfraReady := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition)
		machineCreationTime := t.Machine.CreationTimestamp.Time

		// Use the latest of the 3 times
		comparisonTime := machineCreationTime
		logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReady, "controlPlaneInitializedTime", controlPlaneInitialized)
		if conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && controlPlaneInitialized != nil && controlPlaneInitialized.Time.After(comparisonTime) {
			comparisonTime = controlPlaneInitialized.Time
		}
		if conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) && clusterInfraReady != nil && clusterInfraReady.Time.After(comparisonTime) {
			comparisonTime = clusterInfraReady.Time
		}
		logger.V(3).Info("Using comparison time", "time", comparisonTime)

		timeoutDuration := timeoutForMachineToHaveNode.Duration
		if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
			conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
			logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
			return true, time.Duration(0)
		}

		durationUnhealthy := now.Sub(comparisonTime)
		nextCheck := timeoutDuration - durationUnhealthy + time.Second

		return false, nextCheck
	}

	// check conditions
	for _, c := range t.MHC.Spec.UnhealthyConditions {
		nodeCondition := getNodeCondition(t.Node, c.Type)

		// Skip when current node condition is different from the one reported
		// in the MachineHealthCheck.
		if nodeCondition == nil || nodeCondition.Status != c.Status {
			continue
		}

		// If the condition has been in the unhealthy state for longer than the
		// timeout, return true with no requeue time.
		if nodeCondition.LastTransitionTime.Add(c.Timeout.Duration).Before(now) {
			conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.UnhealthyNodeConditionReason, clusterv1.ConditionSeverityWarning, "Condition %s on node is reporting status %s for more than %s", c.Type, c.Status, c.Timeout.Duration.String())
			logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", c.Timeout.Duration.String())
			return true, time.Duration(0)
		}

		durationUnhealthy := now.Sub(nodeCondition.LastTransitionTime.Time)
		nextCheck := c.Timeout.Duration - durationUnhealthy + time.Second
		if nextCheck > 0 {
			nextCheckTimes = append(nextCheckTimes, nextCheck)
		}
	}
	return false, minDuration(nextCheckTimes)
}

// getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines
// and their nodes targeted by the health check, ready for health checking.
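// When clusterClient is nil, the Node lookup is skipped entirely: targets are
// returned with Node == nil and nodeMissing left false.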
func (r *Reconciler) getTargetsFromMHC(ctx context.Context, logger logr.Logger, clusterClient client.Reader, cluster *clusterv1.Cluster, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) {
	machines, err := r.getMachinesFromMHC(ctx, mhc)
	if err != nil {
		return nil, errors.Wrap(err, "error getting machines from MachineHealthCheck")
	}
	if len(machines) == 0 {
		return nil, nil
	}

	targets := []healthCheckTarget{}
	for k := range machines {
		logger := logger.WithValues("Machine", klog.KObj(&machines[k]))
		skip, reason := shouldSkipRemediation(&machines[k])
		if skip {
			logger.Info("skipping remediation", "reason", reason)
			continue
		}

		patchHelper, err := patch.NewHelper(&machines[k], r.Client)
		if err != nil {
			return nil, errors.Wrap(err, "unable to initialize patch helper")
		}
		target := healthCheckTarget{
			Cluster:     cluster,
			MHC:         mhc,
			Machine:     &machines[k],
			patchHelper: patchHelper,
		}
		if clusterClient != nil {
			node, err := r.getNodeFromMachine(ctx, clusterClient, target.Machine)
			if err != nil {
				if !apierrors.IsNotFound(err) {
					return nil, errors.Wrap(err, "error getting node")
				}

				// A node has been seen for this machine, but it no longer exists
				target.nodeMissing = true
			}
			target.Node = node
		}
		targets = append(targets, target)
	}
	return targets, nil
}

// getMachinesFromMHC fetches Machines matched by the MachineHealthCheck's
// label selector.
func (r *Reconciler) getMachinesFromMHC(ctx context.Context, mhc *clusterv1.MachineHealthCheck) ([]clusterv1.Machine, error) {
	selector, err := metav1.LabelSelectorAsSelector(metav1.CloneSelectorAndAddLabel(
		&mhc.Spec.Selector, clusterv1.ClusterNameLabel, mhc.Spec.ClusterName,
	))
	if err != nil {
		return nil, errors.Wrap(err, "failed to build selector")
	}

	var machineList clusterv1.MachineList
	if err := r.Client.List(
		ctx,
		&machineList,
		client.MatchingLabelsSelector{Selector: selector},
		client.InNamespace(mhc.GetNamespace()),
	); err != nil {
		return nil, errors.Wrap(err, "failed to list machines")
	}
	return machineList.Items, nil
}

// getNodeFromMachine fetches the node from a local or remote cluster for a
// given machine.
func (r *Reconciler) getNodeFromMachine(ctx context.Context, clusterClient client.Reader, machine *clusterv1.Machine) (*corev1.Node, error) {
	if machine.Status.NodeRef == nil {
		return nil, nil
	}

	node := &corev1.Node{}
	nodeKey := types.NamespacedName{
		Name: machine.Status.NodeRef.Name,
	}

	// if the node cannot be found, return a nil node and the error so the
	// caller can distinguish NotFound from other failures.
	if err := clusterClient.Get(ctx, nodeKey, node); err != nil {
		return nil, err
	}
	return node, nil
}

// healthCheckTargets health checks a slice of targets and partitions it into
// healthy and unhealthy targets, also returning the durations after which the
// remaining targets should be checked again.
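// A target is only marked healthy (and MachineHealthCheckSucceededCondition
// set to true) if its Machine is not being deleted and it has an observed
// Node; anything else that needs no remediation is left out of both groups.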
func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) ([]healthCheckTarget, []healthCheckTarget, []time.Duration) {
	var nextCheckTimes []time.Duration
	var unhealthy []healthCheckTarget
	var healthy []healthCheckTarget

	for _, t := range targets {
		logger := logger.WithValues("Target", t.string())
		logger.V(3).Info("Health checking target")
		needsRemediation, nextCheck := t.needsRemediation(logger, timeoutForMachineToHaveNode)

		if needsRemediation {
			unhealthy = append(unhealthy, t)
			continue
		}

		if nextCheck > 0 {
			logger.V(3).Info("Target is likely to go unhealthy", "timeUntilUnhealthy", nextCheck.Truncate(time.Second).String())
			r.recorder.Eventf(
				t.Machine,
				corev1.EventTypeNormal,
				EventDetectedUnhealthy,
				"Machine %v has unhealthy node %v",
				t.string(),
				t.nodeName(),
			)
			nextCheckTimes = append(nextCheckTimes, nextCheck)
			continue
		}

		if t.Machine.DeletionTimestamp.IsZero() && t.Node != nil {
			conditions.MarkTrue(t.Machine, clusterv1.MachineHealthCheckSucceededCondition)
			healthy = append(healthy, t)
		}
	}
	return healthy, unhealthy, nextCheckTimes
}

// getNodeCondition returns node condition by type.
func getNodeCondition(node *corev1.Node, conditionType corev1.NodeConditionType) *corev1.NodeCondition {
	for _, cond := range node.Status.Conditions {
		if cond.Type == conditionType {
			return &cond
		}
	}
	return nil
}

func minDuration(durations []time.Duration) time.Duration {
	if len(durations) == 0 {
		return time.Duration(0)
	}

	minDuration := durations[0]
	// Ignore first element as that is already minDuration
	for _, nc := range durations[1:] {
		if nc < minDuration {
			minDuration = nc
		}
	}
	return minDuration
}

// shouldSkipRemediation checks if the machine should be skipped for remediation.
// Returns true if it should be skipped along with the reason for skipping.
func shouldSkipRemediation(m *clusterv1.Machine) (bool, string) {
	if annotations.HasPaused(m) {
		return true, fmt.Sprintf("machine has %q annotation", clusterv1.PausedAnnotation)
	}

	if annotations.HasSkipRemediation(m) {
		return true, fmt.Sprintf("machine has %q annotation", clusterv1.MachineSkipRemediationAnnotation)
	}

	return false, ""
}
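
// For orientation, a minimal MachineHealthCheck that exercises the logic in
// this file might look like the sketch below. This is an assumption-laden
// example, not taken from this repository: the names, label, and timeout
// values are invented. spec.selector (plus the cluster-name label added in
// getMachinesFromMHC) chooses the Machines, spec.nodeStartupTimeout feeds
// timeoutForMachineToHaveNode, and each spec.unhealthyConditions entry is
// evaluated by needsRemediation:
//
//	apiVersion: cluster.x-k8s.io/v1beta1
//	kind: MachineHealthCheck
//	metadata:
//	  name: example-mhc          # invented name
//	spec:
//	  clusterName: example-cluster
//	  selector:
//	    matchLabels:
//	      nodepool: workers      # invented label
//	  nodeStartupTimeout: 10m
//	  unhealthyConditions:
//	    - type: Ready
//	      status: Unknown
//	      timeout: 5m
//	    - type: Ready
//	      status: "False"
//	      timeout: 5m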