sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinehealthcheck

import (
	"context"
	"fmt"
	"time"

	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/klog/v2"
	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
)

const (
	// Event types.

	// EventMachineMarkedUnhealthy is emitted when a machine is successfully marked as unhealthy.
	EventMachineMarkedUnhealthy string = "MachineMarkedUnhealthy"
	// EventDetectedUnhealthy is emitted in case a node associated with a
	// machine was detected unhealthy.
	EventDetectedUnhealthy string = "DetectedUnhealthy"
)

var (
	// We allow users to disable the nodeStartupTimeout by setting the duration to 0.
	disabledNodeStartupTimeout = clusterv1.ZeroDuration
)

// healthCheckTarget contains the information required to perform a health check
// on the node to determine if any remediation is required.
type healthCheckTarget struct {
	Cluster     *clusterv1.Cluster
	Machine     *clusterv1.Machine
	Node        *corev1.Node
	MHC         *clusterv1.MachineHealthCheck
	patchHelper *patch.Helper
	nodeMissing bool
}

func (t *healthCheckTarget) string() string {
	return fmt.Sprintf("%s/%s/%s/%s",
		t.MHC.GetNamespace(),
		t.MHC.GetName(),
		t.Machine.GetName(),
		t.nodeName(),
	)
}

// Get the node name if the target has a node.
func (t *healthCheckTarget) nodeName() string {
	if t.Node != nil {
		return t.Node.GetName()
	}
	return ""
}
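
// Illustrative note (not in the upstream file): with hypothetical values,
// namespace "default", MHC "mhc-0", Machine "machine-0" and Node "node-0",
// t.string() renders as "default/mhc-0/machine-0/node-0"; the final segment
// is empty while the Machine has no Node yet.
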
// Determine whether or not a given target needs remediation.
// The node will need remediation if any of the following are true:
// - The Machine has the remediate machine annotation
// - The Machine has failed for some reason
// - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
// - The Node has gone away
// - Any condition on the node is matched for the given timeout
// If the target doesn't currently need remediation, provide a duration after
// which the target should next be checked.
// The target should be requeued after this duration.
func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
	var nextCheckTimes []time.Duration
	now := time.Now()

	if annotations.HasRemediateMachine(t.Machine) {
		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.HasRemediateMachineAnnotationReason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation")
		logger.V(3).Info("Target is marked for remediation via remediate-machine annotation")
		return true, time.Duration(0)
	}

	if t.Machine.Status.FailureReason != nil {
		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
		logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
		return true, time.Duration(0)
	}

	if t.Machine.Status.FailureMessage != nil {
		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", *t.Machine.Status.FailureMessage)
		logger.V(3).Info("Target is unhealthy", "failureMessage", t.Machine.Status.FailureMessage)
		return true, time.Duration(0)
	}

	// The node does not exist.
	if t.nodeMissing {
		logger.V(3).Info("Target is unhealthy: node is missing")
		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, "")
		return true, time.Duration(0)
	}

	// Don't penalize any Machine/Node if the control plane has not been initialized.
	// An exception to this rule is the control plane machine itself, so that the first control plane machine can be remediated.
	if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) {
		logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized")
		// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
		return false, 0
	}

	// Don't penalize any Machine/Node if the cluster infrastructure is not ready.
	if !conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) {
		logger.V(3).Info("Not evaluating target health because the cluster infrastructure is not ready")
		// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
		return false, 0
	}

	// The node has not been set yet.
	if t.Node == nil {
		if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
			// Startup timeout is disabled so no need to go any further.
			// No node yet to check conditions, can return early here.
			return false, 0
		}

		controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition)
		clusterInfraReady := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition)
		machineCreationTime := t.Machine.CreationTimestamp.Time

		// Use the latest of the three times.
		comparisonTime := machineCreationTime
		logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReady, "controlPlaneInitializedTime", controlPlaneInitialized)
		if conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && controlPlaneInitialized != nil && controlPlaneInitialized.Time.After(comparisonTime) {
			comparisonTime = controlPlaneInitialized.Time
		}
		if conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) && clusterInfraReady != nil && clusterInfraReady.Time.After(comparisonTime) {
			comparisonTime = clusterInfraReady.Time
		}
		logger.V(3).Info("Using comparison time", "time", comparisonTime)

		timeoutDuration := timeoutForMachineToHaveNode.Duration
		if comparisonTime.Add(timeoutDuration).Before(now) {
			conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
			logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
			return true, time.Duration(0)
		}

		durationUnhealthy := now.Sub(comparisonTime)
		nextCheck := timeoutDuration - durationUnhealthy + time.Second

		return false, nextCheck
	}

	// Check node conditions.
	for _, c := range t.MHC.Spec.UnhealthyConditions {
		nodeCondition := getNodeCondition(t.Node, c.Type)

		// Skip when the current node condition is different from the one reported
		// in the MachineHealthCheck.
		if nodeCondition == nil || nodeCondition.Status != c.Status {
			continue
		}

		// If the condition has been in the unhealthy state for longer than the
		// timeout, return true with no requeue time.
		if nodeCondition.LastTransitionTime.Add(c.Timeout.Duration).Before(now) {
			conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.UnhealthyNodeConditionReason, clusterv1.ConditionSeverityWarning, "Condition %s on node is reporting status %s for more than %s", c.Type, c.Status, c.Timeout.Duration.String())
			logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", c.Timeout.Duration.String())
			return true, time.Duration(0)
		}

		durationUnhealthy := now.Sub(nodeCondition.LastTransitionTime.Time)
		nextCheck := c.Timeout.Duration - durationUnhealthy + time.Second
		if nextCheck > 0 {
			nextCheckTimes = append(nextCheckTimes, nextCheck)
		}
	}
	return false, minDuration(nextCheckTimes)
}
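
// Worked example for the requeue arithmetic above (hypothetical numbers, not
// from the upstream file): with a nodeStartupTimeout of 10m and a
// comparisonTime 3m in the past, durationUnhealthy is 3m and nextCheck is
// 10m - 3m + 1s = 7m1s. The extra second ensures the timeout has strictly
// elapsed when the target is re-evaluated, so the follow-up check reports
// unhealthy instead of requeueing again.
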
// getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines
// and their nodes targeted by the health check, ready for health checking.
func (r *Reconciler) getTargetsFromMHC(ctx context.Context, logger logr.Logger, clusterClient client.Reader, cluster *clusterv1.Cluster, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) {
	machines, err := r.getMachinesFromMHC(ctx, mhc)
	if err != nil {
		return nil, errors.Wrap(err, "error getting machines from MachineHealthCheck")
	}
	if len(machines) == 0 {
		return nil, nil
	}

	targets := []healthCheckTarget{}
	for k := range machines {
		logger := logger.WithValues("Machine", klog.KObj(&machines[k]))
		skip, reason := shouldSkipRemediation(&machines[k])
		if skip {
			logger.Info("skipping remediation", "reason", reason)
			continue
		}

		patchHelper, err := patch.NewHelper(&machines[k], r.Client)
		if err != nil {
			return nil, err
		}
		target := healthCheckTarget{
			Cluster:     cluster,
			MHC:         mhc,
			Machine:     &machines[k],
			patchHelper: patchHelper,
		}
		if clusterClient != nil {
			node, err := r.getNodeFromMachine(ctx, clusterClient, target.Machine)
			if err != nil {
				if !apierrors.IsNotFound(err) {
					return nil, errors.Wrap(err, "error getting node")
				}

				// A node has been seen for this machine, but it no longer exists.
				target.nodeMissing = true
			}
			target.Node = node
		}
		targets = append(targets, target)
	}
	return targets, nil
}
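
// Clarifying note (added, not upstream): when clusterClient is nil, presumably
// because no client for the workload cluster is available, targets are still
// built, but with Node left nil and nodeMissing false, so needsRemediation
// only evaluates the startup-timeout branch for them.
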
// getMachinesFromMHC fetches Machines matched by the MachineHealthCheck's
// label selector.
func (r *Reconciler) getMachinesFromMHC(ctx context.Context, mhc *clusterv1.MachineHealthCheck) ([]clusterv1.Machine, error) {
	selector, err := metav1.LabelSelectorAsSelector(metav1.CloneSelectorAndAddLabel(
		&mhc.Spec.Selector, clusterv1.ClusterNameLabel, mhc.Spec.ClusterName,
	))
	if err != nil {
		return nil, errors.Wrap(err, "failed to build selector")
	}

	var machineList clusterv1.MachineList
	if err := r.Client.List(
		ctx,
		&machineList,
		client.MatchingLabelsSelector{Selector: selector},
		client.InNamespace(mhc.GetNamespace()),
	); err != nil {
		return nil, errors.Wrap(err, "failed to list machines")
	}
	return machineList.Items, nil
}

// getNodeFromMachine fetches the node from a local or remote cluster for a
// given machine.
func (r *Reconciler) getNodeFromMachine(ctx context.Context, clusterClient client.Reader, machine *clusterv1.Machine) (*corev1.Node, error) {
	if machine.Status.NodeRef == nil {
		return nil, nil
	}

	node := &corev1.Node{}
	nodeKey := types.NamespacedName{
		Name: machine.Status.NodeRef.Name,
	}

	// If the node cannot be found, return a nil node along with the error;
	// callers treat a NotFound error as a missing node.
	if err := clusterClient.Get(ctx, nodeKey, node); err != nil {
		return nil, err
	}
	return node, nil
}

// healthCheckTargets health checks a slice of targets
// and returns data to measure the average health.
func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) ([]healthCheckTarget, []healthCheckTarget, []time.Duration) {
	var nextCheckTimes []time.Duration
	var unhealthy []healthCheckTarget
	var healthy []healthCheckTarget

	for _, t := range targets {
		logger := logger.WithValues("Target", t.string())
		logger.V(3).Info("Health checking target")
		needsRemediation, nextCheck := t.needsRemediation(logger, timeoutForMachineToHaveNode)

		if needsRemediation {
			unhealthy = append(unhealthy, t)
			continue
		}

		if nextCheck > 0 {
			logger.V(3).Info("Target is likely to go unhealthy", "timeUntilUnhealthy", nextCheck.Truncate(time.Second).String())
			r.recorder.Eventf(
				t.Machine,
				corev1.EventTypeNormal,
				EventDetectedUnhealthy,
				"Machine %v has unhealthy node %v",
				t.string(),
				t.nodeName(),
			)
			nextCheckTimes = append(nextCheckTimes, nextCheck)
			continue
		}

		if t.Machine.DeletionTimestamp.IsZero() && t.Node != nil {
			conditions.MarkTrue(t.Machine, clusterv1.MachineHealthCheckSucceededCondition)
			healthy = append(healthy, t)
		}
	}
	return healthy, unhealthy, nextCheckTimes
}
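
// Sketch of how the three return values are typically consumed by the caller
// (illustrative only; names and flow are assumptions, not the exact
// reconciler code):
//
//	healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, timeout)
//	// ... patch conditions on healthy targets, remediate unhealthy ones ...
//	if requeueAfter := minDuration(nextCheckTimes); requeueAfter > 0 {
//		return ctrl.Result{RequeueAfter: requeueAfter}, nil
//	}
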
// getNodeCondition returns the node condition by type.
func getNodeCondition(node *corev1.Node, conditionType corev1.NodeConditionType) *corev1.NodeCondition {
	for _, cond := range node.Status.Conditions {
		if cond.Type == conditionType {
			return &cond
		}
	}
	return nil
}

func minDuration(durations []time.Duration) time.Duration {
	if len(durations) == 0 {
		return time.Duration(0)
	}

	minDuration := durations[0]
	// Ignore the first element as that is already minDuration.
	for _, nc := range durations[1:] {
		if nc < minDuration {
			minDuration = nc
		}
	}
	return minDuration
}

// shouldSkipRemediation checks if the machine should be skipped for remediation.
// Returns true if it should be skipped, along with the reason for skipping.
func shouldSkipRemediation(m *clusterv1.Machine) (bool, string) {
	if annotations.HasPaused(m) {
		return true, fmt.Sprintf("machine has %q annotation", clusterv1.PausedAnnotation)
	}

	if annotations.HasSkipRemediation(m) {
		return true, fmt.Sprintf("machine has %q annotation", clusterv1.MachineSkipRemediationAnnotation)
	}

	return false, ""
}
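
// Worked examples for minDuration (illustrative, not upstream code):
//
//	minDuration([]time.Duration{5 * time.Minute, 30 * time.Second}) // 30s
//	minDuration(nil)                                                // 0; callers treat this as "no requeue needed"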