sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinehealthcheck

import (
	"testing"
	"time"

	. "github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/errors"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
)

func TestGetTargetsFromMHC(t *testing.T) {
	namespace := "test-mhc"
	clusterName := "test-cluster"

	cluster := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: namespace,
			Name: clusterName,
		},
	}

	mhcSelector := map[string]string{"cluster": clusterName, "machine-group": "foo"}

	// Create a namespace for the tests
	testNS := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "mhc-test"}}

	// Create a test MHC
	testMHC := &clusterv1.MachineHealthCheck{
		ObjectMeta: metav1.ObjectMeta{
			Name: "test-mhc",
			Namespace: namespace,
		},
		Spec: clusterv1.MachineHealthCheckSpec{
			ClusterName: clusterName,
			Selector: metav1.LabelSelector{
				MatchLabels: mhcSelector,
			},
			UnhealthyConditions: []clusterv1.UnhealthyCondition{
				{
					Type: corev1.NodeReady,
					Status: corev1.ConditionUnknown,
					Timeout: metav1.Duration{Duration: 5 * time.Minute},
				},
			},
		},
	}

	baseObjects := []client.Object{testNS, cluster, testMHC}

	// Initialise some test machines and nodes for use in the test cases

	testNode1 := newTestNode("node1")
	testMachine1 := newTestMachine("machine1", namespace, clusterName, testNode1.Name, mhcSelector)
	testNode2 := newTestNode("node2")
	testMachine2 := newTestMachine("machine2", namespace, clusterName, testNode2.Name, map[string]string{"cluster": clusterName})
	testNode3 := newTestNode("node3")
	testMachine3 := newTestMachine("machine3", namespace, clusterName, testNode3.Name, mhcSelector)
	testNode4 := newTestNode("node4")
	testMachine4 := newTestMachine("machine4", namespace, "other-cluster", testNode4.Name, mhcSelector)

	// machines for skip remediation
	testNode5 := newTestNode("node5")
	testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector)
	testMachine5.Annotations = map[string]string{clusterv1.MachineSkipRemediationAnnotation: ""}
	testNode6 := newTestNode("node6")
	testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector)
	testMachine6.Annotations = map[string]string{clusterv1.PausedAnnotation: ""}
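
	// Of the machines above, only machine1 and machine3 carry the full MHC selector labels and
	// belong to test-cluster; machine2 is missing the machine-group label, machine4 belongs to a
	// different cluster, and machine5/machine6 are annotated to be skipped (skip-remediation and
	// paused respectively), so the cases below only ever expect machine1 and machine3 as targets.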
matching machines", 102 toCreate: baseObjects, 103 expectedTargets: nil, 104 }, 105 { 106 desc: "when a machine's node is missing", 107 toCreate: append(baseObjects, testMachine1), 108 expectedTargets: []healthCheckTarget{ 109 { 110 Machine: testMachine1, 111 MHC: testMHC, 112 Node: nil, 113 nodeMissing: true, 114 }, 115 }, 116 }, 117 { 118 desc: "when a machine's labels do not match the selector", 119 toCreate: append(baseObjects, testMachine1, testMachine2, testNode1), 120 expectedTargets: []healthCheckTarget{ 121 { 122 Machine: testMachine1, 123 MHC: testMHC, 124 Node: testNode1, 125 }, 126 }, 127 }, 128 { 129 desc: "with multiple machines, should match correct nodes", 130 toCreate: append(baseObjects, testNode1, testMachine1, testNode3, testMachine3, testNode4, testMachine4), 131 expectedTargets: []healthCheckTarget{ 132 { 133 Machine: testMachine1, 134 MHC: testMHC, 135 Node: testNode1, 136 }, 137 { 138 Machine: testMachine3, 139 MHC: testMHC, 140 Node: testNode3, 141 }, 142 }, 143 }, 144 { 145 desc: "with machines having skip-remediation or paused annotation", 146 toCreate: append(baseObjects, testNode1, testMachine1, testMachine5, testMachine6), 147 expectedTargets: []healthCheckTarget{ 148 { 149 Machine: testMachine1, 150 MHC: testMHC, 151 Node: testNode1, 152 }, 153 }, 154 }, 155 } 156 157 for _, tc := range testCases { 158 t.Run(tc.desc, func(t *testing.T) { 159 gs := NewGomegaWithT(t) 160 161 k8sClient := fake.NewClientBuilder().WithObjects(tc.toCreate...).Build() 162 163 // Create a test reconciler 164 reconciler := &Reconciler{ 165 Client: k8sClient, 166 } 167 for _, t := range tc.expectedTargets { 168 patchHelper, err := patch.NewHelper(t.Machine, k8sClient) 169 gs.Expect(err).ToNot(HaveOccurred()) 170 t.patchHelper = patchHelper 171 } 172 173 targets, err := reconciler.getTargetsFromMHC(ctx, ctrl.LoggerFrom(ctx), k8sClient, cluster, testMHC) 174 gs.Expect(err).ToNot(HaveOccurred()) 175 176 gs.Expect(targets).To(HaveLen(len(tc.expectedTargets))) 177 for i, target := range targets { 178 expectedTarget := tc.expectedTargets[i] 179 gs.Expect(target.Machine).To(BeComparableTo(expectedTarget.Machine)) 180 gs.Expect(target.MHC).To(BeComparableTo(expectedTarget.MHC)) 181 gs.Expect(target.Node).To(BeComparableTo(expectedTarget.Node)) 182 } 183 }) 184 } 185 } 186 187 func TestHealthCheckTargets(t *testing.T) { 188 namespace := "test-mhc" 189 clusterName := "test-cluster" 190 191 cluster := &clusterv1.Cluster{ 192 ObjectMeta: metav1.ObjectMeta{ 193 Namespace: namespace, 194 Name: clusterName, 195 }, 196 } 197 conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition) 198 conditions.MarkTrue(cluster, clusterv1.ControlPlaneInitializedCondition) 199 200 // Ensure the control plane was initialized earlier to prevent it interfering with 201 // NodeStartupTimeout testing. 
	conds := clusterv1.Conditions{}
	for _, condition := range cluster.GetConditions() {
		condition.LastTransitionTime = metav1.NewTime(condition.LastTransitionTime.Add(-1 * time.Hour))
		conds = append(conds, condition)
	}
	cluster.SetConditions(conds)

	mhcSelector := map[string]string{"cluster": clusterName, "machine-group": "foo"}

	timeoutForMachineToHaveNode := 10 * time.Minute
	disabledTimeoutForMachineToHaveNode := time.Duration(0)
	timeoutForUnhealthyConditions := 5 * time.Minute

	// Create a test MHC
	testMHC := &clusterv1.MachineHealthCheck{
		ObjectMeta: metav1.ObjectMeta{
			Name: "test-mhc",
			Namespace: namespace,
		},
		Spec: clusterv1.MachineHealthCheckSpec{
			Selector: metav1.LabelSelector{
				MatchLabels: mhcSelector,
			},
			ClusterName: clusterName,
			UnhealthyConditions: []clusterv1.UnhealthyCondition{
				{
					Type: corev1.NodeReady,
					Status: corev1.ConditionUnknown,
					Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
				},
				{
					Type: corev1.NodeReady,
					Status: corev1.ConditionFalse,
					Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
				},
			},
		},
	}

	testMachine := newTestMachine("machine1", namespace, clusterName, "node1", mhcSelector)

	// Targets for when the node has not yet been seen by the Machine controller
	testMachineCreated1200s := testMachine.DeepCopy()
	nowMinus1200s := metav1.NewTime(time.Now().Add(-1200 * time.Second))
	testMachineCreated1200s.ObjectMeta.CreationTimestamp = nowMinus1200s

	nodeNotYetStartedTarget1200s := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachineCreated1200s,
		Node: nil,
	}
	nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition(clusterv1.NodeStartupTimeoutReason, "Node failed to report startup in %s", timeoutForMachineToHaveNode)

	testMachineCreated400s := testMachine.DeepCopy()
	nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second))
	testMachineCreated400s.ObjectMeta.CreationTimestamp = nowMinus400s

	nodeNotYetStartedTarget400s := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachineCreated400s,
		Node: nil,
	}

	// Target for when the Node has been seen, but has now gone
	nodeGoneAway := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachine.DeepCopy(),
		Node: &corev1.Node{},
		nodeMissing: true,
	}
	nodeGoneAwayCondition := newFailedHealthCheckCondition(clusterv1.NodeNotFoundReason, "")

	// Target for when the node has been in an unknown state for shorter than the timeout
	testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second)
	nodeUnknown200 := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachine.DeepCopy(),
		Node: testNodeUnknown200,
		nodeMissing: false,
	}

	// Second Target for when the node has been in an unknown state for shorter than the timeout
	testNodeUnknown100 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 100*time.Second)
	nodeUnknown100 := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachine.DeepCopy(),
		Node: testNodeUnknown100,
		nodeMissing: false,
	}

	// Target for when the node has been in an unknown state for longer than the timeout
	testNodeUnknown400 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 400*time.Second)
	nodeUnknown400 := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachine.DeepCopy(),
		Node: testNodeUnknown400,
		nodeMissing: false,
	}
	nodeUnknown400Condition := newFailedHealthCheckCondition(clusterv1.UnhealthyNodeConditionReason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions)
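
	// With a 5 minute (300s) unhealthy-condition timeout, the 100s/200s nodes above are still
	// within the grace period and should be re-checked in roughly 200s/100s, while the 400s node
	// has already exceeded it and should be flagged for remediation.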

	// Target for when a node is healthy
	testNodeHealthy := newTestNode("node1")
	testNodeHealthy.UID = "12345"
	nodeHealthy := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachine.DeepCopy(),
		Node: testNodeHealthy,
		nodeMissing: false,
	}

	// Target for when the machine has a failure reason
	failureReason := errors.UpdateMachineError
	testMachineFailureReason := testMachine.DeepCopy()
	testMachineFailureReason.Status.FailureReason = &failureReason
	machineFailureReason := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachineFailureReason,
		Node: nil,
	}
	machineFailureReasonCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureReason: %s", failureReason)

	// Target for when the machine has a failure message
	failureMsg := "some failure message"
	testMachineFailureMsg := testMachine.DeepCopy()
	testMachineFailureMsg.Status.FailureMessage = &failureMsg
	machineFailureMsg := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachineFailureMsg,
		Node: nil,
	}
	machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)

	// Target for when the machine has the remediate machine annotation
	annotationRemediationMsg := "Marked for remediation via remediate-machine annotation"
	testMachineAnnotationRemediation := testMachine.DeepCopy()
	testMachineAnnotationRemediation.Annotations = map[string]string{clusterv1.RemediateMachineAnnotation: ""}
	machineAnnotationRemediation := healthCheckTarget{
		Cluster: cluster,
		MHC: testMHC,
		Machine: testMachineAnnotationRemediation,
		Node: nil,
	}
	machineAnnotationRemediationCondition := newFailedHealthCheckCondition(clusterv1.HasRemediateMachineAnnotationReason, annotationRemediationMsg)

	testCases := []struct {
		desc string
		targets []healthCheckTarget
		timeoutForMachineToHaveNode *time.Duration
		expectedHealthy []healthCheckTarget
		expectedNeedsRemediation []healthCheckTarget
		expectedNeedsRemediationCondition []clusterv1.Condition
		expectedNextCheckTimes []time.Duration
	}{
		{
			desc: "when the node has not yet started for shorter than the timeout",
			targets: []healthCheckTarget{nodeNotYetStartedTarget400s},
			expectedHealthy: []healthCheckTarget{},
			expectedNeedsRemediation: []healthCheckTarget{},
			expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
		},
		{
			desc: "when the node has not yet started for longer than the timeout",
			targets: []healthCheckTarget{nodeNotYetStartedTarget1200s},
			expectedHealthy: []healthCheckTarget{},
			expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition},
			expectedNextCheckTimes: []time.Duration{},
		},
		{
			desc: "when the node has gone away",
			targets: []healthCheckTarget{nodeGoneAway},
			expectedHealthy: []healthCheckTarget{},
			expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway},
			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition},
			expectedNextCheckTimes: []time.Duration{},
		},
		{
			desc: "when the node has been in an unknown state for shorter than the timeout",
			targets: []healthCheckTarget{nodeUnknown200},
			expectedHealthy: []healthCheckTarget{},
			expectedNeedsRemediation: []healthCheckTarget{},
			expectedNextCheckTimes: []time.Duration{100 * time.Second},
		},
		{
			desc: "when the node has been in an unknown state for longer than the timeout",
			targets: []healthCheckTarget{nodeUnknown400},
			expectedHealthy: []healthCheckTarget{},
			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
			expectedNextCheckTimes: []time.Duration{},
		},
		{
			desc: "when the node is healthy",
			targets: []healthCheckTarget{nodeHealthy},
			expectedHealthy: []healthCheckTarget{nodeHealthy},
			expectedNeedsRemediation: []healthCheckTarget{},
			expectedNextCheckTimes: []time.Duration{},
		},
		{
			desc: "with a mix of healthy and unhealthy nodes",
			targets: []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
			expectedHealthy: []healthCheckTarget{nodeHealthy},
			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
			expectedNextCheckTimes: []time.Duration{200 * time.Second, 100 * time.Second},
		},
		{
			desc: "when the node has not started for a long time but the startup timeout is disabled",
			targets: []healthCheckTarget{nodeNotYetStartedTarget400s},
			timeoutForMachineToHaveNode: &disabledTimeoutForMachineToHaveNode,
			expectedHealthy: []healthCheckTarget{}, // The node is not healthy as it does not have a machine
			expectedNeedsRemediation: []healthCheckTarget{},
			expectedNextCheckTimes: []time.Duration{}, // We don't have a timeout so no way to know when to re-check
		},
		{
			desc: "when the machine has a failure reason",
			targets: []healthCheckTarget{machineFailureReason},
			expectedHealthy: []healthCheckTarget{},
			expectedNeedsRemediation: []healthCheckTarget{machineFailureReason},
			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureReasonCondition},
			expectedNextCheckTimes: []time.Duration{},
		},
		{
			desc: "when the machine has a failure message",
			targets: []healthCheckTarget{machineFailureMsg},
			expectedHealthy: []healthCheckTarget{},
			expectedNeedsRemediation: []healthCheckTarget{machineFailureMsg},
			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
			expectedNextCheckTimes: []time.Duration{},
		},
		{
			desc: "when the machine is manually marked for remediation",
			targets: []healthCheckTarget{machineAnnotationRemediation},
			expectedHealthy: []healthCheckTarget{},
			expectedNeedsRemediation: []healthCheckTarget{machineAnnotationRemediation},
			expectedNeedsRemediationCondition: []clusterv1.Condition{machineAnnotationRemediationCondition},
			expectedNextCheckTimes: []time.Duration{},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.desc, func(t *testing.T) {
			gs := NewWithT(t)

			// Create a test reconciler.
			reconciler := &Reconciler{
				recorder: record.NewFakeRecorder(5),
			}

			// Allow individual test cases to override the timeoutForMachineToHaveNode.
			timeout := metav1.Duration{Duration: timeoutForMachineToHaveNode}
			if tc.timeoutForMachineToHaveNode != nil {
				timeout.Duration = *tc.timeoutForMachineToHaveNode
			}
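
			// healthCheckTargets is expected to partition the targets into healthy ones, ones that
			// need remediation, and a list of durations after which targets that are neither healthy
			// nor yet due for remediation should be re-checked.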
			healthy, unhealthy, nextCheckTimes := reconciler.healthCheckTargets(tc.targets, ctrl.LoggerFrom(ctx), timeout)

			// Round durations down to the nearest second to account for minute differences
			// in timing when running tests
			roundDurations := func(in []time.Duration) []time.Duration {
				out := []time.Duration{}
				for _, d := range in {
					out = append(out, d.Truncate(time.Second))
				}
				return out
			}

			// Remove the last transition time of the given conditions. Used for comparison with expected conditions.
			removeLastTransitionTimes := func(in clusterv1.Conditions) clusterv1.Conditions {
				out := clusterv1.Conditions{}
				for _, c := range in {
					withoutTime := c.DeepCopy()
					withoutTime.LastTransitionTime = metav1.Time{}
					out = append(out, *withoutTime)
				}
				return out
			}

			gs.Expect(healthy).To(ConsistOf(tc.expectedHealthy))
			gs.Expect(unhealthy).To(ConsistOf(tc.expectedNeedsRemediation))
			gs.Expect(nextCheckTimes).To(WithTransform(roundDurations, ConsistOf(tc.expectedNextCheckTimes)))
			for i, expectedMachineCondition := range tc.expectedNeedsRemediationCondition {
				actualConditions := unhealthy[i].Machine.GetConditions()
				conditionsMatcher := WithTransform(removeLastTransitionTimes, ContainElements(expectedMachineCondition))
				gs.Expect(actualConditions).To(conditionsMatcher)
			}
		})
	}
}

func newTestMachine(name, namespace, clusterName, nodeName string, labels map[string]string) *clusterv1.Machine {
	// Copy the labels so that the map is unique to each test Machine
	l := make(map[string]string)
	for k, v := range labels {
		l[k] = v
	}
	l[clusterv1.ClusterNameLabel] = clusterName

	bootstrap := "bootstrap"
	return &clusterv1.Machine{
		TypeMeta: metav1.TypeMeta{
			APIVersion: clusterv1.GroupVersion.String(),
			Kind: "Machine",
		},
		ObjectMeta: metav1.ObjectMeta{
			Name: name,
			Namespace: namespace,
			Labels: l,
		},
		Spec: clusterv1.MachineSpec{
			ClusterName: clusterName,
			Bootstrap: clusterv1.Bootstrap{
				DataSecretName: &bootstrap,
			},
		},
		Status: clusterv1.MachineStatus{
			InfrastructureReady: true,
			BootstrapReady: true,
			Phase: string(clusterv1.MachinePhaseRunning),
			NodeRef: &corev1.ObjectReference{
				Name: nodeName,
			},
		},
	}
}

func newTestNode(name string) *corev1.Node {
	return &corev1.Node{
		TypeMeta: metav1.TypeMeta{
			APIVersion: "v1",
			Kind: "Node",
		},
		ObjectMeta: metav1.ObjectMeta{
			Name: name,
		},
	}
}

func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, status corev1.ConditionStatus, unhealthyDuration time.Duration) *corev1.Node {
	return &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: name,
			UID: "12345",
		},
		Status: corev1.NodeStatus{
			Conditions: []corev1.NodeCondition{
				{
					Type: condition,
					Status: status,
					LastTransitionTime: metav1.NewTime(time.Now().Add(-unhealthyDuration)),
				},
			},
		},
	}
}
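
// newFailedHealthCheckCondition builds the condition the tests expect on an unhealthy Machine:
// MachineHealthCheckSucceeded set to False with Warning severity and the given reason and message.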
func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition {
	return *conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededCondition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...)
}