sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go (about) 1 /* 2 Copyright 2020 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package machinehealthcheck 18 19 import ( 20 "testing" 21 "time" 22 23 . "github.com/onsi/gomega" 24 corev1 "k8s.io/api/core/v1" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/client-go/tools/record" 27 ctrl "sigs.k8s.io/controller-runtime" 28 "sigs.k8s.io/controller-runtime/pkg/client" 29 "sigs.k8s.io/controller-runtime/pkg/client/fake" 30 31 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 32 "sigs.k8s.io/cluster-api/errors" 33 "sigs.k8s.io/cluster-api/util/conditions" 34 "sigs.k8s.io/cluster-api/util/patch" 35 ) 36 37 func TestGetTargetsFromMHC(t *testing.T) { 38 namespace := "test-mhc" 39 clusterName := "test-cluster" 40 41 cluster := &clusterv1.Cluster{ 42 ObjectMeta: metav1.ObjectMeta{ 43 Namespace: namespace, 44 Name: clusterName, 45 }, 46 } 47 48 mhcSelector := map[string]string{"cluster": clusterName, "machine-group": "foo"} 49 50 // Create a namespace for the tests 51 testNS := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "mhc-test"}} 52 53 // Create a test MHC 54 testMHC := &clusterv1.MachineHealthCheck{ 55 ObjectMeta: metav1.ObjectMeta{ 56 Name: "test-mhc", 57 Namespace: namespace, 58 }, 59 Spec: clusterv1.MachineHealthCheckSpec{ 60 ClusterName: clusterName, 61 Selector: metav1.LabelSelector{ 62 MatchLabels: mhcSelector, 63 }, 64 UnhealthyConditions: []clusterv1.UnhealthyCondition{ 65 { 66 Type: corev1.NodeReady, 67 Status: corev1.ConditionUnknown, 68 Timeout: metav1.Duration{Duration: 5 * time.Minute}, 69 }, 70 }, 71 }, 72 } 73 74 baseObjects := []client.Object{testNS, cluster, testMHC} 75 76 // Initialise some test machines and nodes for use in the test cases 77 78 testNode1 := newTestNode("node1") 79 testMachine1 := newTestMachine("machine1", namespace, clusterName, testNode1.Name, mhcSelector) 80 testNode2 := newTestNode("node2") 81 testMachine2 := newTestMachine("machine2", namespace, clusterName, testNode2.Name, map[string]string{"cluster": clusterName}) 82 testNode3 := newTestNode("node3") 83 testMachine3 := newTestMachine("machine3", namespace, clusterName, testNode3.Name, mhcSelector) 84 testNode4 := newTestNode("node4") 85 testMachine4 := newTestMachine("machine4", namespace, "other-cluster", testNode4.Name, mhcSelector) 86 87 // machines for skip remediation 88 testNode5 := newTestNode("node5") 89 testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector) 90 testMachine5.Annotations = map[string]string{"cluster.x-k8s.io/skip-remediation": ""} 91 testNode6 := newTestNode("node6") 92 testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector) 93 testMachine6.Annotations = map[string]string{"cluster.x-k8s.io/paused": ""} 94 95 testCases := []struct { 96 desc string 97 toCreate []client.Object 98 expectedTargets []healthCheckTarget 99 }{ 100 { 101 desc: "with no matching machines", 102 toCreate: baseObjects, 103 expectedTargets: nil, 104 }, 105 { 106 desc: "when a machine's node is missing", 107 toCreate: append(baseObjects, testMachine1), 108 expectedTargets: []healthCheckTarget{ 109 { 110 Machine: testMachine1, 111 MHC: testMHC, 112 Node: nil, 113 nodeMissing: true, 114 }, 115 }, 116 }, 117 { 118 desc: "when a machine's labels do not match the selector", 119 toCreate: append(baseObjects, testMachine1, testMachine2, testNode1), 120 expectedTargets: []healthCheckTarget{ 121 { 122 Machine: testMachine1, 123 MHC: testMHC, 124 Node: testNode1, 125 }, 126 }, 127 }, 128 { 129 desc: "with multiple machines, should match correct nodes", 130 toCreate: append(baseObjects, testNode1, testMachine1, testNode3, testMachine3, testNode4, testMachine4), 131 expectedTargets: []healthCheckTarget{ 132 { 133 Machine: testMachine1, 134 MHC: testMHC, 135 Node: testNode1, 136 }, 137 { 138 Machine: testMachine3, 139 MHC: testMHC, 140 Node: testNode3, 141 }, 142 }, 143 }, 144 { 145 desc: "with machines having skip-remediation or paused annotation", 146 toCreate: append(baseObjects, testNode1, testMachine1, testMachine5, testMachine6), 147 expectedTargets: []healthCheckTarget{ 148 { 149 Machine: testMachine1, 150 MHC: testMHC, 151 Node: testNode1, 152 }, 153 }, 154 }, 155 } 156 157 for _, tc := range testCases { 158 t.Run(tc.desc, func(t *testing.T) { 159 gs := NewGomegaWithT(t) 160 161 k8sClient := fake.NewClientBuilder().WithObjects(tc.toCreate...).Build() 162 163 // Create a test reconciler 164 reconciler := &Reconciler{ 165 Client: k8sClient, 166 } 167 for _, t := range tc.expectedTargets { 168 patchHelper, err := patch.NewHelper(t.Machine, k8sClient) 169 gs.Expect(err).ToNot(HaveOccurred()) 170 t.patchHelper = patchHelper 171 } 172 173 targets, err := reconciler.getTargetsFromMHC(ctx, ctrl.LoggerFrom(ctx), k8sClient, cluster, testMHC) 174 gs.Expect(err).ToNot(HaveOccurred()) 175 176 gs.Expect(targets).To(HaveLen(len(tc.expectedTargets))) 177 for i, target := range targets { 178 expectedTarget := tc.expectedTargets[i] 179 gs.Expect(target.Machine).To(BeComparableTo(expectedTarget.Machine)) 180 gs.Expect(target.MHC).To(BeComparableTo(expectedTarget.MHC)) 181 gs.Expect(target.Node).To(BeComparableTo(expectedTarget.Node)) 182 } 183 }) 184 } 185 } 186 187 func TestHealthCheckTargets(t *testing.T) { 188 namespace := "test-mhc" 189 clusterName := "test-cluster" 190 191 cluster := &clusterv1.Cluster{ 192 ObjectMeta: metav1.ObjectMeta{ 193 Namespace: namespace, 194 Name: clusterName, 195 }, 196 } 197 conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition) 198 conditions.MarkTrue(cluster, clusterv1.ControlPlaneInitializedCondition) 199 200 // Ensure the control plane was initialized earlier to prevent it interfering with 201 // NodeStartupTimeout testing. 202 conds := clusterv1.Conditions{} 203 for _, condition := range cluster.GetConditions() { 204 condition.LastTransitionTime = metav1.NewTime(condition.LastTransitionTime.Add(-1 * time.Hour)) 205 conds = append(conds, condition) 206 } 207 cluster.SetConditions(conds) 208 209 mhcSelector := map[string]string{"cluster": clusterName, "machine-group": "foo"} 210 211 timeoutForMachineToHaveNode := 10 * time.Minute 212 disabledTimeoutForMachineToHaveNode := time.Duration(0) 213 timeoutForUnhealthyConditions := 5 * time.Minute 214 215 // Create a test MHC 216 testMHC := &clusterv1.MachineHealthCheck{ 217 ObjectMeta: metav1.ObjectMeta{ 218 Name: "test-mhc", 219 Namespace: namespace, 220 }, 221 Spec: clusterv1.MachineHealthCheckSpec{ 222 Selector: metav1.LabelSelector{ 223 MatchLabels: mhcSelector, 224 }, 225 ClusterName: clusterName, 226 UnhealthyConditions: []clusterv1.UnhealthyCondition{ 227 { 228 Type: corev1.NodeReady, 229 Status: corev1.ConditionUnknown, 230 Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions}, 231 }, 232 { 233 Type: corev1.NodeReady, 234 Status: corev1.ConditionFalse, 235 Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions}, 236 }, 237 }, 238 }, 239 } 240 241 testMachine := newTestMachine("machine1", namespace, clusterName, "node1", mhcSelector) 242 243 // Targets for when the node has not yet been seen by the Machine controller 244 testMachineCreated1200s := testMachine.DeepCopy() 245 nowMinus1200s := metav1.NewTime(time.Now().Add(-1200 * time.Second)) 246 testMachineCreated1200s.ObjectMeta.CreationTimestamp = nowMinus1200s 247 248 nodeNotYetStartedTarget1200s := healthCheckTarget{ 249 Cluster: cluster, 250 MHC: testMHC, 251 Machine: testMachineCreated1200s, 252 Node: nil, 253 } 254 nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition(clusterv1.NodeStartupTimeoutReason, "Node failed to report startup in %s", timeoutForMachineToHaveNode) 255 256 testMachineCreated400s := testMachine.DeepCopy() 257 nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second)) 258 testMachineCreated400s.ObjectMeta.CreationTimestamp = nowMinus400s 259 260 nodeNotYetStartedTarget400s := healthCheckTarget{ 261 Cluster: cluster, 262 MHC: testMHC, 263 Machine: testMachineCreated400s, 264 Node: nil, 265 } 266 267 // Target for when the Node has been seen, but has now gone 268 nodeGoneAway := healthCheckTarget{ 269 Cluster: cluster, 270 MHC: testMHC, 271 Machine: testMachine.DeepCopy(), 272 Node: &corev1.Node{}, 273 nodeMissing: true, 274 } 275 nodeGoneAwayCondition := newFailedHealthCheckCondition(clusterv1.NodeNotFoundReason, "") 276 277 // Target for when the node has been in an unknown state for shorter than the timeout 278 testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second) 279 nodeUnknown200 := healthCheckTarget{ 280 Cluster: cluster, 281 MHC: testMHC, 282 Machine: testMachine.DeepCopy(), 283 Node: testNodeUnknown200, 284 nodeMissing: false, 285 } 286 287 // Second Target for when the node has been in an unknown state for shorter than the timeout 288 testNodeUnknown100 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 100*time.Second) 289 nodeUnknown100 := healthCheckTarget{ 290 Cluster: cluster, 291 MHC: testMHC, 292 Machine: testMachine.DeepCopy(), 293 Node: testNodeUnknown100, 294 nodeMissing: false, 295 } 296 297 // Target for when the node has been in an unknown state for longer than the timeout 298 testNodeUnknown400 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 400*time.Second) 299 nodeUnknown400 := healthCheckTarget{ 300 Cluster: cluster, 301 MHC: testMHC, 302 Machine: testMachine.DeepCopy(), 303 Node: testNodeUnknown400, 304 nodeMissing: false, 305 } 306 nodeUnknown400Condition := newFailedHealthCheckCondition(clusterv1.UnhealthyNodeConditionReason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions) 307 308 // Target for when a node is healthy 309 testNodeHealthy := newTestNode("node1") 310 testNodeHealthy.UID = "12345" 311 nodeHealthy := healthCheckTarget{ 312 Cluster: cluster, 313 MHC: testMHC, 314 Machine: testMachine.DeepCopy(), 315 Node: testNodeHealthy, 316 nodeMissing: false, 317 } 318 319 // Target for when the machine has a failure reason 320 failureReason := errors.UpdateMachineError 321 testMachineFailureReason := testMachine.DeepCopy() 322 testMachineFailureReason.Status.FailureReason = &failureReason 323 machineFailureReason := healthCheckTarget{ 324 Cluster: cluster, 325 MHC: testMHC, 326 Machine: testMachineFailureReason, 327 Node: nil, 328 } 329 machineFailureReasonCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureReason: %s", failureReason) 330 331 // Target for when the machine has a failure message 332 failureMsg := "some failure message" 333 testMachineFailureMsg := testMachine.DeepCopy() 334 testMachineFailureMsg.Status.FailureMessage = &failureMsg 335 machineFailureMsg := healthCheckTarget{ 336 Cluster: cluster, 337 MHC: testMHC, 338 Machine: testMachineFailureMsg, 339 Node: nil, 340 } 341 machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg) 342 343 testCases := []struct { 344 desc string 345 targets []healthCheckTarget 346 timeoutForMachineToHaveNode *time.Duration 347 expectedHealthy []healthCheckTarget 348 expectedNeedsRemediation []healthCheckTarget 349 expectedNeedsRemediationCondition []clusterv1.Condition 350 expectedNextCheckTimes []time.Duration 351 }{ 352 { 353 desc: "when the node has not yet started for shorter than the timeout", 354 targets: []healthCheckTarget{nodeNotYetStartedTarget400s}, 355 expectedHealthy: []healthCheckTarget{}, 356 expectedNeedsRemediation: []healthCheckTarget{}, 357 expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second}, 358 }, 359 { 360 desc: "when the node has not yet started for longer than the timeout", 361 targets: []healthCheckTarget{nodeNotYetStartedTarget1200s}, 362 expectedHealthy: []healthCheckTarget{}, 363 expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s}, 364 expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition}, 365 expectedNextCheckTimes: []time.Duration{}, 366 }, 367 { 368 desc: "when the node has gone away", 369 targets: []healthCheckTarget{nodeGoneAway}, 370 expectedHealthy: []healthCheckTarget{}, 371 expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway}, 372 expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition}, 373 expectedNextCheckTimes: []time.Duration{}, 374 }, 375 { 376 desc: "when the node has been in an unknown state for shorter than the timeout", 377 targets: []healthCheckTarget{nodeUnknown200}, 378 expectedHealthy: []healthCheckTarget{}, 379 expectedNeedsRemediation: []healthCheckTarget{}, 380 expectedNextCheckTimes: []time.Duration{100 * time.Second}, 381 }, 382 { 383 desc: "when the node has been in an unknown state for longer than the timeout", 384 targets: []healthCheckTarget{nodeUnknown400}, 385 expectedHealthy: []healthCheckTarget{}, 386 expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400}, 387 expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition}, 388 expectedNextCheckTimes: []time.Duration{}, 389 }, 390 { 391 desc: "when the node is healthy", 392 targets: []healthCheckTarget{nodeHealthy}, 393 expectedHealthy: []healthCheckTarget{nodeHealthy}, 394 expectedNeedsRemediation: []healthCheckTarget{}, 395 expectedNextCheckTimes: []time.Duration{}, 396 }, 397 { 398 desc: "with a mix of healthy and unhealthy nodes", 399 targets: []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy}, 400 expectedHealthy: []healthCheckTarget{nodeHealthy}, 401 expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400}, 402 expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition}, 403 expectedNextCheckTimes: []time.Duration{200 * time.Second, 100 * time.Second}, 404 }, 405 { 406 desc: "when the node has not started for a long time but the startup timeout is disabled", 407 targets: []healthCheckTarget{nodeNotYetStartedTarget400s}, 408 timeoutForMachineToHaveNode: &disabledTimeoutForMachineToHaveNode, 409 expectedHealthy: []healthCheckTarget{}, // The node is not healthy as it does not have a machine 410 expectedNeedsRemediation: []healthCheckTarget{}, 411 expectedNextCheckTimes: []time.Duration{}, // We don't have a timeout so no way to know when to re-check 412 }, 413 { 414 desc: "when the machine has a failure reason", 415 targets: []healthCheckTarget{machineFailureReason}, 416 expectedHealthy: []healthCheckTarget{}, 417 expectedNeedsRemediation: []healthCheckTarget{machineFailureReason}, 418 expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureReasonCondition}, 419 expectedNextCheckTimes: []time.Duration{}, 420 }, 421 { 422 desc: "when the machine has a failure message", 423 targets: []healthCheckTarget{machineFailureMsg}, 424 expectedHealthy: []healthCheckTarget{}, 425 expectedNeedsRemediation: []healthCheckTarget{machineFailureMsg}, 426 expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition}, 427 expectedNextCheckTimes: []time.Duration{}, 428 }, 429 } 430 431 for _, tc := range testCases { 432 t.Run(tc.desc, func(t *testing.T) { 433 gs := NewWithT(t) 434 435 // Create a test reconciler. 436 reconciler := &Reconciler{ 437 recorder: record.NewFakeRecorder(5), 438 } 439 440 // Allow individual test cases to override the timeoutForMachineToHaveNode. 441 timeout := metav1.Duration{Duration: timeoutForMachineToHaveNode} 442 if tc.timeoutForMachineToHaveNode != nil { 443 timeout.Duration = *tc.timeoutForMachineToHaveNode 444 } 445 446 healthy, unhealthy, nextCheckTimes := reconciler.healthCheckTargets(tc.targets, ctrl.LoggerFrom(ctx), timeout) 447 448 // Round durations down to nearest second account for minute differences 449 // in timing when running tests 450 roundDurations := func(in []time.Duration) []time.Duration { 451 out := []time.Duration{} 452 for _, d := range in { 453 out = append(out, d.Truncate(time.Second)) 454 } 455 return out 456 } 457 458 // Remove the last transition time of the given conditions. Used for comparison with expected conditions. 459 removeLastTransitionTimes := func(in clusterv1.Conditions) clusterv1.Conditions { 460 out := clusterv1.Conditions{} 461 for _, c := range in { 462 withoutTime := c.DeepCopy() 463 withoutTime.LastTransitionTime = metav1.Time{} 464 out = append(out, *withoutTime) 465 } 466 return out 467 } 468 469 gs.Expect(healthy).To(ConsistOf(tc.expectedHealthy)) 470 gs.Expect(unhealthy).To(ConsistOf(tc.expectedNeedsRemediation)) 471 gs.Expect(nextCheckTimes).To(WithTransform(roundDurations, ConsistOf(tc.expectedNextCheckTimes))) 472 for i, expectedMachineCondition := range tc.expectedNeedsRemediationCondition { 473 actualConditions := unhealthy[i].Machine.GetConditions() 474 conditionsMatcher := WithTransform(removeLastTransitionTimes, ContainElements(expectedMachineCondition)) 475 gs.Expect(actualConditions).To(conditionsMatcher) 476 } 477 }) 478 } 479 } 480 481 func newTestMachine(name, namespace, clusterName, nodeName string, labels map[string]string) *clusterv1.Machine { 482 // Copy the labels so that the map is unique to each test Machine 483 l := make(map[string]string) 484 for k, v := range labels { 485 l[k] = v 486 } 487 l[clusterv1.ClusterNameLabel] = clusterName 488 489 bootstrap := "bootstrap" 490 return &clusterv1.Machine{ 491 TypeMeta: metav1.TypeMeta{ 492 APIVersion: clusterv1.GroupVersion.String(), 493 Kind: "Machine", 494 }, 495 ObjectMeta: metav1.ObjectMeta{ 496 Name: name, 497 Namespace: namespace, 498 Labels: l, 499 }, 500 Spec: clusterv1.MachineSpec{ 501 ClusterName: clusterName, 502 Bootstrap: clusterv1.Bootstrap{ 503 DataSecretName: &bootstrap, 504 }, 505 }, 506 Status: clusterv1.MachineStatus{ 507 InfrastructureReady: true, 508 BootstrapReady: true, 509 Phase: string(clusterv1.MachinePhaseRunning), 510 NodeRef: &corev1.ObjectReference{ 511 Name: nodeName, 512 }, 513 }, 514 } 515 } 516 517 func newTestNode(name string) *corev1.Node { 518 return &corev1.Node{ 519 TypeMeta: metav1.TypeMeta{ 520 APIVersion: "v1", 521 Kind: "Node", 522 }, 523 ObjectMeta: metav1.ObjectMeta{ 524 Name: name, 525 }, 526 } 527 } 528 529 func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, status corev1.ConditionStatus, unhealthyDuration time.Duration) *corev1.Node { 530 return &corev1.Node{ 531 ObjectMeta: metav1.ObjectMeta{ 532 Name: name, 533 UID: "12345", 534 }, 535 Status: corev1.NodeStatus{ 536 Conditions: []corev1.NodeCondition{ 537 { 538 Type: condition, 539 Status: status, 540 LastTransitionTime: metav1.NewTime(time.Now().Add(-unhealthyDuration)), 541 }, 542 }, 543 }, 544 } 545 } 546 547 func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition { 548 return *conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededCondition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...) 549 }