sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinehealthcheck

import (
	"context"
	"errors"
	"fmt"
	"sort"
	"testing"
	"time"

	"github.com/go-logr/logr"
	. "github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/tools/record"
	"k8s.io/utils/pointer"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/api/v1beta1/index"
	"sigs.k8s.io/cluster-api/controllers/remote"
	capierrors "sigs.k8s.io/cluster-api/errors"
	"sigs.k8s.io/cluster-api/internal/test/builder"
	"sigs.k8s.io/cluster-api/internal/webhooks"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
)

func TestMachineHealthCheck_Reconcile(t *testing.T) {
	ns, err := env.CreateNamespace(ctx, "test-mhc")
	if err != nil {
		t.Fatal(err)
	}
	defer func() {
		if err := env.Delete(ctx, ns); err != nil {
			t.Fatal(err)
		}
	}()

	t.Run("it should ensure the correct cluster-name label when no existing labels exist", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Labels = map[string]string{}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() map[string]string {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return mhc.GetLabels()
		}).Should(HaveKeyWithValue(clusterv1.ClusterNameLabel, cluster.Name))
	})

	t.Run("it should ensure the correct cluster-name label when the label has the wrong value", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Labels = map[string]string{
			clusterv1.ClusterNameLabel: "wrong-cluster",
		}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() map[string]string {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return mhc.GetLabels()
		}).Should(HaveKeyWithValue(clusterv1.ClusterNameLabel, cluster.Name))
	})

	t.Run("it should ensure the correct cluster-name label when other labels are present", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Labels = map[string]string{
			"extra-label": "1",
		}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() map[string]string {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return mhc.GetLabels()
		}).Should(And(
			HaveKeyWithValue(clusterv1.ClusterNameLabel, cluster.Name),
			HaveKeyWithValue("extra-label", "1"),
			HaveLen(2),
		))
	})

	t.Run("it should ensure an owner reference is present when no existing ones exist", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.OwnerReferences = []metav1.OwnerReference{}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() []metav1.OwnerReference {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				fmt.Printf("error retrieving mhc: %v\n", err)
				return nil
			}
			return mhc.GetOwnerReferences()
		}, timeout, 100*time.Millisecond).Should(And(
			HaveLen(1),
			ContainElement(ownerReferenceForCluster(ctx, g, cluster)),
		))
	})

	t.Run("it should ensure an owner reference is present when modifying existing ones", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.OwnerReferences = []metav1.OwnerReference{
			{Kind: "Foo", APIVersion: "foo.bar.baz/v1", Name: "Bar", UID: "12345"},
		}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() []metav1.OwnerReference {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return mhc.GetOwnerReferences()
		}, timeout, 100*time.Millisecond).Should(And(
			ContainElements(
				metav1.OwnerReference{Kind: "Foo", APIVersion: "foo.bar.baz/v1", Name: "Bar", UID: "12345"},
				ownerReferenceForCluster(ctx, g, cluster)),
			HaveLen(2),
		))
	})

	t.Run("it ignores Machines not matching the label selector", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines matching the MHC's label selector.
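		// createMachinesWithNodes is a test helper defined elsewhere in this
		// package: it creates the requested number of Machines with backing
		// Nodes, applies the given options, and returns a cleanup func.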
		_, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Healthy nodes and machines NOT matching the MHC's label selector.
		_, _, cleanup2 := createMachinesWithNodes(g, cluster,
			count(2),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
		)
		defer cleanup2()

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}, 5*time.Second, 100*time.Millisecond).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    2,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it doesn't mark anything unhealthy when cluster infrastructure is not ready", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		patchHelper, err := patch.NewHelper(cluster, env.Client)
		g.Expect(err).ToNot(HaveOccurred())

		conditions.MarkFalse(cluster, clusterv1.InfrastructureReadyCondition, "SomeReason", clusterv1.ConditionSeverityError, "")
		g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed())

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    2,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it doesn't mark anything unhealthy when all Machines are healthy", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
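		// firstMachineAsControlPlane marks the first machine as a control
		// plane machine; the MHC needs a control plane machine to exist
		// before it can proceed (see the MachineSet test further below).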
		_, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    2,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks unhealthy machines for remediation when there is one unhealthy Machine", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks unhealthy machines for remediation when a Machine has a failure reason", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Machine with failure reason.
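		// machineFailureReason sets status.failureReason on the Machine, so
		// the health check should flag it even though its Node is Ready.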
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
			machineFailureReason("some failure"),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks unhealthy machines for remediation when a Machine has a failure message", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Machine with failure message.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
			machineFailureMessage("some failure"),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks unhealthy machines for remediation when the unhealthy Machines exceed MaxUnhealthy", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		maxUnhealthy := intstr.Parse("40%")
		mhc.Spec.MaxUnhealthy = &maxUnhealthy

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
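		// With maxUnhealthy=40% and 3 targets, at most 1 machine may be
		// unhealthy; the 2 unhealthy machines below push the MHC over the
		// threshold, so remediation is blocked (RemediationsAllowed: 0).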
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(2),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      1,
			RemediationsAllowed: 0,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:     clusterv1.RemediationAllowedCondition,
					Status:   corev1.ConditionFalse,
					Severity: clusterv1.ConditionSeverityWarning,
					Reason:   clusterv1.TooManyUnhealthyReason,
					Message:  "Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: 3, unhealthy: 2, maxUnhealthy: 40%)",
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(2))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) {
					remediated++
				}
			}
			return
		}).Should(Equal(0))
	})

	t.Run("it marks unhealthy machines for remediation when the number of unhealthy machines is within the unhealthyRange", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		unhealthyRange := "[1-3]"
		mhc.Spec.UnhealthyRange = &unhealthyRange

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
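		// nodeStatus(corev1.ConditionUnknown) leaves the Node's Ready
		// condition Unknown, which the unhealthy conditions configured by
		// the newMachineHealthCheck helper are expected to match.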
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks unhealthy machines for remediation when the number of unhealthy Machines is not within the unhealthyRange", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		unhealthyRange := "[3-5]"
		mhc.Spec.UnhealthyRange = &unhealthyRange

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(2),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      1,
			RemediationsAllowed: 0,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:     clusterv1.RemediationAllowedCondition,
					Status:   corev1.ConditionFalse,
					Severity: clusterv1.ConditionSeverityWarning,
					Reason:   clusterv1.TooManyUnhealthyReason,
					Message:  "Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: 3, unhealthy: 2, unhealthyRange: [3-5])",
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
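		// Even when remediation is blocked, the failed health check is still
		// recorded on the Machines; only the owner-remediated condition is
		// withheld, as the two assertions below verify.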
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(2))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					remediated++
				}
			}
			return
		}).Should(Equal(0))
	})

	t.Run("when a Machine has no Node ref for less than the NodeStartupTimeout", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		// After the cluster exists, we have to set the infrastructure ready condition; otherwise, MachineHealthChecks
		// will never fail when nodeStartupTimeout is exceeded.
		patchHelper, err := patch.NewHelper(cluster, env.GetClient())
		g.Expect(err).ToNot(HaveOccurred())

		conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition)
		g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed())

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Spec.NodeStartupTimeout = &metav1.Duration{Duration: 5 * time.Hour}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(false),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
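		// With a 5h NodeStartupTimeout, the machine without a Node ref is
		// still within its startup window, so nothing should be flagged.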
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(0))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) {
					remediated++
				}
			}
			return
		}).Should(Equal(0))
	})

	t.Run("when a Machine has no Node ref for longer than the NodeStartupTimeout", func(t *testing.T) {
		// FIXME: Resolve flaky/failing test
		t.Skip("skipping until made stable")
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Spec.NodeStartupTimeout = &metav1.Duration{Duration: time.Second}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(false),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)

		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the MHC status matches. We have two healthy machines and
		// one unhealthy.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				fmt.Printf("error retrieving mhc: %v\n", err)
				return nil
			}
			return &mhc.Status
		}, timeout, 100*time.Millisecond).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
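		// Here NodeStartupTimeout is only 1s, so the machine that never got
		// a Node ref should exceed the startup window and be marked both
		// unhealthy and for remediation (one machine each, below).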
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				fmt.Printf("error retrieving list: %v\n", err)
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					remediated++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))
	})

	t.Run("when a Machine's Node has gone away", func(t *testing.T) {
		// FIXME: Resolve flaky/failing test
		t.Skip("skipping until made stable")
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(3),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Forcibly remove the last machine's node.
		g.Eventually(func() bool {
			nodeToBeRemoved := nodes[2]
			if err := env.Delete(ctx, nodeToBeRemoved); err != nil {
				return apierrors.IsNotFound(err)
			}
			return apierrors.IsNotFound(env.Get(ctx, util.ObjectKey(nodeToBeRemoved), nodeToBeRemoved))
		}).Should(BeTrue())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(1))

		// Calculate how many Machines have been remediated.
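		// A Machine whose Node has been deleted counts as unhealthy, so
		// exactly one Machine should carry the owner-remediated condition.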
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					remediated++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))
	})

	t.Run("should react when a Node transitions to unhealthy", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      1,
			RemediationsAllowed: 1,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Transition the node to unhealthy.
		node := nodes[0]
		nodePatch := client.MergeFrom(node.DeepCopy())
		node.Status.Conditions = []corev1.NodeCondition{
			{
				Type:               corev1.NodeReady,
				Status:             corev1.ConditionUnknown,
				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
			},
		}
		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:   1,
			CurrentHealthy:     0,
			ObservedGeneration: 1,
			Targets:            targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
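		// The Ready condition was backdated by 10 minutes above so that the
		// MHC's unhealthy-condition timeout has already elapsed by the time
		// the controller evaluates the Node.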
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(1))

		// Calculate how many Machines have been marked for remediation.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) {
					remediated++
				}
			}
			return
		}).Should(Equal(1))
	})

	t.Run("when in a MachineSet, unhealthy machines should be deleted", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		// Create 1 control plane machine so MHC can proceed.
		_, _, cleanup := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
		)
		defer cleanup()

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		// Create infrastructure template resource.
		infraResource := map[string]interface{}{
			"kind":       "GenericInfrastructureMachine",
			"apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1",
			"metadata":   map[string]interface{}{},
			"spec": map[string]interface{}{
				"size": "3xlarge",
			},
		}
		infraTmpl := &unstructured.Unstructured{
			Object: map[string]interface{}{
				"spec": map[string]interface{}{
					"template": infraResource,
				},
			},
		}
		infraTmpl.SetKind("GenericInfrastructureMachineTemplate")
		infraTmpl.SetAPIVersion("infrastructure.cluster.x-k8s.io/v1beta1")
		infraTmpl.SetGenerateName("mhc-ms-template-")
		infraTmpl.SetNamespace(mhc.Namespace)

		g.Expect(env.Create(ctx, infraTmpl)).To(Succeed())

		machineSet := &clusterv1.MachineSet{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "mhc-ms-",
				Namespace:    mhc.Namespace,
			},
			Spec: clusterv1.MachineSetSpec{
				ClusterName: cluster.Name,
				Replicas:    pointer.Int32(1),
				Selector:    mhc.Spec.Selector,
				Template: clusterv1.MachineTemplateSpec{
					ObjectMeta: clusterv1.ObjectMeta{
						Labels: mhc.Spec.Selector.MatchLabels,
					},
					Spec: clusterv1.MachineSpec{
						ClusterName: cluster.Name,
						Bootstrap: clusterv1.Bootstrap{
							DataSecretName: pointer.String("test-data-secret-name"),
						},
						InfrastructureRef: corev1.ObjectReference{
							APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
							Kind:       "GenericInfrastructureMachineTemplate",
							Name:       infraTmpl.GetName(),
						},
					},
				},
			},
		}
		g.Expect((&webhooks.MachineSet{}).Default(ctx, machineSet)).Should(Succeed())
		g.Expect(env.Create(ctx, machineSet)).To(Succeed())

		// Ensure machines have been created.
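		// The MachineSet controller should bring up exactly one replica
		// matching the MHC's selector before the MHC itself is created.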
		g.Eventually(func() int {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}
			return len(machines.Items)
		}, timeout, 100*time.Millisecond).Should(Equal(1))

		// Create the MachineHealthCheck instance.
		mhc.Spec.NodeStartupTimeout = &metav1.Duration{Duration: time.Second}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		// Defer cleanup for all the objects that have been created.
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc, infraTmpl, machineSet)

		// Pause the MachineSet reconciler to delay the deletion of the
		// Machine, because the MachineSet controller deletes the Machine when
		// it is marked unhealthy by MHC.
		machineSetPatch := client.MergeFrom(machineSet.DeepCopy())
		machineSet.Annotations = map[string]string{
			clusterv1.PausedAnnotation: "",
		}
		g.Expect(env.Patch(ctx, machineSet, machineSetPatch)).To(Succeed())

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))

		// Calculate how many Machines should be remediated.
		var unhealthyMachine *clusterv1.Machine
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					unhealthyMachine = machines.Items[i].DeepCopy()
					remediated++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))

		// Unpause the MachineSet reconciler.
		machineSetPatch = client.MergeFrom(machineSet.DeepCopy())
		delete(machineSet.Annotations, clusterv1.PausedAnnotation)
		g.Expect(env.Patch(ctx, machineSet, machineSetPatch)).To(Succeed())

		// Make sure the Machine gets deleted.
		g.Eventually(func() bool {
			machine := unhealthyMachine.DeepCopy()
			err := env.Get(ctx, util.ObjectKey(unhealthyMachine), machine)
			return apierrors.IsNotFound(err) || !machine.DeletionTimestamp.IsZero()
		}, timeout, 100*time.Millisecond).Should(BeTrue())
	})

	t.Run("when a machine is paused", func(t *testing.T) {
		// FIXME: Resolve flaky/failing test
		t.Skip("skipping until made stable")
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:   1,
			CurrentHealthy:     1,
			ObservedGeneration: 1,
			Targets:            targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Pause the machine.
		machinePatch := client.MergeFrom(machines[0].DeepCopy())
		machines[0].Annotations = map[string]string{
			clusterv1.PausedAnnotation: "",
		}
		g.Expect(env.Patch(ctx, machines[0], machinePatch)).To(Succeed())

		// Transition the node to unhealthy.
		node := nodes[0]
		nodePatch := client.MergeFrom(node.DeepCopy())
		node.Status.Conditions = []corev1.NodeCondition{
			{
				Type:               corev1.NodeReady,
				Status:             corev1.ConditionUnknown,
				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
			},
		}
		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      0,
			RemediationsAllowed: 0,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(1))

		// Calculate how many Machines have been remediated.
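		// The paused annotation blocks remediation: the Machine is reported
		// unhealthy in status, but no owner-remediated condition is set.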
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					remediated++
				}
			}
			return
		}).Should(Equal(0))
	})

	t.Run("When remediationTemplate is set and node transitions to unhealthy, new Remediation Request should be created", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		// Create remediation template resource.
		infraRemediationResource := map[string]interface{}{
			"kind":       "GenericExternalRemediation",
			"apiVersion": builder.RemediationGroupVersion.String(),
			"metadata":   map[string]interface{}{},
			"spec": map[string]interface{}{
				"size": "3xlarge",
			},
		}
		infraRemediationTmpl := &unstructured.Unstructured{
			Object: map[string]interface{}{
				"spec": map[string]interface{}{
					"template": infraRemediationResource,
				},
			},
		}
		infraRemediationTmpl.SetKind("GenericExternalRemediationTemplate")
		infraRemediationTmpl.SetAPIVersion(builder.RemediationGroupVersion.String())
		infraRemediationTmpl.SetGenerateName("remediation-template-name-")
		infraRemediationTmpl.SetNamespace(cluster.Namespace)
		g.Expect(env.Create(ctx, infraRemediationTmpl)).To(Succeed())

		remediationTemplate := &corev1.ObjectReference{
			APIVersion: builder.RemediationGroupVersion.String(),
			Kind:       "GenericExternalRemediationTemplate",
			Name:       infraRemediationTmpl.GetName(),
		}

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Spec.RemediationTemplate = remediationTemplate
		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc, infraRemediationTmpl)

		// Healthy nodes and machines.
		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      1,
			RemediationsAllowed: 1,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Transition the node to unhealthy.
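		// Flipping the Node's Ready condition to Unknown (backdated) should
		// make the MHC create an external remediation request from the
		// template configured above, instead of marking the Machine for
		// owner-based remediation.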
		node := nodes[0]
		nodePatch := client.MergeFrom(node.DeepCopy())
		node.Status.Conditions = []corev1.NodeCondition{
			{
				Type:               corev1.NodeReady,
				Status:             corev1.ConditionUnknown,
				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
			},
		}
		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      0,
			RemediationsAllowed: 0,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(1))

		ref := corev1.ObjectReference{
			APIVersion: builder.RemediationGroupVersion.String(),
			Kind:       "GenericExternalRemediation",
		}

		obj := util.ObjectReferenceToUnstructured(ref)
		// Make sure the Remediation Request is created.
		g.Eventually(func() *unstructured.Unstructured {
			key := client.ObjectKey{
				Namespace: machines[0].Namespace,
				Name:      machines[0].Name,
			}
			err := env.Get(ctx, key, obj)
			if err != nil {
				return nil
			}
			return obj
		}, timeout, 100*time.Millisecond).ShouldNot(BeNil())
		g.Expect(obj.GetOwnerReferences()).To(HaveLen(1))
		g.Expect(obj.GetOwnerReferences()[0].Name).To(Equal(machines[0].Name))
	})

	t.Run("When remediationTemplate is set and node transitions back to healthy, new Remediation Request should be deleted", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		// Create remediation template resource.
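		// Same GenericExternalRemediationTemplate wiring as the previous
		// test; this time the Node is later flipped back to healthy, so the
		// remediation request should be cleaned up again.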
		infraRemediationResource := map[string]interface{}{
			"kind":       "GenericExternalRemediation",
			"apiVersion": builder.RemediationGroupVersion.String(),
			"metadata":   map[string]interface{}{},
			"spec": map[string]interface{}{
				"size": "3xlarge",
			},
		}
		infraRemediationTmpl := &unstructured.Unstructured{
			Object: map[string]interface{}{
				"spec": map[string]interface{}{
					"template": infraRemediationResource,
				},
			},
		}
		infraRemediationTmpl.SetKind("GenericExternalRemediationTemplate")
		infraRemediationTmpl.SetAPIVersion(builder.RemediationGroupVersion.String())
		infraRemediationTmpl.SetGenerateName("remediation-template-name-")
		infraRemediationTmpl.SetNamespace(cluster.Namespace)
		g.Expect(env.Create(ctx, infraRemediationTmpl)).To(Succeed())

		remediationTemplate := &corev1.ObjectReference{
			APIVersion: builder.RemediationGroupVersion.String(),
			Kind:       "GenericExternalRemediationTemplate",
			Name:       infraRemediationTmpl.GetName(),
		}

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Spec.RemediationTemplate = remediationTemplate
		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc, infraRemediationTmpl)

		// Healthy nodes and machines.
		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      1,
			RemediationsAllowed: 1,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Transition the node to unhealthy.
		node := nodes[0]
		nodePatch := client.MergeFrom(node.DeepCopy())
		node.Status.Conditions = []corev1.NodeCondition{
			{
				Type:               corev1.NodeReady,
				Status:             corev1.ConditionUnknown,
				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
			},
		}
		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      0,
			RemediationsAllowed: 0,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
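		// Exactly one Machine should fail its health check while the Node
		// is unhealthy; the count drops back to zero after recovery below.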
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(1))

		// Transition the node back to healthy.
		node = nodes[0]
		nodePatch = client.MergeFrom(node.DeepCopy())
		node.Status.Conditions = []corev1.NodeCondition{
			{
				Type:               corev1.NodeReady,
				Status:             corev1.ConditionTrue,
				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
			},
		}
		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      1,
			RemediationsAllowed: 1,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(0))

		ref := corev1.ObjectReference{
			APIVersion: builder.RemediationGroupVersion.String(),
			Kind:       "GenericExternalRemediation",
		}

		obj := util.ObjectReferenceToUnstructured(ref)
		// Make sure the Remediation Request is deleted.
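		// Once the Machine is healthy again, the external remediation CR
		// created for it should be garbage collected, so the Get below is
		// expected to keep failing.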
		g.Eventually(func() *unstructured.Unstructured {
			key := client.ObjectKey{
				Namespace: machines[0].Namespace,
				Name:      machines[0].Name,
			}
			err := env.Get(ctx, key, obj)
			if err != nil {
				return nil
			}
			return obj
		}, timeout, 100*time.Millisecond).Should(BeNil())
	})
}

func TestClusterToMachineHealthCheck(t *testing.T) {
	fakeClient := fake.NewClientBuilder().Build()

	r := &Reconciler{
		Client: fakeClient,
	}

	namespace := metav1.NamespaceDefault
	clusterName := testClusterName
	labels := make(map[string]string)

	mhc1 := newMachineHealthCheckWithLabels("mhc1", namespace, clusterName, labels)
	mhc1Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc1.Namespace, Name: mhc1.Name}}
	mhc2 := newMachineHealthCheckWithLabels("mhc2", namespace, clusterName, labels)
	mhc2Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc2.Namespace, Name: mhc2.Name}}
	mhc3 := newMachineHealthCheckWithLabels("mhc3", namespace, "othercluster", labels)
	mhc4 := newMachineHealthCheckWithLabels("mhc4", "othernamespace", clusterName, labels)
	cluster1 := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName,
			Namespace: namespace,
		},
	}

	testCases := []struct {
		name     string
		toCreate []clusterv1.MachineHealthCheck
		object   client.Object
		expected []reconcile.Request
	}{
		{
			name:     "when a MachineHealthCheck exists for the Cluster in the same namespace",
			toCreate: []clusterv1.MachineHealthCheck{*mhc1},
			object:   cluster1,
			expected: []reconcile.Request{mhc1Req},
		},
		{
			name:     "when 2 MachineHealthChecks exist for the Cluster in the same namespace",
			toCreate: []clusterv1.MachineHealthCheck{*mhc1, *mhc2},
			object:   cluster1,
			expected: []reconcile.Request{mhc1Req, mhc2Req},
		},
		{
			name:     "when a MachineHealthCheck exists for another Cluster in the same namespace",
			toCreate: []clusterv1.MachineHealthCheck{*mhc3},
			object:   cluster1,
			expected: []reconcile.Request{},
		},
		{
			name:     "when a MachineHealthCheck exists for another Cluster in another namespace",
			toCreate: []clusterv1.MachineHealthCheck{*mhc4},
			object:   cluster1,
			expected: []reconcile.Request{},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			gs := NewWithT(t)

			for _, obj := range tc.toCreate {
				o := obj
				gs.Expect(r.Client.Create(ctx, &o)).To(Succeed())
				defer func() {
					gs.Expect(r.Client.Delete(ctx, &o)).To(Succeed())
				}()
				// Check the cache is populated.
				getObj := func() error {
					return r.Client.Get(ctx, util.ObjectKey(&o), &clusterv1.MachineHealthCheck{})
				}
				gs.Eventually(getObj).Should(Succeed())
			}

			got := r.clusterToMachineHealthCheck(ctx, tc.object)
			gs.Expect(got).To(ConsistOf(tc.expected))
		})
	}
}

func TestMachineToMachineHealthCheck(t *testing.T) {
	fakeClient := fake.NewClientBuilder().Build()

	r := &Reconciler{
		Client: fakeClient,
	}

	namespace := metav1.NamespaceDefault
	clusterName := testClusterName
	nodeName := "node1"
	labels := map[string]string{"cluster": "foo", "nodepool": "bar"}

	mhc1 := newMachineHealthCheckWithLabels("mhc1", namespace, clusterName, labels)
	mhc1Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc1.Namespace, Name: mhc1.Name}}
	mhc2 := newMachineHealthCheckWithLabels("mhc2", namespace, clusterName, labels)
	mhc2Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc2.Namespace, Name: mhc2.Name}}
	mhc3 := newMachineHealthCheckWithLabels("mhc3", namespace, clusterName, map[string]string{"cluster": "foo", "nodepool": "other"})
	mhc4 := newMachineHealthCheckWithLabels("mhc4", "othernamespace", clusterName, labels)
	machine1 := newTestMachine("machine1", namespace, clusterName, nodeName, labels)

	testCases := []struct {
		name     string
		toCreate []clusterv1.MachineHealthCheck
		object   client.Object
		expected []reconcile.Request
	}{
		{
			name:     "when a MachineHealthCheck matches labels for the Machine in the same namespace",
			toCreate: []clusterv1.MachineHealthCheck{*mhc1},
			object:   machine1,
			expected: []reconcile.Request{mhc1Req},
		},
		{
			name:     "when 2 MachineHealthChecks match labels for the Machine in the same namespace",
			toCreate: []clusterv1.MachineHealthCheck{*mhc1, *mhc2},
			object:   machine1,
			expected: []reconcile.Request{mhc1Req, mhc2Req},
		},
		{
			name:     "when a MachineHealthCheck does not match labels for the Machine in the same namespace",
			toCreate: []clusterv1.MachineHealthCheck{*mhc3},
			object:   machine1,
			expected: []reconcile.Request{},
		},
		{
			name:     "when a MachineHealthCheck matches labels for the Machine in another namespace",
			toCreate: []clusterv1.MachineHealthCheck{*mhc4},
			object:   machine1,
			expected: []reconcile.Request{},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			gs := NewWithT(t)

			for _, obj := range tc.toCreate {
				o := obj
				gs.Expect(r.Client.Create(ctx, &o)).To(Succeed())
				defer func() {
					gs.Expect(r.Client.Delete(ctx, &o)).To(Succeed())
				}()
				// Check the cache is populated.
				getObj := func() error {
					return r.Client.Get(ctx, util.ObjectKey(&o), &clusterv1.MachineHealthCheck{})
				}
				gs.Eventually(getObj).Should(Succeed())
			}

			got := r.machineToMachineHealthCheck(ctx, tc.object)
			gs.Expect(got).To(ConsistOf(tc.expected))
		})
	}
}

func TestNodeToMachineHealthCheck(t *testing.T) {
	fakeClient := fake.NewClientBuilder().
		WithIndex(&clusterv1.Machine{}, index.MachineNodeNameField, index.MachineByNodeName).
		WithStatusSubresource(&clusterv1.MachineHealthCheck{}, &clusterv1.Machine{}).
		Build()

	r := &Reconciler{
		Client: fakeClient,
	}

	namespace := metav1.NamespaceDefault
	clusterName := testClusterName
	nodeName := "node1"
	labels := map[string]string{"cluster": "foo", "nodepool": "bar"}

	mhc1 := newMachineHealthCheckWithLabels("mhc1", namespace, clusterName, labels)
	mhc1Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc1.Namespace, Name: mhc1.Name}}
	mhc2 := newMachineHealthCheckWithLabels("mhc2", namespace, clusterName, labels)
	mhc2Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc2.Namespace, Name: mhc2.Name}}
	mhc3 := newMachineHealthCheckWithLabels("mhc3", namespace, "othercluster", labels)
	mhc4 := newMachineHealthCheckWithLabels("mhc4", "othernamespace", clusterName, labels)

	machine1 := newTestMachine("machine1", namespace, clusterName, nodeName, labels)
	machine2 := newTestMachine("machine2", namespace, clusterName, nodeName, labels)

	node1 := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: nodeName,
		},
	}

	testCases := []struct {
		name        string
		mhcToCreate []clusterv1.MachineHealthCheck
		mToCreate   []clusterv1.Machine
		object      client.Object
		expected    []reconcile.Request
	}{
		{
			name:        "when no Machine exists for the Node",
			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc1},
			mToCreate:   []clusterv1.Machine{},
			object:      node1,
			expected:    []reconcile.Request{},
		},
		{
			name:        "when two Machines exist for the Node",
			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc1},
			mToCreate:   []clusterv1.Machine{*machine1, *machine2},
			object:      node1,
			expected:    []reconcile.Request{},
		},
		{
			name:        "when no MachineHealthCheck exists for the Node in the Machine's namespace",
			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc4},
			mToCreate:   []clusterv1.Machine{*machine1},
			object:      node1,
			expected:    []reconcile.Request{},
		},
		{
			name:        "when a MachineHealthCheck exists for the Node in the Machine's namespace",
			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc1},
			mToCreate:   []clusterv1.Machine{*machine1},
			object:      node1,
			expected:    []reconcile.Request{mhc1Req},
		},
		{
			name:        "when two MachineHealthChecks exist for the Node in the Machine's namespace",
			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc1, *mhc2},
			mToCreate:   []clusterv1.Machine{*machine1},
			object:      node1,
			expected:    []reconcile.Request{mhc1Req, mhc2Req},
		},
		{
			name:        "when a MachineHealthCheck exists for the Node, but not in the Machine's cluster",
			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc3},
			mToCreate:   []clusterv1.Machine{*machine1},
			object:      node1,
			expected:    []reconcile.Request{},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			gs := NewWithT(t)

			for _, obj := range tc.mhcToCreate {
				o := obj
				gs.Expect(r.Client.Create(ctx, &o)).To(Succeed())
				defer func() {
					gs.Expect(r.Client.Delete(ctx, &o)).To(Succeed())
				}()
				// Check the cache is populated.
				key := util.ObjectKey(&o)
				getObj := func() error {
					return r.Client.Get(ctx, key, &clusterv1.MachineHealthCheck{})
				}
				gs.Eventually(getObj).Should(Succeed())
			}
			for _, obj := range tc.mToCreate {
				o := obj
				gs.Expect(r.Client.Create(ctx, &o)).To(Succeed())
				defer func() {
					gs.Expect(r.Client.Delete(ctx, &o)).To(Succeed())
				}()
				// Ensure the status is set (required for matching node to machine).
				o.Status = obj.Status
				gs.Expect(r.Client.Status().Update(ctx, &o)).To(Succeed())

				// Check the cache is up to date with the status update.
				key := util.ObjectKey(&o)
				checkStatus := func() clusterv1.MachineStatus {
					m := &clusterv1.Machine{}
					err := r.Client.Get(ctx, key, m)
					if err != nil {
						return clusterv1.MachineStatus{}
					}
					return m.Status
				}
				gs.Eventually(checkStatus).Should(BeComparableTo(o.Status))
			}

			got := r.nodeToMachineHealthCheck(ctx, tc.object)
			gs.Expect(got).To(ConsistOf(tc.expected))
		})
	}
}

func TestIsAllowedRemediation(t *testing.T) {
	testCases := []struct {
		name               string
		maxUnhealthy       *intstr.IntOrString
		expectedMachines   int32
		currentHealthy     int32
		allowed            bool
		observedGeneration int64
	}{
		{
			name:             "when maxUnhealthy is not set",
			maxUnhealthy:     nil,
			expectedMachines: int32(3),
			currentHealthy:   int32(0),
			allowed:          false,
		},
		{
			name:             "when maxUnhealthy is not an int or percentage",
			maxUnhealthy:     &intstr.IntOrString{Type: intstr.String, StrVal: "abcdef"},
			expectedMachines: int32(5),
			currentHealthy:   int32(2),
			allowed:          false,
		},
		{
			name:             "when maxUnhealthy is an int less than current unhealthy",
			maxUnhealthy:     &intstr.IntOrString{Type: intstr.Int, IntVal: int32(1)},
			expectedMachines: int32(3),
			currentHealthy:   int32(1),
			allowed:          false,
		},
		{
			name:             "when maxUnhealthy is an int equal to current unhealthy",
			maxUnhealthy:     &intstr.IntOrString{Type: intstr.Int, IntVal: int32(2)},
			expectedMachines: int32(3),
			currentHealthy:   int32(1),
			allowed:          true,
		},
		{
			name:             "when maxUnhealthy is an int greater than current unhealthy",
			maxUnhealthy:     &intstr.IntOrString{Type: intstr.Int, IntVal: int32(3)},
			expectedMachines: int32(3),
			currentHealthy:   int32(1),
			allowed:          true,
		},
		{
			name:             "when maxUnhealthy is a percentage less than current unhealthy",
			maxUnhealthy:     &intstr.IntOrString{Type: intstr.String, StrVal: "50%"},
			expectedMachines: int32(5),
			currentHealthy:   int32(2),
			allowed:          false,
		},
		{
			name:             "when maxUnhealthy is a percentage equal to current unhealthy",
			maxUnhealthy:     &intstr.IntOrString{Type: intstr.String, StrVal: "60%"},
			expectedMachines: int32(5),
			currentHealthy:   int32(2),
			allowed:          true,
		},
		{
			name:             "when maxUnhealthy is a percentage greater than current unhealthy",
			maxUnhealthy:     &intstr.IntOrString{Type: intstr.String, StrVal: "70%"},
			expectedMachines: int32(5),
			currentHealthy:   int32(2),
			allowed:          true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := NewWithT(t)

			mhc := &clusterv1.MachineHealthCheck{
				Spec: clusterv1.MachineHealthCheckSpec{
					MaxUnhealthy:       tc.maxUnhealthy,
					NodeStartupTimeout: &metav1.Duration{Duration: 1 * time.Millisecond},
				},
				Status: clusterv1.MachineHealthCheckStatus{
					ExpectedMachines:   tc.expectedMachines,
					CurrentHealthy:     tc.currentHealthy,
					ObservedGeneration: tc.observedGeneration,
				},
			}

			remediationAllowed, _, _ := isAllowedRemediation(mhc)
			g.Expect(remediationAllowed).To(Equal(tc.allowed))
		})
	}
}

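// TestGetMaxUnhealthy verifies that getMaxUnhealthy resolves both integer and
// percentage values of spec.maxUnhealthy against the expected Machine count.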
func TestGetMaxUnhealthy(t *testing.T) {
	testCases := []struct {
		name                 string
		maxUnhealthy         *intstr.IntOrString
		expectedMaxUnhealthy int
		actualMachineCount   int32
		expectedErr          error
	}{
		{
			name:                 "when maxUnhealthy is nil",
			maxUnhealthy:         nil,
			expectedMaxUnhealthy: 0,
			actualMachineCount:   7,
			expectedErr:          errors.New("spec.maxUnhealthy must be set"),
		},
		{
			name:                 "when maxUnhealthy is not an int or percentage",
			maxUnhealthy:         &intstr.IntOrString{Type: intstr.String, StrVal: "abcdef"},
			expectedMaxUnhealthy: 0,
			actualMachineCount:   3,
			expectedErr:          errors.New("invalid value for IntOrString: invalid type: string is not a percentage"),
		},
		{
			name:                 "when maxUnhealthy is an int",
			maxUnhealthy:         &intstr.IntOrString{Type: intstr.Int, IntVal: 3},
			actualMachineCount:   2,
			expectedMaxUnhealthy: 3,
			expectedErr:          nil,
		},
		{
			name:                 "when maxUnhealthy is 40% (of 5)",
			maxUnhealthy:         &intstr.IntOrString{Type: intstr.String, StrVal: "40%"},
			actualMachineCount:   5,
			expectedMaxUnhealthy: 2,
			expectedErr:          nil,
		},
		{
			name:                 "when maxUnhealthy is 60% (of 7)",
			maxUnhealthy:         &intstr.IntOrString{Type: intstr.String, StrVal: "60%"},
			actualMachineCount:   7,
			expectedMaxUnhealthy: 4,
			expectedErr:          nil,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := NewWithT(t)

			mhc := &clusterv1.MachineHealthCheck{
				Spec: clusterv1.MachineHealthCheckSpec{
					MaxUnhealthy: tc.maxUnhealthy,
				},
				Status: clusterv1.MachineHealthCheckStatus{
					ExpectedMachines: tc.actualMachineCount,
				},
			}

			maxUnhealthy, err := getMaxUnhealthy(mhc)
			if tc.expectedErr != nil {
				g.Expect(err).To(MatchError(tc.expectedErr.Error()))
			} else {
				g.Expect(err).ToNot(HaveOccurred())
			}
			g.Expect(maxUnhealthy).To(Equal(tc.expectedMaxUnhealthy))
		})
	}
}

func ownerReferenceForCluster(ctx context.Context, g *WithT, c *clusterv1.Cluster) metav1.OwnerReference {
	// Fetch the cluster to populate the UID.
	cc := &clusterv1.Cluster{}
	g.Expect(env.Get(ctx, util.ObjectKey(c), cc)).To(Succeed())

	return metav1.OwnerReference{
		APIVersion: clusterv1.GroupVersion.String(),
		Kind:       "Cluster",
		Name:       cc.Name,
		UID:        cc.UID,
	}
}

// createCluster creates a Cluster and a KubeconfigSecret for that cluster in the given namespace.
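// It also marks the InfrastructureReady condition True, which the
// MachineHealthCheck controller requires before it will perform checks.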
func createCluster(g *WithT, namespaceName string) *clusterv1.Cluster {
	cluster := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: "test-cluster-",
			Namespace:    namespaceName,
		},
	}

	g.Expect(env.Create(ctx, cluster)).To(Succeed())

	// Make sure the cluster is in the cache before proceeding.
	g.Eventually(func() error {
		var cl clusterv1.Cluster
		return env.Get(ctx, util.ObjectKey(cluster), &cl)
	}, timeout, 100*time.Millisecond).Should(Succeed())

	// This is required for MHC to perform checks.
	patchHelper, err := patch.NewHelper(cluster, env.Client)
	g.Expect(err).ToNot(HaveOccurred())
	conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition)
	g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed())

	// Wait for the cluster in the cache to be updated post-patch.
	g.Eventually(func() bool {
		err := env.Get(ctx, util.ObjectKey(cluster), cluster)
		if err != nil {
			return false
		}

		return conditions.IsTrue(cluster, clusterv1.InfrastructureReadyCondition)
	}, timeout, 100*time.Millisecond).Should(BeTrue())

	g.Expect(env.CreateKubeconfigSecret(ctx, cluster)).To(Succeed())

	return cluster
}

// newRunningMachine creates a Machine object with Status.Phase == Running.
func newRunningMachine(c *clusterv1.Cluster, labels map[string]string) *clusterv1.Machine {
	return &clusterv1.Machine{
		TypeMeta: metav1.TypeMeta{
			APIVersion: clusterv1.GroupVersion.String(),
			Kind:       "Machine",
		},
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: "test-mhc-machine-",
			Namespace:    c.Namespace,
			Labels:       labels,
		},
		Spec: clusterv1.MachineSpec{
			ClusterName: c.Name,
			Bootstrap: clusterv1.Bootstrap{
				DataSecretName: pointer.String("data-secret-name"),
			},
		},
		Status: clusterv1.MachineStatus{
			InfrastructureReady: true,
			BootstrapReady:      true,
			Phase:               string(clusterv1.MachinePhaseRunning),
			ObservedGeneration:  1,
		},
	}
}

func newInfraMachine(machine *clusterv1.Machine) (*unstructured.Unstructured, string) {
	providerID := fmt.Sprintf("test:////%v", uuid.NewUUID())
	return &unstructured.Unstructured{
		Object: map[string]interface{}{
			"apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1",
			"kind":       "GenericInfrastructureMachine",
			"metadata": map[string]interface{}{
				"generateName": "test-mhc-machine-infra-",
				"namespace":    machine.Namespace,
			},
			"spec": map[string]interface{}{
				"providerID": providerID,
			},
		},
	}, providerID
}

type machinesWithNodes struct {
	count                      int
	nodeStatus                 corev1.ConditionStatus
	createNodeRefForMachine    bool
	firstMachineAsControlPlane bool
	labels                     map[string]string
	failureReason              string
	failureMessage             string
}

type machineWithNodesOption func(m *machinesWithNodes)

func count(n int) machineWithNodesOption {
	return func(m *machinesWithNodes) {
		m.count = n
	}
}

func firstMachineAsControlPlane() machineWithNodesOption {
	return func(m *machinesWithNodes) {
		m.firstMachineAsControlPlane = true
	}
}

func nodeStatus(s corev1.ConditionStatus) machineWithNodesOption {
	return func(m *machinesWithNodes) {
		m.nodeStatus = s
	}
}

func createNodeRefForMachine(b bool) machineWithNodesOption {
	return func(m *machinesWithNodes) {
		m.createNodeRefForMachine = b
	}
}

func machineLabels(l map[string]string) machineWithNodesOption {
	return func(m *machinesWithNodes) {
		m.labels = l
	}
}

func machineFailureReason(s string) machineWithNodesOption {
	return func(m *machinesWithNodes) {
		m.failureReason = s
	}
}

func machineFailureMessage(s string) machineWithNodesOption {
	return func(m *machinesWithNodes) {
		m.failureMessage = s
	}
}

func createMachinesWithNodes(
	g *WithT,
	c *clusterv1.Cluster,
	opts ...machineWithNodesOption,
) ([]*corev1.Node, []*clusterv1.Machine, func()) {
	o := &machinesWithNodes{}
	for _, op := range opts {
		op(o)
	}

	var (
		nodes         []*corev1.Node
		machines      []*clusterv1.Machine
		infraMachines []*unstructured.Unstructured
	)

	for i := 0; i < o.count; i++ {
		machine := newRunningMachine(c, o.labels)
		if i == 0 && o.firstMachineAsControlPlane {
			if machine.Labels == nil {
				machine.Labels = make(map[string]string)
			}
			machine.Labels[clusterv1.MachineControlPlaneLabel] = ""
		}
		infraMachine, providerID := newInfraMachine(machine)
		g.Expect(env.Create(ctx, infraMachine)).To(Succeed())
		infraMachines = append(infraMachines, infraMachine)
		fmt.Printf("inframachine created: %s\n", infraMachine.GetName())
		// Patch the status of the InfraMachine and mark it as ready.
		// NB. Status cannot be set during object creation so we need to patch
		// it separately.
		infraMachinePatch := client.MergeFrom(infraMachine.DeepCopy())
		g.Expect(unstructured.SetNestedField(infraMachine.Object, true, "status", "ready")).To(Succeed())
		g.Expect(env.Status().Patch(ctx, infraMachine, infraMachinePatch)).To(Succeed())

		machine.Spec.InfrastructureRef = corev1.ObjectReference{
			APIVersion: infraMachine.GetAPIVersion(),
			Kind:       infraMachine.GetKind(),
			Name:       infraMachine.GetName(),
		}
		g.Expect(env.Create(ctx, machine)).To(Succeed())
		fmt.Printf("machine created: %s\n", machine.GetName())

		// Before moving on we want to ensure that the machine has a valid
		// status. That is, LastUpdated should not be nil.
		g.Eventually(func() *metav1.Time {
			k := client.ObjectKey{
				Name:      machine.GetName(),
				Namespace: machine.GetNamespace(),
			}
			err := env.Get(ctx, k, machine)
			if err != nil {
				return nil
			}
			return machine.Status.LastUpdated
		}, timeout, 100*time.Millisecond).ShouldNot(BeNil())

		machinePatchHelper, err := patch.NewHelper(machine, env.Client)
		g.Expect(err).ToNot(HaveOccurred())

		if o.createNodeRefForMachine {
			// Create the node.
			node := &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					GenerateName: "test-mhc-node-",
				},
				Spec: corev1.NodeSpec{
					ProviderID: providerID,
				},
			}

			g.Expect(env.Create(ctx, node)).To(Succeed())
			fmt.Printf("node created: %s\n", node.GetName())

			// Patch the node status.
			nodePatchHelper, err := patch.NewHelper(node, env.Client)
			g.Expect(err).ToNot(HaveOccurred())

			node.Status.Conditions = []corev1.NodeCondition{
				{
					Type:               corev1.NodeReady,
					Status:             o.nodeStatus,
					LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
				},
			}

			g.Expect(nodePatchHelper.Patch(ctx, node)).To(Succeed())

			nodes = append(nodes, node)

			machine.Status.NodeRef = &corev1.ObjectReference{
				Name: node.Name,
			}
		}

		if o.failureReason != "" {
			failureReason := capierrors.MachineStatusError(o.failureReason)
			machine.Status.FailureReason = &failureReason
		}
		if o.failureMessage != "" {
			machine.Status.FailureMessage = pointer.String(o.failureMessage)
		}

		// Adding one second to ensure there is a difference from the
		// original time so that the patch works. That is, ensure the
		// precision isn't lost during conversions.
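		// (metav1.Time round-trips with second granularity, so a full second
		// is the smallest bump guaranteed to survive serialization.)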
		lastUp := metav1.NewTime(machine.Status.LastUpdated.Add(time.Second))
		machine.Status.LastUpdated = &lastUp

		// Patch the machine to record the status changes.
		g.Expect(machinePatchHelper.Patch(ctx, machine)).To(Succeed())

		machines = append(machines, machine)
	}

	cleanup := func() {
		fmt.Println("Cleaning up nodes, machines and infra machines.")
		for _, n := range nodes {
			if err := env.Delete(ctx, n); !apierrors.IsNotFound(err) {
				g.Expect(err).ToNot(HaveOccurred())
			}
		}
		for _, m := range machines {
			g.Expect(env.Delete(ctx, m)).To(Succeed())
		}
		for _, im := range infraMachines {
			if err := env.Delete(ctx, im); !apierrors.IsNotFound(err) {
				g.Expect(err).ToNot(HaveOccurred())
			}
		}
	}

	return nodes, machines, cleanup
}

func newMachineHealthCheckWithLabels(name, namespace, cluster string, labels map[string]string) *clusterv1.MachineHealthCheck {
	l := make(map[string]string, len(labels))
	for k, v := range labels {
		l[k] = v
	}
	l[clusterv1.ClusterNameLabel] = cluster

	mhc := newMachineHealthCheck(namespace, cluster)
	mhc.SetName(name)
	mhc.Labels = l
	mhc.Spec.Selector.MatchLabels = l

	return mhc
}

func newMachineHealthCheck(namespace, clusterName string) *clusterv1.MachineHealthCheck {
	maxUnhealthy := intstr.FromString("100%")
	return &clusterv1.MachineHealthCheck{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: "test-mhc-",
			Namespace:    namespace,
		},
		Spec: clusterv1.MachineHealthCheckSpec{
			ClusterName: clusterName,
			Selector: metav1.LabelSelector{
				MatchLabels: map[string]string{
					"selector": string(uuid.NewUUID()),
				},
			},
			MaxUnhealthy:       &maxUnhealthy,
			NodeStartupTimeout: &metav1.Duration{Duration: 1 * time.Millisecond},
			UnhealthyConditions: []clusterv1.UnhealthyCondition{
				{
					Type:    corev1.NodeReady,
					Status:  corev1.ConditionUnknown,
					Timeout: metav1.Duration{Duration: 5 * time.Minute},
				},
			},
		},
	}
}

func TestPatchTargets(t *testing.T) {
	g := NewWithT(t)

	namespace := metav1.NamespaceDefault
	clusterName := testClusterName
	defaultCluster := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName,
			Namespace: namespace,
		},
	}
	labels := map[string]string{"cluster": "foo", "nodepool": "bar"}

	mhc := newMachineHealthCheckWithLabels("mhc", namespace, clusterName, labels)
	machine1 := newTestMachine("machine1", namespace, clusterName, "nodeName", labels)
	machine1.ResourceVersion = "999"
	conditions.MarkTrue(machine1, clusterv1.MachineHealthCheckSucceededCondition)
	machine2 := machine1.DeepCopy()
	machine2.Name = "machine2"

	cl := fake.NewClientBuilder().WithObjects(
		machine1,
		machine2,
		mhc,
	).WithStatusSubresource(&clusterv1.MachineHealthCheck{}, &clusterv1.Machine{}).Build()
	r := &Reconciler{
		Client:   cl,
		recorder: record.NewFakeRecorder(32),
		Tracker:  remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), cl, scheme.Scheme, client.ObjectKey{Name: clusterName, Namespace: namespace}, "machinehealthcheck-watchClusterNodes"),
	}

	// To make the patch fail, create a patchHelper with a different client.
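	// The helper below writes to a throwaway fake client that only knows
	// about fakeMachine, so patching target1's Machine through it must error.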
	fakeMachine := machine1.DeepCopy()
	fakeMachine.Name = "fake"
	patchHelper, _ := patch.NewHelper(fakeMachine, fake.NewClientBuilder().WithObjects(fakeMachine).Build())
	// healthCheckTarget with the fake patchHelper; the patch should fail on this target.
	target1 := healthCheckTarget{
		MHC:         mhc,
		Machine:     machine1,
		patchHelper: patchHelper,
		Node:        &corev1.Node{},
	}

	// healthCheckTarget with the correct patchHelper.
	patchHelper2, _ := patch.NewHelper(machine2, cl)
	target3 := healthCheckTarget{
		MHC:         mhc,
		Machine:     machine2,
		patchHelper: patchHelper2,
		Node:        &corev1.Node{},
	}

	// The target with the wrong patch helper fails, but the other one is patched.
	g.Expect(r.patchUnhealthyTargets(context.TODO(), logr.New(log.NullLogSink{}), []healthCheckTarget{target1, target3}, defaultCluster, mhc)).ToNot(BeEmpty())
	g.Expect(cl.Get(ctx, client.ObjectKey{Name: machine2.Name, Namespace: machine2.Namespace}, machine2)).ToNot(HaveOccurred())
	g.Expect(conditions.Get(machine2, clusterv1.MachineOwnerRemediatedCondition).Status).To(Equal(corev1.ConditionFalse))

	// The target with the wrong patch helper fails, but the other one is patched.
	g.Expect(r.patchHealthyTargets(context.TODO(), logr.New(log.NullLogSink{}), []healthCheckTarget{target1, target3}, mhc)).ToNot(BeEmpty())
}