sigs.k8s.io/cluster-api@v1.7.1/controlplane/kubeadm/internal/controllers/remediation_test.go (about) 1 /* 2 Copyright 2020 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package controllers 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 "testing" 24 "time" 25 26 . "github.com/onsi/gomega" 27 "github.com/pkg/errors" 28 corev1 "k8s.io/api/core/v1" 29 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 "k8s.io/apimachinery/pkg/util/intstr" 31 "k8s.io/client-go/tools/record" 32 utilptr "k8s.io/utils/ptr" 33 "sigs.k8s.io/controller-runtime/pkg/client" 34 35 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 36 controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" 37 "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" 38 "sigs.k8s.io/cluster-api/util/collections" 39 "sigs.k8s.io/cluster-api/util/conditions" 40 "sigs.k8s.io/cluster-api/util/patch" 41 ) 42 43 func TestGetMachineToBeRemediated(t *testing.T) { 44 t.Run("returns the oldest machine if there are no provisioning machines", func(t *testing.T) { 45 g := NewWithT(t) 46 47 ns, err := env.CreateNamespace(ctx, "ns1") 48 g.Expect(err).ToNot(HaveOccurred()) 49 defer func() { 50 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 51 }() 52 53 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) 54 m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed()) 55 56 unhealthyMachines := collections.FromMachines(m1, m2) 57 58 
g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m1-unhealthy-")) 59 }) 60 61 t.Run("returns the oldest of the provisioning machines", func(t *testing.T) { 62 g := NewWithT(t) 63 64 ns, err := env.CreateNamespace(ctx, "ns1") 65 g.Expect(err).ToNot(HaveOccurred()) 66 defer func() { 67 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 68 }() 69 70 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) 71 m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef()) 72 m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef()) 73 74 unhealthyMachines := collections.FromMachines(m1, m2, m3) 75 76 g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m2-unhealthy-")) 77 }) 78 } 79 80 func TestReconcileUnhealthyMachines(t *testing.T) { 81 g := NewWithT(t) 82 83 r := &KubeadmControlPlaneReconciler{ 84 Client: env.GetClient(), 85 recorder: record.NewFakeRecorder(32), 86 } 87 ns, err := env.CreateNamespace(ctx, "ns1") 88 g.Expect(err).ToNot(HaveOccurred()) 89 defer func() { 90 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 91 }() 92 93 var removeFinalizer = func(g *WithT, m *clusterv1.Machine) { 94 patchHelper, err := patch.NewHelper(m, env.GetClient()) 95 g.Expect(err).ToNot(HaveOccurred()) 96 m.ObjectMeta.Finalizers = nil 97 g.Expect(patchHelper.Patch(ctx, m)).To(Succeed()) 98 } 99 100 t.Run("It cleans up stuck remediation on previously unhealthy machines", func(t *testing.T) { 101 g := NewWithT(t) 102 103 m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation()) 104 105 controlPlane := &internal.ControlPlane{ 106 KCP: &controlplanev1.KubeadmControlPlane{}, 107 Cluster: &clusterv1.Cluster{}, 108 Machines: collections.FromMachines(m), 109 } 110 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 111 112 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 113 
g.Expect(err).ToNot(HaveOccurred()) 114 115 g.Eventually(func() error { 116 if err := env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m); err != nil { 117 return err 118 } 119 c := conditions.Get(m, clusterv1.MachineOwnerRemediatedCondition) 120 if c == nil { 121 return nil 122 } 123 return errors.Errorf("condition %s still exists", clusterv1.MachineOwnerRemediatedCondition) 124 }, 10*time.Second).Should(Succeed()) 125 }) 126 127 // Generic preflight checks 128 // These are preflight checks that happen regardless of whether the control plane has already been initialized. 129 130 t.Run("Remediation does not happen if there are no unhealthy machines", func(t *testing.T) { 131 g := NewWithT(t) 132 133 controlPlane := &internal.ControlPlane{ 134 KCP: &controlplanev1.KubeadmControlPlane{}, 135 Cluster: &clusterv1.Cluster{}, 136 Machines: collections.New(), 137 } 138 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 139 140 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 141 g.Expect(err).ToNot(HaveOccurred()) 142 }) 143 t.Run("reconcileUnhealthyMachines return early if another remediation is in progress", func(t *testing.T) { 144 g := NewWithT(t) 145 146 m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation()) 147 conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "") 148 conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") 149 controlPlane := &internal.ControlPlane{ 150 KCP: &controlplanev1.KubeadmControlPlane{ 151 ObjectMeta: metav1.ObjectMeta{ 152 Annotations: map[string]string{ 153 controlplanev1.RemediationInProgressAnnotation: MustMarshalRemediationData(&RemediationData{ 154 Machine: "foo", 155 Timestamp: metav1.Time{Time: time.Now().UTC()}, 156 RetryCount: 0, 157 }), 158 }, 159 }, 160 }, 161 Cluster: &clusterv1.Cluster{}, 
162 Machines: collections.FromMachines(m), 163 } 164 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 165 166 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 167 g.Expect(err).ToNot(HaveOccurred()) 168 }) 169 t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is already being deleted", func(t *testing.T) { 170 g := NewWithT(t) 171 172 m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed()) 173 conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "") 174 conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") 175 controlPlane := &internal.ControlPlane{ 176 KCP: &controlplanev1.KubeadmControlPlane{}, 177 Cluster: &clusterv1.Cluster{}, 178 Machines: collections.FromMachines(m), 179 } 180 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 181 182 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 183 184 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 185 g.Expect(err).ToNot(HaveOccurred()) 186 }) 187 t.Run("Remediation does not happen if MaxRetry is reached", func(t *testing.T) { 188 g := NewWithT(t) 189 190 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ 191 Machine: "m0", 192 Timestamp: metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthy not expired yet. 
193 RetryCount: 3, 194 }))) 195 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 196 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 197 198 controlPlane := &internal.ControlPlane{ 199 KCP: &controlplanev1.KubeadmControlPlane{ 200 Spec: controlplanev1.KubeadmControlPlaneSpec{ 201 Replicas: utilptr.To[int32](3), 202 Version: "v1.19.1", 203 RemediationStrategy: &controlplanev1.RemediationStrategy{ 204 MaxRetry: utilptr.To[int32](3), 205 }, 206 }, 207 }, 208 Cluster: &clusterv1.Cluster{}, 209 Machines: collections.FromMachines(m1, m2, m3), 210 } 211 212 r := &KubeadmControlPlaneReconciler{ 213 Client: env.GetClient(), 214 recorder: record.NewFakeRecorder(32), 215 managementCluster: &fakeManagementCluster{ 216 Workload: fakeWorkloadCluster{ 217 EtcdMembersResult: nodes(controlPlane.Machines), 218 }, 219 }, 220 } 221 222 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 223 224 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 225 g.Expect(err).ToNot(HaveOccurred()) 226 227 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 228 229 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed 3 times (MaxRetry)") 230 231 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 232 g.Expect(err).ToNot(HaveOccurred()) 233 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue()) 234 235 removeFinalizer(g, m1) 236 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 237 }) 238 t.Run("Retry history is ignored if min healthy period is expired, default min healthy period", func(t *testing.T) { 239 g := NewWithT(t) 240 241 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), 
withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ 242 Machine: "m0", 243 Timestamp: metav1.Time{Time: time.Now().Add(-2 * controlplanev1.DefaultMinHealthyPeriod).UTC()}, // minHealthyPeriod already expired. 244 RetryCount: 3, 245 }))) 246 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 247 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 248 249 controlPlane := &internal.ControlPlane{ 250 KCP: &controlplanev1.KubeadmControlPlane{ 251 Spec: controlplanev1.KubeadmControlPlaneSpec{ 252 Replicas: utilptr.To[int32](3), 253 Version: "v1.19.1", 254 RemediationStrategy: &controlplanev1.RemediationStrategy{ 255 MaxRetry: utilptr.To[int32](3), 256 }, 257 }, 258 }, 259 Cluster: &clusterv1.Cluster{}, 260 Machines: collections.FromMachines(m1, m2, m3), 261 } 262 263 r := &KubeadmControlPlaneReconciler{ 264 Client: env.GetClient(), 265 recorder: record.NewFakeRecorder(32), 266 managementCluster: &fakeManagementCluster{ 267 Workload: fakeWorkloadCluster{ 268 EtcdMembersResult: nodes(controlPlane.Machines), 269 }, 270 }, 271 } 272 273 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 274 275 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 276 g.Expect(err).ToNot(HaveOccurred()) 277 278 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 279 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 280 g.Expect(err).ToNot(HaveOccurred()) 281 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 282 g.Expect(remediationData.RetryCount).To(Equal(0)) 283 284 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 285 286 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 287 
g.Expect(err).ToNot(HaveOccurred()) 288 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 289 290 removeFinalizer(g, m1) 291 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 292 }) 293 t.Run("Retry history is ignored if min healthy period is expired", func(t *testing.T) { 294 g := NewWithT(t) 295 296 minHealthyPeriod := 4 * controlplanev1.DefaultMinHealthyPeriod // big min healthy period, so we are sure that we are not using DefaultMinHealthyPeriod. 297 298 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ 299 Machine: "m0", 300 Timestamp: metav1.Time{Time: time.Now().Add(-2 * minHealthyPeriod).UTC()}, // minHealthyPeriod already expired. 301 RetryCount: 3, 302 }))) 303 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 304 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 305 306 controlPlane := &internal.ControlPlane{ 307 KCP: &controlplanev1.KubeadmControlPlane{ 308 Spec: controlplanev1.KubeadmControlPlaneSpec{ 309 Replicas: utilptr.To[int32](3), 310 Version: "v1.19.1", 311 RemediationStrategy: &controlplanev1.RemediationStrategy{ 312 MaxRetry: utilptr.To[int32](3), 313 MinHealthyPeriod: &metav1.Duration{Duration: minHealthyPeriod}, 314 }, 315 }, 316 }, 317 Cluster: &clusterv1.Cluster{}, 318 Machines: collections.FromMachines(m1, m2, m3), 319 } 320 321 r := &KubeadmControlPlaneReconciler{ 322 Client: env.GetClient(), 323 recorder: record.NewFakeRecorder(32), 324 managementCluster: &fakeManagementCluster{ 325 Workload: fakeWorkloadCluster{ 326 EtcdMembersResult: nodes(controlPlane.Machines), 327 }, 328 }, 329 } 330 331 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 332 333 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 334 g.Expect(err).ToNot(HaveOccurred()) 335 336 
g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 337 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 338 g.Expect(err).ToNot(HaveOccurred()) 339 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 340 g.Expect(remediationData.RetryCount).To(Equal(0)) 341 342 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 343 344 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 345 g.Expect(err).ToNot(HaveOccurred()) 346 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 347 348 removeFinalizer(g, m1) 349 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 350 }) 351 t.Run("Remediation does not happen if RetryPeriod is not yet passed", func(t *testing.T) { 352 g := NewWithT(t) 353 354 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ 355 Machine: "m0", 356 Timestamp: metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthyPeriod not yet expired. 357 RetryCount: 2, 358 }))) 359 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 360 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 361 362 controlPlane := &internal.ControlPlane{ 363 KCP: &controlplanev1.KubeadmControlPlane{ 364 Spec: controlplanev1.KubeadmControlPlaneSpec{ 365 Replicas: utilptr.To[int32](3), 366 Version: "v1.19.1", 367 RemediationStrategy: &controlplanev1.RemediationStrategy{ 368 MaxRetry: utilptr.To[int32](3), 369 RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryPeriod not yet expired. 
370 }, 371 }, 372 }, 373 Cluster: &clusterv1.Cluster{}, 374 Machines: collections.FromMachines(m1, m2, m3), 375 } 376 377 r := &KubeadmControlPlaneReconciler{ 378 Client: env.GetClient(), 379 recorder: record.NewFakeRecorder(32), 380 managementCluster: &fakeManagementCluster{ 381 Workload: fakeWorkloadCluster{ 382 EtcdMembersResult: nodes(controlPlane.Machines), 383 }, 384 }, 385 } 386 387 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 388 389 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 390 g.Expect(err).ToNot(HaveOccurred()) 391 392 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 393 394 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryPeriod)") 395 396 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 397 g.Expect(err).ToNot(HaveOccurred()) 398 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue()) 399 400 removeFinalizer(g, m1) 401 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 402 }) 403 404 // There are no preflight checks for when control plane is not yet initialized 405 // (it is the first CP, we can nuke it). 406 407 // Preflight checks for when control plane is already initialized. 
408 409 t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) { 410 g := NewWithT(t) 411 412 m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) 413 controlPlane := &internal.ControlPlane{ 414 KCP: &controlplanev1.KubeadmControlPlane{ 415 Spec: controlplanev1.KubeadmControlPlaneSpec{ 416 Replicas: utilptr.To[int32](1), 417 RolloutStrategy: &controlplanev1.RolloutStrategy{ 418 RollingUpdate: &controlplanev1.RollingUpdate{ 419 MaxSurge: &intstr.IntOrString{ 420 IntVal: 1, 421 }, 422 }, 423 }, 424 }, 425 Status: controlplanev1.KubeadmControlPlaneStatus{ 426 Initialized: true, 427 }, 428 }, 429 Cluster: &clusterv1.Cluster{}, 430 Machines: collections.FromMachines(m), 431 } 432 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 433 434 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 435 g.Expect(err).ToNot(HaveOccurred()) 436 437 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 438 439 assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") 440 441 g.Expect(env.Cleanup(ctx, m)).To(Succeed()) 442 }) 443 t.Run("Remediation does not happen if there is another machine being deleted (not the one to be remediated)", func(t *testing.T) { 444 g := NewWithT(t) 445 446 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) 447 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") 448 m3 := getDeletingMachine(ns.Name, "m3-deleting") // NB. 
This machine is not created, it gets only added to control plane 449 controlPlane := &internal.ControlPlane{ 450 KCP: &controlplanev1.KubeadmControlPlane{ 451 Spec: controlplanev1.KubeadmControlPlaneSpec{ 452 Replicas: utilptr.To[int32](3), 453 }, 454 Status: controlplanev1.KubeadmControlPlaneStatus{ 455 Initialized: true, 456 }, 457 }, 458 Cluster: &clusterv1.Cluster{}, 459 Machines: collections.FromMachines(m1, m2, m3), 460 } 461 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 462 463 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 464 g.Expect(err).ToNot(HaveOccurred()) 465 466 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 467 468 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") 469 470 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 471 }) 472 t.Run("Remediation does not happen if there is an healthy machine being provisioned", func(t *testing.T) { 473 g := NewWithT(t) 474 475 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) 476 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") 477 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withoutNodeRef()) // Provisioning 478 controlPlane := &internal.ControlPlane{ 479 KCP: &controlplanev1.KubeadmControlPlane{ 480 Spec: controlplanev1.KubeadmControlPlaneSpec{ 481 Replicas: utilptr.To(int32(3)), 482 }, 483 Status: controlplanev1.KubeadmControlPlaneStatus{ 484 Initialized: true, 485 }, 486 }, 487 Cluster: &clusterv1.Cluster{}, 488 Machines: collections.FromMachines(m1, m2, m3), 489 } 490 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 491 492 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 493 g.Expect(err).ToNot(HaveOccurred()) 494 495 
g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 496 497 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation") 498 499 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 500 }) 501 t.Run("Remediation does not happen if there is an healthy machine being provisioned - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { 502 g := NewWithT(t) 503 504 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) 505 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") 506 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-") 507 m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withoutNodeRef()) // Provisioning 508 controlPlane := &internal.ControlPlane{ 509 KCP: &controlplanev1.KubeadmControlPlane{ 510 Spec: controlplanev1.KubeadmControlPlaneSpec{ 511 Replicas: utilptr.To(int32(3)), 512 }, 513 Status: controlplanev1.KubeadmControlPlaneStatus{ 514 Initialized: true, 515 }, 516 }, 517 Cluster: &clusterv1.Cluster{}, 518 Machines: collections.FromMachines(m1, m2, m3, m4), 519 } 520 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 521 522 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 523 g.Expect(err).ToNot(HaveOccurred()) 524 525 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 526 527 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation") 528 529 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 530 }) 531 t.Run("Remediation does not happen if there is at least one additional unhealthy etcd 
member on a 3 machine CP", func(t *testing.T) { 532 g := NewWithT(t) 533 534 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 535 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 536 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) 537 538 controlPlane := &internal.ControlPlane{ 539 KCP: &controlplanev1.KubeadmControlPlane{ 540 Spec: controlplanev1.KubeadmControlPlaneSpec{ 541 Replicas: utilptr.To[int32](3), 542 }, 543 Status: controlplanev1.KubeadmControlPlaneStatus{ 544 Initialized: true, 545 }, 546 }, 547 Cluster: &clusterv1.Cluster{}, 548 Machines: collections.FromMachines(m1, m2, m3), 549 } 550 551 r := &KubeadmControlPlaneReconciler{ 552 Client: env.GetClient(), 553 recorder: record.NewFakeRecorder(32), 554 managementCluster: &fakeManagementCluster{ 555 Workload: fakeWorkloadCluster{ 556 EtcdMembersResult: nodes(controlPlane.Machines), 557 }, 558 }, 559 } 560 controlPlane.InjectTestManagementCluster(r.managementCluster) 561 562 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 563 564 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 565 g.Expect(err).ToNot(HaveOccurred()) 566 567 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 568 569 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") 570 571 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 572 }) 573 t.Run("Remediation does not happen if there are at least two additional unhealthy etcd member on a 5 machine CP", func(t *testing.T) { 574 g := NewWithT(t) 575 576 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 577 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", 
withUnhealthyEtcdMember()) 578 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) 579 m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) 580 m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) 581 582 controlPlane := &internal.ControlPlane{ 583 KCP: &controlplanev1.KubeadmControlPlane{ 584 Spec: controlplanev1.KubeadmControlPlaneSpec{ 585 Replicas: utilptr.To[int32](5), 586 }, 587 Status: controlplanev1.KubeadmControlPlaneStatus{ 588 Initialized: true, 589 }, 590 }, 591 Cluster: &clusterv1.Cluster{}, 592 Machines: collections.FromMachines(m1, m2, m3, m4, m5), 593 } 594 595 r := &KubeadmControlPlaneReconciler{ 596 Client: env.GetClient(), 597 recorder: record.NewFakeRecorder(32), 598 managementCluster: &fakeManagementCluster{ 599 Workload: fakeWorkloadCluster{ 600 EtcdMembersResult: nodes(controlPlane.Machines), 601 }, 602 }, 603 } 604 controlPlane.InjectTestManagementCluster(r.managementCluster) 605 606 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 607 608 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 609 g.Expect(err).ToNot(HaveOccurred()) 610 611 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 612 613 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") 614 615 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) 616 }) 617 618 // Remediation for when control plane is not yet initialized 619 620 t.Run("Remediation deletes unhealthy machine - 1 CP not initialized", func(t *testing.T) { 621 g := NewWithT(t) 622 623 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 624 625 controlPlane := &internal.ControlPlane{ 
626 KCP: &controlplanev1.KubeadmControlPlane{ 627 Spec: controlplanev1.KubeadmControlPlaneSpec{ 628 Replicas: utilptr.To[int32](1), 629 Version: "v1.19.1", 630 }, 631 Status: controlplanev1.KubeadmControlPlaneStatus{ 632 Initialized: false, 633 }, 634 }, 635 Cluster: &clusterv1.Cluster{}, 636 Machines: collections.FromMachines(m1), 637 } 638 639 r := &KubeadmControlPlaneReconciler{ 640 Client: env.GetClient(), 641 recorder: record.NewFakeRecorder(32), 642 managementCluster: &fakeManagementCluster{ 643 Workload: fakeWorkloadCluster{ 644 EtcdMembersResult: nodes(controlPlane.Machines), 645 }, 646 }, 647 } 648 649 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 650 651 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 652 g.Expect(err).ToNot(HaveOccurred()) 653 654 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 655 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 656 g.Expect(err).ToNot(HaveOccurred()) 657 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 658 g.Expect(remediationData.RetryCount).To(Equal(0)) 659 660 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 661 662 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 663 g.Expect(err).ToNot(HaveOccurred()) 664 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 665 666 removeFinalizer(g, m1) 667 g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) 668 }) 669 t.Run("Subsequent remediation of the same machine increase retry count - 1 CP not initialized", func(t *testing.T) { 670 g := NewWithT(t) 671 672 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 673 674 controlPlane := &internal.ControlPlane{ 675 KCP: 
&controlplanev1.KubeadmControlPlane{ 676 Spec: controlplanev1.KubeadmControlPlaneSpec{ 677 Replicas: utilptr.To[int32](1), 678 Version: "v1.19.1", 679 }, 680 Status: controlplanev1.KubeadmControlPlaneStatus{ 681 Initialized: false, 682 }, 683 }, 684 Cluster: &clusterv1.Cluster{}, 685 Machines: collections.FromMachines(m1), 686 } 687 688 // First reconcile, remediate machine m1 for the first time 689 r := &KubeadmControlPlaneReconciler{ 690 Client: env.GetClient(), 691 recorder: record.NewFakeRecorder(32), 692 managementCluster: &fakeManagementCluster{ 693 Workload: fakeWorkloadCluster{ 694 EtcdMembersResult: nodes(controlPlane.Machines), 695 }, 696 }, 697 } 698 699 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 700 701 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 702 g.Expect(err).ToNot(HaveOccurred()) 703 704 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 705 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 706 g.Expect(err).ToNot(HaveOccurred()) 707 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 708 g.Expect(remediationData.RetryCount).To(Equal(0)) 709 710 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 711 712 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 713 g.Expect(err).ToNot(HaveOccurred()) 714 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 715 716 removeFinalizer(g, m1) 717 g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed()) 718 719 for i := 2; i < 4; i++ { 720 // Simulate the creation of a replacement for 0. 
721 mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 722 723 // Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine. 724 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 725 726 controlPlane.Machines = collections.FromMachines(mi) 727 728 // Reconcile unhealthy replacements for m1. 729 r.managementCluster = &fakeManagementCluster{ 730 Workload: fakeWorkloadCluster{ 731 EtcdMembersResult: nodes(collections.FromMachines(mi)), 732 }, 733 } 734 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 735 736 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 737 g.Expect(err).ToNot(HaveOccurred()) 738 739 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 740 remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 741 g.Expect(err).ToNot(HaveOccurred()) 742 g.Expect(remediationData.Machine).To(Equal(mi.Name)) 743 g.Expect(remediationData.RetryCount).To(Equal(i - 1)) 744 745 assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 746 747 err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi) 748 g.Expect(err).ToNot(HaveOccurred()) 749 g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 750 751 removeFinalizer(g, mi) 752 g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed()) 753 } 754 }) 755 756 // Remediation for when control plane is already initialized 757 758 t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) { 759 g := NewWithT(t) 760 761 m1 := createMachine(ctx, g, ns.Name, 
"m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 762 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 763 764 controlPlane := &internal.ControlPlane{ 765 KCP: &controlplanev1.KubeadmControlPlane{ 766 Spec: controlplanev1.KubeadmControlPlaneSpec{ 767 Replicas: utilptr.To[int32](2), 768 Version: "v1.19.1", 769 }, 770 Status: controlplanev1.KubeadmControlPlaneStatus{ 771 Initialized: true, 772 }, 773 }, 774 Cluster: &clusterv1.Cluster{}, 775 Machines: collections.FromMachines(m1, m2), 776 } 777 778 r := &KubeadmControlPlaneReconciler{ 779 Client: env.GetClient(), 780 recorder: record.NewFakeRecorder(32), 781 managementCluster: &fakeManagementCluster{ 782 Workload: fakeWorkloadCluster{ 783 EtcdMembersResult: nodes(controlPlane.Machines), 784 }, 785 }, 786 } 787 controlPlane.InjectTestManagementCluster(r.managementCluster) 788 789 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 790 791 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 792 g.Expect(err).ToNot(HaveOccurred()) 793 794 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 795 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 796 g.Expect(err).ToNot(HaveOccurred()) 797 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 798 g.Expect(remediationData.RetryCount).To(Equal(0)) 799 800 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 801 802 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 803 g.Expect(err).ToNot(HaveOccurred()) 804 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 805 806 removeFinalizer(g, m1) 807 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 808 }) 809 t.Run("Remediation deletes unhealthy machine - 
3 CP", func(t *testing.T) { 810 g := NewWithT(t) 811 812 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 813 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 814 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 815 816 controlPlane := &internal.ControlPlane{ 817 KCP: &controlplanev1.KubeadmControlPlane{ 818 Spec: controlplanev1.KubeadmControlPlaneSpec{ 819 Replicas: utilptr.To[int32](3), 820 Version: "v1.19.1", 821 }, 822 Status: controlplanev1.KubeadmControlPlaneStatus{ 823 Initialized: true, 824 }, 825 }, 826 Cluster: &clusterv1.Cluster{}, 827 Machines: collections.FromMachines(m1, m2, m3), 828 } 829 830 r := &KubeadmControlPlaneReconciler{ 831 Client: env.GetClient(), 832 recorder: record.NewFakeRecorder(32), 833 managementCluster: &fakeManagementCluster{ 834 Workload: fakeWorkloadCluster{ 835 EtcdMembersResult: nodes(controlPlane.Machines), 836 }, 837 }, 838 } 839 controlPlane.InjectTestManagementCluster(r.managementCluster) 840 841 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 842 843 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 844 g.Expect(err).ToNot(HaveOccurred()) 845 846 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 847 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 848 g.Expect(err).ToNot(HaveOccurred()) 849 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 850 g.Expect(remediationData.RetryCount).To(Equal(0)) 851 852 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 853 854 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 855 g.Expect(err).ToNot(HaveOccurred()) 856 
g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 857 858 removeFinalizer(g, m1) 859 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 860 }) 861 t.Run("Remediation deletes unhealthy machine failed to provision - 3 CP", func(t *testing.T) { 862 g := NewWithT(t) 863 864 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withoutNodeRef()) 865 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 866 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 867 868 controlPlane := &internal.ControlPlane{ 869 KCP: &controlplanev1.KubeadmControlPlane{ 870 Spec: controlplanev1.KubeadmControlPlaneSpec{ 871 Replicas: utilptr.To(int32(3)), 872 Version: "v1.19.1", 873 }, 874 Status: controlplanev1.KubeadmControlPlaneStatus{ 875 Initialized: true, 876 }, 877 }, 878 Cluster: &clusterv1.Cluster{}, 879 Machines: collections.FromMachines(m1, m2, m3), 880 } 881 882 r := &KubeadmControlPlaneReconciler{ 883 Client: env.GetClient(), 884 recorder: record.NewFakeRecorder(32), 885 managementCluster: &fakeManagementCluster{ 886 Workload: fakeWorkloadCluster{ 887 EtcdMembersResult: nodes(controlPlane.Machines), 888 }, 889 }, 890 } 891 controlPlane.InjectTestManagementCluster(r.managementCluster) 892 893 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 894 895 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 896 g.Expect(err).ToNot(HaveOccurred()) 897 898 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 899 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 900 g.Expect(err).ToNot(HaveOccurred()) 901 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 902 g.Expect(remediationData.RetryCount).To(Equal(0)) 903 904 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, 
corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 905 906 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 907 g.Expect(err).ToNot(HaveOccurred()) 908 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 909 910 removeFinalizer(g, m1) 911 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 912 }) 913 t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { 914 g := NewWithT(t) 915 916 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 917 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 918 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 919 m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember()) 920 921 controlPlane := &internal.ControlPlane{ 922 KCP: &controlplanev1.KubeadmControlPlane{ 923 Spec: controlplanev1.KubeadmControlPlaneSpec{ 924 Replicas: utilptr.To[int32](4), 925 Version: "v1.19.1", 926 }, 927 Status: controlplanev1.KubeadmControlPlaneStatus{ 928 Initialized: true, 929 }, 930 }, 931 Cluster: &clusterv1.Cluster{}, 932 Machines: collections.FromMachines(m1, m2, m3, m4), 933 } 934 935 r := &KubeadmControlPlaneReconciler{ 936 Client: env.GetClient(), 937 recorder: record.NewFakeRecorder(32), 938 managementCluster: &fakeManagementCluster{ 939 Workload: fakeWorkloadCluster{ 940 EtcdMembersResult: nodes(controlPlane.Machines), 941 }, 942 }, 943 } 944 controlPlane.InjectTestManagementCluster(r.managementCluster) 945 946 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 947 948 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 949 g.Expect(err).ToNot(HaveOccurred()) 950 951 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 952 remediationData, err := 
RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 953 g.Expect(err).ToNot(HaveOccurred()) 954 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 955 g.Expect(remediationData.RetryCount).To(Equal(0)) 956 957 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 958 959 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 960 g.Expect(err).ToNot(HaveOccurred()) 961 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 962 963 removeFinalizer(g, m1) 964 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) 965 }) 966 t.Run("Remediation deletes unhealthy machine failed to provision - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { 967 g := NewWithT(t) 968 969 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withoutNodeRef()) 970 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 971 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 972 m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember()) 973 974 controlPlane := &internal.ControlPlane{ 975 KCP: &controlplanev1.KubeadmControlPlane{ 976 Spec: controlplanev1.KubeadmControlPlaneSpec{ 977 Replicas: utilptr.To(int32(4)), 978 Version: "v1.19.1", 979 }, 980 Status: controlplanev1.KubeadmControlPlaneStatus{ 981 Initialized: true, 982 }, 983 }, 984 Cluster: &clusterv1.Cluster{}, 985 Machines: collections.FromMachines(m1, m2, m3, m4), 986 } 987 988 r := &KubeadmControlPlaneReconciler{ 989 Client: env.GetClient(), 990 recorder: record.NewFakeRecorder(32), 991 managementCluster: &fakeManagementCluster{ 992 Workload: fakeWorkloadCluster{ 993 EtcdMembersResult: nodes(controlPlane.Machines), 994 }, 995 }, 996 } 997 
controlPlane.InjectTestManagementCluster(r.managementCluster) 998 999 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 1000 1001 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1002 g.Expect(err).ToNot(HaveOccurred()) 1003 1004 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1005 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1006 g.Expect(err).ToNot(HaveOccurred()) 1007 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 1008 g.Expect(remediationData.RetryCount).To(Equal(0)) 1009 1010 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1011 1012 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 1013 g.Expect(err).ToNot(HaveOccurred()) 1014 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1015 1016 removeFinalizer(g, m1) 1017 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) 1018 }) 1019 t.Run("Remediation fails gracefully if no healthy Control Planes are available to become etcd leader", func(t *testing.T) { 1020 g := NewWithT(t) 1021 1022 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1023 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) 1024 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) 1025 m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) 1026 1027 controlPlane := &internal.ControlPlane{ 1028 KCP: &controlplanev1.KubeadmControlPlane{ 1029 Spec: controlplanev1.KubeadmControlPlaneSpec{ 1030 Replicas: utilptr.To[int32](4), 1031 Version: "v1.19.1", 1032 }, 1033 Status: 
controlplanev1.KubeadmControlPlaneStatus{ 1034 Initialized: true, 1035 }, 1036 }, 1037 Cluster: &clusterv1.Cluster{}, 1038 Machines: collections.FromMachines(m1, m2, m3, m4), 1039 } 1040 1041 r := &KubeadmControlPlaneReconciler{ 1042 Client: env.GetClient(), 1043 recorder: record.NewFakeRecorder(32), 1044 managementCluster: &fakeManagementCluster{ 1045 Workload: fakeWorkloadCluster{ 1046 EtcdMembersResult: nodes(controlPlane.Machines), 1047 }, 1048 }, 1049 } 1050 controlPlane.InjectTestManagementCluster(r.managementCluster) 1051 1052 _, err = r.reconcileUnhealthyMachines(ctx, controlPlane) 1053 g.Expect(err).ToNot(HaveOccurred()) 1054 1055 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1056 1057 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, 1058 "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. 
Skipping remediation") 1059 1060 removeFinalizer(g, m1) 1061 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) 1062 }) 1063 t.Run("Subsequent remediation of the same machine increase retry count - 3 CP", func(t *testing.T) { 1064 g := NewWithT(t) 1065 1066 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1067 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 1068 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 1069 1070 controlPlane := &internal.ControlPlane{ 1071 KCP: &controlplanev1.KubeadmControlPlane{ 1072 Spec: controlplanev1.KubeadmControlPlaneSpec{ 1073 Replicas: utilptr.To[int32](1), 1074 Version: "v1.19.1", 1075 }, 1076 Status: controlplanev1.KubeadmControlPlaneStatus{ 1077 Initialized: false, 1078 }, 1079 }, 1080 Cluster: &clusterv1.Cluster{}, 1081 Machines: collections.FromMachines(m1, m2, m3), 1082 } 1083 1084 // First reconcile, remediate machine m1 for the first time 1085 r := &KubeadmControlPlaneReconciler{ 1086 Client: env.GetClient(), 1087 recorder: record.NewFakeRecorder(32), 1088 managementCluster: &fakeManagementCluster{ 1089 Workload: fakeWorkloadCluster{ 1090 EtcdMembersResult: nodes(controlPlane.Machines), 1091 }, 1092 }, 1093 } 1094 1095 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 1096 1097 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1098 g.Expect(err).ToNot(HaveOccurred()) 1099 1100 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1101 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1102 g.Expect(err).ToNot(HaveOccurred()) 1103 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 1104 g.Expect(remediationData.RetryCount).To(Equal(0)) 1105 1106 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, 
corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1107 1108 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 1109 g.Expect(err).ToNot(HaveOccurred()) 1110 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1111 1112 removeFinalizer(g, m1) 1113 g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed()) 1114 1115 for i := 5; i < 6; i++ { 1116 // Simulate the creation of a replacement for m1. 1117 mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1118 1119 // Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine. 1120 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1121 controlPlane.Machines = collections.FromMachines(mi, m2, m3) 1122 1123 // Reconcile unhealthy replacements for m1. 
1124 r.managementCluster = &fakeManagementCluster{ 1125 Workload: fakeWorkloadCluster{ 1126 EtcdMembersResult: nodes(collections.FromMachines(mi, m2, m3)), 1127 }, 1128 } 1129 1130 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 1131 1132 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1133 g.Expect(err).ToNot(HaveOccurred()) 1134 1135 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1136 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1137 g.Expect(err).ToNot(HaveOccurred()) 1138 g.Expect(remediationData.Machine).To(Equal(mi.Name)) 1139 g.Expect(remediationData.RetryCount).To(Equal(i - 4)) 1140 1141 assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1142 1143 err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi) 1144 g.Expect(err).ToNot(HaveOccurred()) 1145 g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1146 1147 removeFinalizer(g, mi) 1148 g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed()) 1149 } 1150 1151 g.Expect(env.CleanupAndWait(ctx, m2, m3)).To(Succeed()) 1152 }) 1153 } 1154 1155 func TestReconcileUnhealthyMachinesSequences(t *testing.T) { 1156 var removeFinalizer = func(g *WithT, m *clusterv1.Machine) { 1157 patchHelper, err := patch.NewHelper(m, env.GetClient()) 1158 g.Expect(err).ToNot(HaveOccurred()) 1159 m.ObjectMeta.Finalizers = nil 1160 g.Expect(patchHelper.Patch(ctx, m)).To(Succeed()) 1161 } 1162 1163 t.Run("Remediates the first CP machine having problems to come up", func(t *testing.T) { 1164 g := NewWithT(t) 1165 1166 ns, err := env.CreateNamespace(ctx, "ns1") 1167 g.Expect(err).ToNot(HaveOccurred()) 1168 defer func() { 1169 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 1170 }() 1171 1172 // Control plane not 
initialized yet, First CP is unhealthy and gets remediated: 1173 1174 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1175 1176 controlPlane := &internal.ControlPlane{ 1177 KCP: &controlplanev1.KubeadmControlPlane{ 1178 Spec: controlplanev1.KubeadmControlPlaneSpec{ 1179 Replicas: utilptr.To[int32](3), 1180 Version: "v1.19.1", 1181 }, 1182 Status: controlplanev1.KubeadmControlPlaneStatus{ 1183 Initialized: false, 1184 }, 1185 }, 1186 Cluster: &clusterv1.Cluster{}, 1187 Machines: collections.FromMachines(m1), 1188 } 1189 1190 r := &KubeadmControlPlaneReconciler{ 1191 Client: env.GetClient(), 1192 recorder: record.NewFakeRecorder(32), 1193 managementCluster: &fakeManagementCluster{ 1194 Workload: fakeWorkloadCluster{ 1195 EtcdMembersResult: nodes(controlPlane.Machines), 1196 }, 1197 }, 1198 } 1199 1200 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 1201 1202 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1203 g.Expect(err).ToNot(HaveOccurred()) 1204 1205 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1206 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1207 g.Expect(err).ToNot(HaveOccurred()) 1208 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 1209 g.Expect(remediationData.RetryCount).To(Equal(0)) 1210 1211 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1212 1213 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 1214 g.Expect(err).ToNot(HaveOccurred()) 1215 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1216 1217 removeFinalizer(g, m1) 1218 g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) 1219 1220 // Fake scaling up, which creates a remediation 
machine, fast forwards to when also the replacement machine is marked unhealthy. 1221 // NOTE: scale up also resets remediation in progress and remediation counts. 1222 1223 m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1224 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1225 1226 // Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2) 1227 1228 controlPlane.Machines = collections.FromMachines(m2) 1229 r.managementCluster = &fakeManagementCluster{ 1230 Workload: fakeWorkloadCluster{ 1231 EtcdMembersResult: nodes(controlPlane.Machines), 1232 }, 1233 } 1234 1235 ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) 1236 1237 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1238 g.Expect(err).ToNot(HaveOccurred()) 1239 1240 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1241 remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1242 g.Expect(err).ToNot(HaveOccurred()) 1243 g.Expect(remediationData.Machine).To(Equal(m2.Name)) 1244 g.Expect(remediationData.RetryCount).To(Equal(1)) 1245 1246 assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1247 1248 err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m1) 1249 g.Expect(err).ToNot(HaveOccurred()) 1250 g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1251 1252 removeFinalizer(g, m2) 1253 g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) 1254 1255 // Fake scaling up, which creates a remediation machine, which is healthy. 
1256 // NOTE: scale up also resets remediation in progress and remediation counts. 1257 1258 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1259 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1260 1261 g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) 1262 }) 1263 1264 t.Run("Remediates the second CP machine having problems to come up", func(t *testing.T) { 1265 g := NewWithT(t) 1266 1267 ns, err := env.CreateNamespace(ctx, "ns1") 1268 g.Expect(err).ToNot(HaveOccurred()) 1269 defer func() { 1270 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 1271 }() 1272 1273 // Control plane initialized yet, First CP healthy, second CP is unhealthy and gets remediated: 1274 1275 m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember()) 1276 m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1277 1278 controlPlane := &internal.ControlPlane{ 1279 KCP: &controlplanev1.KubeadmControlPlane{ 1280 Spec: controlplanev1.KubeadmControlPlaneSpec{ 1281 Replicas: utilptr.To[int32](3), 1282 Version: "v1.19.1", 1283 RolloutStrategy: &controlplanev1.RolloutStrategy{ 1284 RollingUpdate: &controlplanev1.RollingUpdate{ 1285 MaxSurge: &intstr.IntOrString{ 1286 IntVal: 1, 1287 }, 1288 }, 1289 }, 1290 }, 1291 Status: controlplanev1.KubeadmControlPlaneStatus{ 1292 Initialized: true, 1293 }, 1294 }, 1295 Cluster: &clusterv1.Cluster{}, 1296 Machines: collections.FromMachines(m1, m2), 1297 } 1298 1299 r := &KubeadmControlPlaneReconciler{ 1300 Client: env.GetClient(), 1301 recorder: record.NewFakeRecorder(32), 1302 managementCluster: &fakeManagementCluster{ 1303 Workload: fakeWorkloadCluster{ 1304 EtcdMembersResult: nodes(controlPlane.Machines), 1305 }, 1306 }, 1307 } 1308 controlPlane.InjectTestManagementCluster(r.managementCluster) 1309 1310 ret, err := r.reconcileUnhealthyMachines(ctx, 
controlPlane) 1311 1312 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1313 g.Expect(err).ToNot(HaveOccurred()) 1314 1315 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1316 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1317 g.Expect(err).ToNot(HaveOccurred()) 1318 g.Expect(remediationData.Machine).To(Equal(m2.Name)) 1319 g.Expect(remediationData.RetryCount).To(Equal(0)) 1320 1321 assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1322 1323 err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2) 1324 g.Expect(err).ToNot(HaveOccurred()) 1325 g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1326 1327 removeFinalizer(g, m2) 1328 g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) 1329 1330 // Fake scaling up, which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy. 1331 // NOTE: scale up also resets remediation in progress and remediation counts. 
1332 1333 m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1334 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1335 1336 // Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2) 1337 1338 controlPlane.Machines = collections.FromMachines(m1, m3) 1339 r.managementCluster = &fakeManagementCluster{ 1340 Workload: fakeWorkloadCluster{ 1341 EtcdMembersResult: nodes(controlPlane.Machines), 1342 }, 1343 } 1344 controlPlane.InjectTestManagementCluster(r.managementCluster) 1345 1346 ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) 1347 1348 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1349 g.Expect(err).ToNot(HaveOccurred()) 1350 1351 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1352 remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1353 g.Expect(err).ToNot(HaveOccurred()) 1354 g.Expect(remediationData.Machine).To(Equal(m3.Name)) 1355 g.Expect(remediationData.RetryCount).To(Equal(1)) 1356 1357 assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1358 1359 err = env.Get(ctx, client.ObjectKey{Namespace: m3.Namespace, Name: m3.Name}, m3) 1360 g.Expect(err).ToNot(HaveOccurred()) 1361 g.Expect(m3.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1362 1363 removeFinalizer(g, m3) 1364 g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) 1365 1366 // Fake scaling up, which creates a remediation machine, which is healthy. 1367 // NOTE: scale up also resets remediation in progress and remediation counts. 
1368 1369 m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1370 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1371 1372 g.Expect(env.Cleanup(ctx, m1, m4)).To(Succeed()) 1373 }) 1374 1375 t.Run("Remediates only one CP machine in case of multiple failures", func(t *testing.T) { 1376 g := NewWithT(t) 1377 1378 ns, err := env.CreateNamespace(ctx, "ns1") 1379 g.Expect(err).ToNot(HaveOccurred()) 1380 defer func() { 1381 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 1382 }() 1383 1384 // Control plane initialized yet, First CP healthy, second and third CP are unhealthy. second gets remediated: 1385 1386 m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember()) 1387 m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1388 m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1389 1390 controlPlane := &internal.ControlPlane{ 1391 KCP: &controlplanev1.KubeadmControlPlane{ 1392 Spec: controlplanev1.KubeadmControlPlaneSpec{ 1393 Replicas: utilptr.To[int32](3), 1394 Version: "v1.19.1", 1395 RolloutStrategy: &controlplanev1.RolloutStrategy{ 1396 RollingUpdate: &controlplanev1.RollingUpdate{ 1397 MaxSurge: &intstr.IntOrString{ 1398 IntVal: 1, 1399 }, 1400 }, 1401 }, 1402 }, 1403 Status: controlplanev1.KubeadmControlPlaneStatus{ 1404 Initialized: true, 1405 }, 1406 }, 1407 Cluster: &clusterv1.Cluster{}, 1408 Machines: collections.FromMachines(m1, m2, m3), 1409 } 1410 1411 r := &KubeadmControlPlaneReconciler{ 1412 Client: env.GetClient(), 1413 recorder: record.NewFakeRecorder(32), 1414 managementCluster: &fakeManagementCluster{ 1415 Workload: fakeWorkloadCluster{ 1416 EtcdMembersResult: nodes(controlPlane.Machines), 1417 }, 1418 }, 1419 } 1420 
controlPlane.InjectTestManagementCluster(r.managementCluster)

		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)

		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
		g.Expect(err).ToNot(HaveOccurred())

		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
		g.Expect(err).ToNot(HaveOccurred())
		g.Expect(remediationData.Machine).To(Equal(m2.Name))
		g.Expect(remediationData.RetryCount).To(Equal(0))

		assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
		assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")

		err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2)
		g.Expect(err).ToNot(HaveOccurred())
		g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())

		removeFinalizer(g, m2)
		g.Expect(env.Cleanup(ctx, m2)).To(Succeed())

		// Check next reconcile does not further remediate

		controlPlane.Machines = collections.FromMachines(m1, m3)
		r.managementCluster = &fakeManagementCluster{
			Workload: fakeWorkloadCluster{
				EtcdMembersResult: nodes(controlPlane.Machines),
			},
		}

		ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane)

		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
		g.Expect(err).ToNot(HaveOccurred())

		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
	})
}

// TestCanSafelyRemoveEtcdMember exercises canSafelyRemoveEtcdMember against
// control planes of 1, 2, 3, 5 and 7 machines.
//
// Every case creates one machine that failed its MachineHealthCheck (m1, the
// machine being remediated), followed by machines whose etcd member is
// reported unhealthy and then machines whose etcd member is reported healthy.
// The fake workload cluster reports one etcd member per machine node, except
// in the "missing" cases where the member of m1 is left out of the list.
func TestCanSafelyRemoveEtcdMember(t *testing.T) {
	g := NewWithT(t)

	ns, err := env.CreateNamespace(ctx, "ns1")
	g.Expect(err).ToNot(HaveOccurred())
	defer func() {
		g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
	}()

	// GenerateName prefix of the machine under remediation; also used to
	// recognise its node name when it has to be dropped from the member list.
	const remediatedPrefix = "m1-mhc-unhealthy-"

	// NOTE(review): replicas is carried over unchanged from the original
	// hand-written subtests. In several cases it does not match the number of
	// machines (2 machines with 3 replicas, 5 machines with 7, 7 machines
	// with 5) — confirm whether that is intentional before aligning them.
	testCases := []struct {
		name string
		// replicas is the value set on KCP.Spec.Replicas.
		replicas int32
		// unhealthyEtcd is the number of additional machines (m2, m3, ...)
		// created with an unhealthy etcd member condition.
		unhealthyEtcd int
		// healthyEtcd is the number of machines created after the unhealthy
		// ones, with a healthy etcd member condition.
		healthyEtcd int
		// remediatedMemberMissing drops m1's node from the etcd member list.
		remediatedMemberMissing bool
		// wantSafe is the expected result of canSafelyRemoveEtcdMember.
		wantSafe bool
	}{
		{
			name:     "Can't safely remediate 1 machine CP",
			replicas: 1,
			wantSafe: false,
		},
		{
			name:        "Can safely remediate 2 machine CP without additional etcd member failures",
			replicas:    3,
			healthyEtcd: 1,
			wantSafe:    true,
		},
		{
			name:                    "Can safely remediate 2 machines CP when the etcd member being remediated is missing",
			replicas:                3,
			healthyEtcd:             1,
			remediatedMemberMissing: true,
			wantSafe:                true,
		},
		{
			name:          "Can't safely remediate 2 machines CP with one additional etcd member failure",
			replicas:      3,
			unhealthyEtcd: 1,
			wantSafe:      false,
		},
		{
			name:        "Can safely remediate 3 machines CP without additional etcd member failures",
			replicas:    3,
			healthyEtcd: 2,
			wantSafe:    true,
		},
		{
			name:                    "Can safely remediate 3 machines CP when the etcd member being remediated is missing",
			replicas:                3,
			healthyEtcd:             2,
			remediatedMemberMissing: true,
			wantSafe:                true,
		},
		{
			name:          "Can't safely remediate 3 machines CP with one additional etcd member failure",
			replicas:      3,
			unhealthyEtcd: 1,
			healthyEtcd:   1,
			wantSafe:      false,
		},
		{
			name:          "Can safely remediate 5 machines CP less than 2 additional etcd member failures",
			replicas:      5,
			unhealthyEtcd: 1,
			healthyEtcd:   3,
			wantSafe:      true,
		},
		{
			name:          "Can't safely remediate 5 machines CP with 2 additional etcd member failures",
			replicas:      7,
			unhealthyEtcd: 2,
			healthyEtcd:   2,
			wantSafe:      false,
		},
		{
			name:          "Can safely remediate 7 machines CP with less than 3 additional etcd member failures",
			replicas:      7,
			unhealthyEtcd: 2,
			healthyEtcd:   4,
			wantSafe:      true,
		},
		{
			name:          "Can't safely remediate 7 machines CP with 3 additional etcd member failures",
			replicas:      5,
			unhealthyEtcd: 3,
			healthyEtcd:   3,
			wantSafe:      false,
		},
	}

	// Subtests are not parallel, so each closure runs before the loop advances.
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := NewWithT(t)

			// machines[0] is always the machine being remediated; the
			// remaining machines are numbered m2, m3, ... in creation order.
			machines := []*clusterv1.Machine{
				createMachine(ctx, g, ns.Name, remediatedPrefix, withMachineHealthCheckFailed()),
			}
			for i := 0; i < tc.unhealthyEtcd; i++ {
				prefix := fmt.Sprintf("m%d-etcd-unhealthy-", len(machines)+1)
				machines = append(machines, createMachine(ctx, g, ns.Name, prefix, withUnhealthyEtcdMember()))
			}
			for i := 0; i < tc.healthyEtcd; i++ {
				prefix := fmt.Sprintf("m%d-etcd-healthy-", len(machines)+1)
				machines = append(machines, createMachine(ctx, g, ns.Name, prefix, withHealthyEtcdMember()))
			}
			m1 := machines[0]

			controlPlane := &internal.ControlPlane{
				KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
					Replicas: utilptr.To(tc.replicas),
				}},
				Cluster:  &clusterv1.Cluster{},
				Machines: collections.FromMachines(machines...),
			}

			members := nodes(controlPlane.Machines)
			if tc.remediatedMemberMissing {
				present := make([]string, 0, len(controlPlane.Machines)-1)
				for _, n := range members {
					if !strings.Contains(n, remediatedPrefix) {
						present = append(present, n)
					}
				}
				members = present
			}

			r := &KubeadmControlPlaneReconciler{
				Client:   env.GetClient(),
				recorder: record.NewFakeRecorder(32),
				managementCluster: &fakeManagementCluster{
					Workload: fakeWorkloadCluster{
						EtcdMembersResult: members,
					},
				},
			}
			controlPlane.InjectTestManagementCluster(r.managementCluster)

			ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
			g.Expect(ret).To(Equal(tc.wantSafe))
			g.Expect(err).ToNot(HaveOccurred())

			toDelete := make([]client.Object, 0, len(machines))
			for _, m := range machines {
				toDelete = append(toDelete, m)
			}
			g.Expect(env.Cleanup(ctx, toDelete...)).To(Succeed())
		})
	}
}

// nodes returns the node names referenced by the given machines, skipping
// machines that have no NodeRef. The order follows the iteration order of the
// collection.
func nodes(machines collections.Machines) []string {
	names := make([]string, 0, machines.Len())
	for _, m := range machines {
		if m.Status.NodeRef == nil {
			continue
		}
		names = append(names, m.Status.NodeRef.Name)
	}
	return names
}

// machineOption mutates a Machine fixture in place.
type machineOption func(*clusterv1.Machine)

// withMachineHealthCheckFailed marks the MachineHealthCheckSucceeded and
// MachineOwnerRemediated conditions as false (machine has a failure and is
// waiting for remediation).
func withMachineHealthCheckFailed() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkFalse(machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
	}
}

// withStuckRemediation marks MachineHealthCheckSucceeded as true while
// MachineOwnerRemediated is still false with the waiting-for-remediation
// reason.
func withStuckRemediation() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkTrue(machine, clusterv1.MachineHealthCheckSucceededCondition)
		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
	}
}

// withHealthyEtcdMember marks the machine's etcd member condition as true.
func withHealthyEtcdMember() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition)
	}
}

// withUnhealthyEtcdMember marks the machine's etcd member condition as false
// with error severity.
func withUnhealthyEtcdMember() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "")
	}
}

// withUnhealthyAPIServerPod marks the machine's API server pod condition as
// false with error severity.
func withUnhealthyAPIServerPod() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkFalse(machine, controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.ControlPlaneComponentsUnhealthyReason, clusterv1.ConditionSeverityError, "")
	}
}

// withNodeRef sets Status.NodeRef to a Node reference with the given name.
func withNodeRef(ref string) machineOption {
	return func(machine *clusterv1.Machine) {
		machine.Status.NodeRef = &corev1.ObjectReference{
			Kind: "Node",
			Name: ref,
		}
	}
}

// withoutNodeRef clears Status.NodeRef, undoing the default node reference
// that createMachine and getDeletingMachine apply first.
func withoutNodeRef() machineOption {
	return func(machine *clusterv1.Machine) {
		machine.Status.NodeRef = nil
	}
}

// withRemediateForAnnotation sets the RemediationFor annotation to
// remediatedFor, creating the annotation map when it is nil.
func withRemediateForAnnotation(remediatedFor string) machineOption {
	return func(machine *clusterv1.Machine) {
		if machine.Annotations == nil {
			machine.Annotations = map[string]string{}
		}
		machine.Annotations[controlplanev1.RemediationForAnnotation] = remediatedFor
	}
}

// withWaitBeforeDeleteFinalizer replaces the machine's finalizers with the
// single "wait-before-delete" finalizer.
func withWaitBeforeDeleteFinalizer() machineOption {
	return func(machine *clusterv1.Machine) {
		machine.Finalizers = []string{"wait-before-delete"}
	}
}

// createMachine creates a Machine in the test environment, using name as the
// GenerateName prefix, and then patches it with a default node reference
// ("node-" followed by the machine name) plus the given options. Options are
// applied after the default node reference, so they can override it.
func createMachine(ctx context.Context, g *WithT, namespace, name string, options ...machineOption) *clusterv1.Machine {
	m := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:    namespace,
			GenerateName: name,
		},
		Spec: clusterv1.MachineSpec{
			ClusterName: "cluster",
			Bootstrap: clusterv1.Bootstrap{
				DataSecretName: utilptr.To("secret"),
			},
		},
	}
	g.Expect(env.CreateAndWait(ctx, m)).To(Succeed())

	// The helper is created before the mutations below so that Patch sends
	// them as the difference from the freshly created object.
	patchHelper, err := patch.NewHelper(m, env.GetClient())
	g.Expect(err).ToNot(HaveOccurred())

	withNodeRef(fmt.Sprintf("node-%s", m.Name))(m)
	for _, opt := range options {
		opt(m)
	}

	g.Expect(patchHelper.Patch(ctx, m)).To(Succeed())
	return m
}

// getDeletingMachine builds an in-memory Machine whose DeletionTimestamp is
// set to the current time. The object is not created in the test environment.
// A default node reference ("node-" followed by name) is applied before the
// given options.
func getDeletingMachine(namespace, name string, options ...machineOption) *clusterv1.Machine {
	deletionTime := metav1.Now()
	m := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:         namespace,
			Name:              name,
			DeletionTimestamp: &deletionTime,
		},
		Spec: clusterv1.MachineSpec{
			ClusterName: "cluster",
			Bootstrap: clusterv1.Bootstrap{
				DataSecretName: utilptr.To("secret"),
			},
		},
	}

	withNodeRef(fmt.Sprintf("node-%s", m.Name))(m)
	for _, opt := range options {
		opt(m)
	}
	return m
}

// assertMachineCondition re-reads m from the test environment until condition
// t has the expected status, reason, severity and message, failing after 10
// seconds. m is refreshed in place on every attempt.
func assertMachineCondition(ctx context.Context, g *WithT, m *clusterv1.Machine, t clusterv1.ConditionType, status corev1.ConditionStatus, reason string, severity clusterv1.ConditionSeverity, message string) {
	g.Eventually(func() error {
		if err := env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m); err != nil {
			return err
		}
		c := conditions.Get(m, t)
		switch {
		case c == nil:
			return errors.Errorf("condition %q was nil", t)
		case c.Status != status:
			return errors.Errorf("condition %q status %q did not match %q", t, c.Status, status)
		case c.Reason != reason:
			return errors.Errorf("condition %q reason %q did not match %q", t, c.Reason, reason)
		case c.Severity != severity:
			// Report the severities, not the statuses, on a severity mismatch.
			return errors.Errorf("condition %q severity %q did not match %q", t, c.Severity, severity)
		case c.Message != message:
			return errors.Errorf("condition %q message %q did not match %q", t, c.Message, message)
		}
		return nil
	}, 10*time.Second).Should(Succeed())
}

// MustMarshalRemediationData returns the marshalled form of r and panics if
// marshalling fails. The panic message includes the underlying error.
func MustMarshalRemediationData(r *RemediationData) string {
	s, err := r.Marshal()
	if err != nil {
		panic(fmt.Sprintf("failed to marshal remediation data: %v", err))
	}
	return s
}