sigs.k8s.io/cluster-api@v1.6.3/controlplane/kubeadm/internal/controllers/remediation_test.go (about) 1 /* 2 Copyright 2020 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package controllers 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 "testing" 24 "time" 25 26 . "github.com/onsi/gomega" 27 "github.com/pkg/errors" 28 corev1 "k8s.io/api/core/v1" 29 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 "k8s.io/apimachinery/pkg/util/intstr" 31 "k8s.io/client-go/tools/record" 32 utilpointer "k8s.io/utils/pointer" 33 "sigs.k8s.io/controller-runtime/pkg/client" 34 35 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 36 controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" 37 "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" 38 "sigs.k8s.io/cluster-api/util/collections" 39 "sigs.k8s.io/cluster-api/util/conditions" 40 "sigs.k8s.io/cluster-api/util/patch" 41 ) 42 43 func TestReconcileUnhealthyMachines(t *testing.T) { 44 g := NewWithT(t) 45 46 r := &KubeadmControlPlaneReconciler{ 47 Client: env.GetClient(), 48 recorder: record.NewFakeRecorder(32), 49 } 50 ns, err := env.CreateNamespace(ctx, "ns1") 51 g.Expect(err).ToNot(HaveOccurred()) 52 defer func() { 53 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 54 }() 55 56 var removeFinalizer = func(g *WithT, m *clusterv1.Machine) { 57 patchHelper, err := patch.NewHelper(m, env.GetClient()) 58 g.Expect(err).ToNot(HaveOccurred()) 59 m.ObjectMeta.Finalizers = nil 60 
g.Expect(patchHelper.Patch(ctx, m)).To(Succeed()) 61 } 62 63 t.Run("It cleans up stuck remediation on previously unhealthy machines", func(t *testing.T) { 64 g := NewWithT(t) 65 66 m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation()) 67 68 controlPlane := &internal.ControlPlane{ 69 KCP: &controlplanev1.KubeadmControlPlane{}, 70 Cluster: &clusterv1.Cluster{}, 71 Machines: collections.FromMachines(m), 72 } 73 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 74 75 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 76 g.Expect(err).ToNot(HaveOccurred()) 77 78 g.Eventually(func() error { 79 if err := env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m); err != nil { 80 return err 81 } 82 c := conditions.Get(m, clusterv1.MachineOwnerRemediatedCondition) 83 if c == nil { 84 return nil 85 } 86 return errors.Errorf("condition %s still exists", clusterv1.MachineOwnerRemediatedCondition) 87 }, 10*time.Second).Should(Succeed()) 88 }) 89 90 // Generic preflight checks 91 // Those are preflight checks that happen no matter if the control plane has been already initialized or not.
92 93 t.Run("Remediation does not happen if there are no unhealthy machines", func(t *testing.T) { 94 g := NewWithT(t) 95 96 controlPlane := &internal.ControlPlane{ 97 KCP: &controlplanev1.KubeadmControlPlane{}, 98 Cluster: &clusterv1.Cluster{}, 99 Machines: collections.New(), 100 } 101 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 102 103 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 104 g.Expect(err).ToNot(HaveOccurred()) 105 }) 106 t.Run("reconcileUnhealthyMachines return early if another remediation is in progress", func(t *testing.T) { 107 g := NewWithT(t) 108 109 m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation()) 110 conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "") 111 conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") 112 controlPlane := &internal.ControlPlane{ 113 KCP: &controlplanev1.KubeadmControlPlane{ 114 ObjectMeta: metav1.ObjectMeta{ 115 Annotations: map[string]string{ 116 controlplanev1.RemediationInProgressAnnotation: MustMarshalRemediationData(&RemediationData{ 117 Machine: "foo", 118 Timestamp: metav1.Time{Time: time.Now().UTC()}, 119 RetryCount: 0, 120 }), 121 }, 122 }, 123 }, 124 Cluster: &clusterv1.Cluster{}, 125 Machines: collections.FromMachines(m), 126 } 127 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 128 129 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 130 g.Expect(err).ToNot(HaveOccurred()) 131 }) 132 t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is already being deleted", func(t *testing.T) { 133 g := NewWithT(t) 134 135 m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed()) 136 conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, 
clusterv1.ConditionSeverityWarning, "") 137 conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") 138 controlPlane := &internal.ControlPlane{ 139 KCP: &controlplanev1.KubeadmControlPlane{}, 140 Cluster: &clusterv1.Cluster{}, 141 Machines: collections.FromMachines(m), 142 } 143 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 144 145 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 146 147 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 148 g.Expect(err).ToNot(HaveOccurred()) 149 }) 150 t.Run("Remediation does not happen if MaxRetry is reached", func(t *testing.T) { 151 g := NewWithT(t) 152 153 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ 154 Machine: "m0", 155 Timestamp: metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthy not expired yet. 
156 RetryCount: 3, 157 }))) 158 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 159 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 160 161 controlPlane := &internal.ControlPlane{ 162 KCP: &controlplanev1.KubeadmControlPlane{ 163 Spec: controlplanev1.KubeadmControlPlaneSpec{ 164 Replicas: utilpointer.Int32(3), 165 Version: "v1.19.1", 166 RemediationStrategy: &controlplanev1.RemediationStrategy{ 167 MaxRetry: utilpointer.Int32(3), 168 }, 169 }, 170 }, 171 Cluster: &clusterv1.Cluster{}, 172 Machines: collections.FromMachines(m1, m2, m3), 173 } 174 175 r := &KubeadmControlPlaneReconciler{ 176 Client: env.GetClient(), 177 recorder: record.NewFakeRecorder(32), 178 managementCluster: &fakeManagementCluster{ 179 Workload: fakeWorkloadCluster{ 180 EtcdMembersResult: nodes(controlPlane.Machines), 181 }, 182 }, 183 } 184 185 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 186 187 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 188 g.Expect(err).ToNot(HaveOccurred()) 189 190 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 191 192 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed 3 times (MaxRetry)") 193 194 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 195 g.Expect(err).ToNot(HaveOccurred()) 196 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue()) 197 198 removeFinalizer(g, m1) 199 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 200 }) 201 t.Run("Retry history is ignored if min healthy period is expired, default min healthy period", func(t *testing.T) { 202 g := NewWithT(t) 203 204 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), 
withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ 205 Machine: "m0", 206 Timestamp: metav1.Time{Time: time.Now().Add(-2 * controlplanev1.DefaultMinHealthyPeriod).UTC()}, // minHealthyPeriod already expired. 207 RetryCount: 3, 208 }))) 209 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 210 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 211 212 controlPlane := &internal.ControlPlane{ 213 KCP: &controlplanev1.KubeadmControlPlane{ 214 Spec: controlplanev1.KubeadmControlPlaneSpec{ 215 Replicas: utilpointer.Int32(3), 216 Version: "v1.19.1", 217 RemediationStrategy: &controlplanev1.RemediationStrategy{ 218 MaxRetry: utilpointer.Int32(3), 219 }, 220 }, 221 }, 222 Cluster: &clusterv1.Cluster{}, 223 Machines: collections.FromMachines(m1, m2, m3), 224 } 225 226 r := &KubeadmControlPlaneReconciler{ 227 Client: env.GetClient(), 228 recorder: record.NewFakeRecorder(32), 229 managementCluster: &fakeManagementCluster{ 230 Workload: fakeWorkloadCluster{ 231 EtcdMembersResult: nodes(controlPlane.Machines), 232 }, 233 }, 234 } 235 236 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 237 238 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 239 g.Expect(err).ToNot(HaveOccurred()) 240 241 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 242 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 243 g.Expect(err).ToNot(HaveOccurred()) 244 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 245 g.Expect(remediationData.RetryCount).To(Equal(0)) 246 247 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 248 249 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 250 
g.Expect(err).ToNot(HaveOccurred()) 251 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 252 253 removeFinalizer(g, m1) 254 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 255 }) 256 t.Run("Retry history is ignored if min healthy period is expired", func(t *testing.T) { 257 g := NewWithT(t) 258 259 minHealthyPeriod := 4 * controlplanev1.DefaultMinHealthyPeriod // big min healthy period, so we are sure that we are not using DefaultMinHealthyPeriod. 260 261 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ 262 Machine: "m0", 263 Timestamp: metav1.Time{Time: time.Now().Add(-2 * minHealthyPeriod).UTC()}, // minHealthyPeriod already expired. 264 RetryCount: 3, 265 }))) 266 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 267 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 268 269 controlPlane := &internal.ControlPlane{ 270 KCP: &controlplanev1.KubeadmControlPlane{ 271 Spec: controlplanev1.KubeadmControlPlaneSpec{ 272 Replicas: utilpointer.Int32(3), 273 Version: "v1.19.1", 274 RemediationStrategy: &controlplanev1.RemediationStrategy{ 275 MaxRetry: utilpointer.Int32(3), 276 MinHealthyPeriod: &metav1.Duration{Duration: minHealthyPeriod}, 277 }, 278 }, 279 }, 280 Cluster: &clusterv1.Cluster{}, 281 Machines: collections.FromMachines(m1, m2, m3), 282 } 283 284 r := &KubeadmControlPlaneReconciler{ 285 Client: env.GetClient(), 286 recorder: record.NewFakeRecorder(32), 287 managementCluster: &fakeManagementCluster{ 288 Workload: fakeWorkloadCluster{ 289 EtcdMembersResult: nodes(controlPlane.Machines), 290 }, 291 }, 292 } 293 294 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 295 296 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 297 g.Expect(err).ToNot(HaveOccurred()) 298 299
g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 300 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 301 g.Expect(err).ToNot(HaveOccurred()) 302 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 303 g.Expect(remediationData.RetryCount).To(Equal(0)) 304 305 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 306 307 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 308 g.Expect(err).ToNot(HaveOccurred()) 309 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 310 311 removeFinalizer(g, m1) 312 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 313 }) 314 t.Run("Remediation does not happen if RetryPeriod is not yet passed", func(t *testing.T) { 315 g := NewWithT(t) 316 317 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ 318 Machine: "m0", 319 Timestamp: metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthyPeriod not yet expired. 320 RetryCount: 2, 321 }))) 322 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 323 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 324 325 controlPlane := &internal.ControlPlane{ 326 KCP: &controlplanev1.KubeadmControlPlane{ 327 Spec: controlplanev1.KubeadmControlPlaneSpec{ 328 Replicas: utilpointer.Int32(3), 329 Version: "v1.19.1", 330 RemediationStrategy: &controlplanev1.RemediationStrategy{ 331 MaxRetry: utilpointer.Int32(3), 332 RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryPeriod not yet expired. 
333 }, 334 }, 335 }, 336 Cluster: &clusterv1.Cluster{}, 337 Machines: collections.FromMachines(m1, m2, m3), 338 } 339 340 r := &KubeadmControlPlaneReconciler{ 341 Client: env.GetClient(), 342 recorder: record.NewFakeRecorder(32), 343 managementCluster: &fakeManagementCluster{ 344 Workload: fakeWorkloadCluster{ 345 EtcdMembersResult: nodes(controlPlane.Machines), 346 }, 347 }, 348 } 349 350 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 351 352 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 353 g.Expect(err).ToNot(HaveOccurred()) 354 355 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 356 357 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryPeriod)") 358 359 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 360 g.Expect(err).ToNot(HaveOccurred()) 361 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue()) 362 363 removeFinalizer(g, m1) 364 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 365 }) 366 367 // There are no preflight checks for when control plane is not yet initialized 368 // (it is the first CP, we can nuke it). 369 370 // Preflight checks for when control plane is already initialized. 
371 372 t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) { 373 g := NewWithT(t) 374 375 m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) 376 controlPlane := &internal.ControlPlane{ 377 KCP: &controlplanev1.KubeadmControlPlane{ 378 Spec: controlplanev1.KubeadmControlPlaneSpec{ 379 Replicas: utilpointer.Int32(1), 380 RolloutStrategy: &controlplanev1.RolloutStrategy{ 381 RollingUpdate: &controlplanev1.RollingUpdate{ 382 MaxSurge: &intstr.IntOrString{ 383 IntVal: 1, 384 }, 385 }, 386 }, 387 }, 388 Status: controlplanev1.KubeadmControlPlaneStatus{ 389 Initialized: true, 390 }, 391 }, 392 Cluster: &clusterv1.Cluster{}, 393 Machines: collections.FromMachines(m), 394 } 395 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 396 397 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 398 g.Expect(err).ToNot(HaveOccurred()) 399 400 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 401 402 assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") 403 404 g.Expect(env.Cleanup(ctx, m)).To(Succeed()) 405 }) 406 t.Run("Remediation does not happen if there is another machine being deleted (not the one to be remediated)", func(t *testing.T) { 407 g := NewWithT(t) 408 409 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) 410 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") 411 m3 := getDeletingMachine(ns.Name, "m3-deleting") // NB. 
This machine is not created, it gets only added to control plane 412 controlPlane := &internal.ControlPlane{ 413 KCP: &controlplanev1.KubeadmControlPlane{ 414 Spec: controlplanev1.KubeadmControlPlaneSpec{ 415 Replicas: utilpointer.Int32(3), 416 }, 417 Status: controlplanev1.KubeadmControlPlaneStatus{ 418 Initialized: true, 419 }, 420 }, 421 Cluster: &clusterv1.Cluster{}, 422 Machines: collections.FromMachines(m1, m2, m3), 423 } 424 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 425 426 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 427 g.Expect(err).ToNot(HaveOccurred()) 428 429 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 430 431 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") 432 433 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 434 }) 435 t.Run("Remediation does not happen if there is at least one additional unhealthy etcd member on a 3 machine CP", func(t *testing.T) { 436 g := NewWithT(t) 437 438 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 439 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 440 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) 441 442 controlPlane := &internal.ControlPlane{ 443 KCP: &controlplanev1.KubeadmControlPlane{ 444 Spec: controlplanev1.KubeadmControlPlaneSpec{ 445 Replicas: utilpointer.Int32(3), 446 }, 447 Status: controlplanev1.KubeadmControlPlaneStatus{ 448 Initialized: true, 449 }, 450 }, 451 Cluster: &clusterv1.Cluster{}, 452 Machines: collections.FromMachines(m1, m2, m3), 453 } 454 455 r := &KubeadmControlPlaneReconciler{ 456 Client: env.GetClient(), 457 recorder: record.NewFakeRecorder(32), 458 managementCluster: 
&fakeManagementCluster{ 459 Workload: fakeWorkloadCluster{ 460 EtcdMembersResult: nodes(controlPlane.Machines), 461 }, 462 }, 463 } 464 controlPlane.InjectTestManagementCluster(r.managementCluster) 465 466 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 467 468 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 469 g.Expect(err).ToNot(HaveOccurred()) 470 471 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 472 473 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") 474 475 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 476 }) 477 t.Run("Remediation does not happen if there are at least two additional unhealthy etcd member on a 5 machine CP", func(t *testing.T) { 478 g := NewWithT(t) 479 480 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 481 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 482 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) 483 m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) 484 m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) 485 486 controlPlane := &internal.ControlPlane{ 487 KCP: &controlplanev1.KubeadmControlPlane{ 488 Spec: controlplanev1.KubeadmControlPlaneSpec{ 489 Replicas: utilpointer.Int32(5), 490 }, 491 Status: controlplanev1.KubeadmControlPlaneStatus{ 492 Initialized: true, 493 }, 494 }, 495 Cluster: &clusterv1.Cluster{}, 496 Machines: collections.FromMachines(m1, m2, m3, m4, m5), 497 } 498 499 r := &KubeadmControlPlaneReconciler{ 500 Client: env.GetClient(), 501 recorder: record.NewFakeRecorder(32), 502 managementCluster: &fakeManagementCluster{ 503 Workload: 
fakeWorkloadCluster{ 504 EtcdMembersResult: nodes(controlPlane.Machines), 505 }, 506 }, 507 } 508 controlPlane.InjectTestManagementCluster(r.managementCluster) 509 510 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 511 512 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 513 g.Expect(err).ToNot(HaveOccurred()) 514 515 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 516 517 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") 518 519 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) 520 }) 521 522 // Remediation for when control plane is not yet initialized 523 524 t.Run("Remediation deletes unhealthy machine - 1 CP not initialized", func(t *testing.T) { 525 g := NewWithT(t) 526 527 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 528 529 controlPlane := &internal.ControlPlane{ 530 KCP: &controlplanev1.KubeadmControlPlane{ 531 Spec: controlplanev1.KubeadmControlPlaneSpec{ 532 Replicas: utilpointer.Int32(1), 533 Version: "v1.19.1", 534 }, 535 Status: controlplanev1.KubeadmControlPlaneStatus{ 536 Initialized: false, 537 }, 538 }, 539 Cluster: &clusterv1.Cluster{}, 540 Machines: collections.FromMachines(m1), 541 } 542 543 r := &KubeadmControlPlaneReconciler{ 544 Client: env.GetClient(), 545 recorder: record.NewFakeRecorder(32), 546 managementCluster: &fakeManagementCluster{ 547 Workload: fakeWorkloadCluster{ 548 EtcdMembersResult: nodes(controlPlane.Machines), 549 }, 550 }, 551 } 552 553 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 554 555 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 556 g.Expect(err).ToNot(HaveOccurred()) 557 558 
g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 559 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 560 g.Expect(err).ToNot(HaveOccurred()) 561 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 562 g.Expect(remediationData.RetryCount).To(Equal(0)) 563 564 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 565 566 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 567 g.Expect(err).ToNot(HaveOccurred()) 568 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 569 570 removeFinalizer(g, m1) 571 g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) 572 }) 573 t.Run("Subsequent remediation of the same machine increase retry count - 1 CP not initialized", func(t *testing.T) { 574 g := NewWithT(t) 575 576 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 577 578 controlPlane := &internal.ControlPlane{ 579 KCP: &controlplanev1.KubeadmControlPlane{ 580 Spec: controlplanev1.KubeadmControlPlaneSpec{ 581 Replicas: utilpointer.Int32(1), 582 Version: "v1.19.1", 583 }, 584 Status: controlplanev1.KubeadmControlPlaneStatus{ 585 Initialized: false, 586 }, 587 }, 588 Cluster: &clusterv1.Cluster{}, 589 Machines: collections.FromMachines(m1), 590 } 591 592 // First reconcile, remediate machine m1 for the first time 593 r := &KubeadmControlPlaneReconciler{ 594 Client: env.GetClient(), 595 recorder: record.NewFakeRecorder(32), 596 managementCluster: &fakeManagementCluster{ 597 Workload: fakeWorkloadCluster{ 598 EtcdMembersResult: nodes(controlPlane.Machines), 599 }, 600 }, 601 } 602 603 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 604 605 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 
606 g.Expect(err).ToNot(HaveOccurred()) 607 608 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 609 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 610 g.Expect(err).ToNot(HaveOccurred()) 611 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 612 g.Expect(remediationData.RetryCount).To(Equal(0)) 613 614 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 615 616 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 617 g.Expect(err).ToNot(HaveOccurred()) 618 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 619 620 removeFinalizer(g, m1) 621 g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed()) 622 623 for i := 2; i < 4; i++ { 624 // Simulate the creation of a replacement for 0. 625 mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 626 627 // Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine. 628 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 629 630 controlPlane.Machines = collections.FromMachines(mi) 631 632 // Reconcile unhealthy replacements for m1. 
633 r.managementCluster = &fakeManagementCluster{ 634 Workload: fakeWorkloadCluster{ 635 EtcdMembersResult: nodes(collections.FromMachines(mi)), 636 }, 637 } 638 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 639 640 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 641 g.Expect(err).ToNot(HaveOccurred()) 642 643 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 644 remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 645 g.Expect(err).ToNot(HaveOccurred()) 646 g.Expect(remediationData.Machine).To(Equal(mi.Name)) 647 g.Expect(remediationData.RetryCount).To(Equal(i - 1)) 648 649 assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 650 651 err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi) 652 g.Expect(err).ToNot(HaveOccurred()) 653 g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 654 655 removeFinalizer(g, mi) 656 g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed()) 657 } 658 }) 659 660 // Remediation for when control plane is already initialized 661 662 t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) { 663 g := NewWithT(t) 664 665 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 666 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 667 668 controlPlane := &internal.ControlPlane{ 669 KCP: &controlplanev1.KubeadmControlPlane{ 670 Spec: controlplanev1.KubeadmControlPlaneSpec{ 671 Replicas: utilpointer.Int32(2), 672 Version: "v1.19.1", 673 }, 674 Status: controlplanev1.KubeadmControlPlaneStatus{ 675 Initialized: true, 676 }, 677 }, 678 Cluster: &clusterv1.Cluster{}, 679 Machines: 
collections.FromMachines(m1, m2), 680 } 681 682 r := &KubeadmControlPlaneReconciler{ 683 Client: env.GetClient(), 684 recorder: record.NewFakeRecorder(32), 685 managementCluster: &fakeManagementCluster{ 686 Workload: fakeWorkloadCluster{ 687 EtcdMembersResult: nodes(controlPlane.Machines), 688 }, 689 }, 690 } 691 controlPlane.InjectTestManagementCluster(r.managementCluster) 692 693 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 694 695 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 696 g.Expect(err).ToNot(HaveOccurred()) 697 698 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 699 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 700 g.Expect(err).ToNot(HaveOccurred()) 701 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 702 g.Expect(remediationData.RetryCount).To(Equal(0)) 703 704 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 705 706 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 707 g.Expect(err).ToNot(HaveOccurred()) 708 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 709 710 removeFinalizer(g, m1) 711 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 712 }) 713 t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) { 714 g := NewWithT(t) 715 716 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 717 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 718 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 719 720 controlPlane := &internal.ControlPlane{ 721 KCP: &controlplanev1.KubeadmControlPlane{ 722 Spec: controlplanev1.KubeadmControlPlaneSpec{ 723 Replicas: 
utilpointer.Int32(3), 724 Version: "v1.19.1", 725 }, 726 Status: controlplanev1.KubeadmControlPlaneStatus{ 727 Initialized: true, 728 }, 729 }, 730 Cluster: &clusterv1.Cluster{}, 731 Machines: collections.FromMachines(m1, m2, m3), 732 } 733 734 r := &KubeadmControlPlaneReconciler{ 735 Client: env.GetClient(), 736 recorder: record.NewFakeRecorder(32), 737 managementCluster: &fakeManagementCluster{ 738 Workload: fakeWorkloadCluster{ 739 EtcdMembersResult: nodes(controlPlane.Machines), 740 }, 741 }, 742 } 743 controlPlane.InjectTestManagementCluster(r.managementCluster) 744 745 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 746 747 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 748 g.Expect(err).ToNot(HaveOccurred()) 749 750 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 751 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 752 g.Expect(err).ToNot(HaveOccurred()) 753 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 754 g.Expect(remediationData.RetryCount).To(Equal(0)) 755 756 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 757 758 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 759 g.Expect(err).ToNot(HaveOccurred()) 760 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 761 762 removeFinalizer(g, m1) 763 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 764 }) 765 t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { 766 g := NewWithT(t) 767 768 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 769 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 770 m3 := 
createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 771 m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember()) 772 773 controlPlane := &internal.ControlPlane{ 774 KCP: &controlplanev1.KubeadmControlPlane{ 775 Spec: controlplanev1.KubeadmControlPlaneSpec{ 776 Replicas: utilpointer.Int32(4), 777 Version: "v1.19.1", 778 }, 779 Status: controlplanev1.KubeadmControlPlaneStatus{ 780 Initialized: true, 781 }, 782 }, 783 Cluster: &clusterv1.Cluster{}, 784 Machines: collections.FromMachines(m1, m2, m3, m4), 785 } 786 787 r := &KubeadmControlPlaneReconciler{ 788 Client: env.GetClient(), 789 recorder: record.NewFakeRecorder(32), 790 managementCluster: &fakeManagementCluster{ 791 Workload: fakeWorkloadCluster{ 792 EtcdMembersResult: nodes(controlPlane.Machines), 793 }, 794 }, 795 } 796 controlPlane.InjectTestManagementCluster(r.managementCluster) 797 798 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 799 800 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 801 g.Expect(err).ToNot(HaveOccurred()) 802 803 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 804 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 805 g.Expect(err).ToNot(HaveOccurred()) 806 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 807 g.Expect(remediationData.RetryCount).To(Equal(0)) 808 809 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 810 811 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 812 g.Expect(err).ToNot(HaveOccurred()) 813 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 814 815 removeFinalizer(g, m1) 816 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) 817 }) 818 t.Run("Remediation fails gracefully if no 
healthy Control Planes are available to become etcd leader", func(t *testing.T) { 819 g := NewWithT(t) 820 821 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 822 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) 823 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) 824 m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) 825 826 controlPlane := &internal.ControlPlane{ 827 KCP: &controlplanev1.KubeadmControlPlane{ 828 Spec: controlplanev1.KubeadmControlPlaneSpec{ 829 Replicas: utilpointer.Int32(4), 830 Version: "v1.19.1", 831 }, 832 Status: controlplanev1.KubeadmControlPlaneStatus{ 833 Initialized: true, 834 }, 835 }, 836 Cluster: &clusterv1.Cluster{}, 837 Machines: collections.FromMachines(m1, m2, m3, m4), 838 } 839 840 r := &KubeadmControlPlaneReconciler{ 841 Client: env.GetClient(), 842 recorder: record.NewFakeRecorder(32), 843 managementCluster: &fakeManagementCluster{ 844 Workload: fakeWorkloadCluster{ 845 EtcdMembersResult: nodes(controlPlane.Machines), 846 }, 847 }, 848 } 849 controlPlane.InjectTestManagementCluster(r.managementCluster) 850 851 _, err = r.reconcileUnhealthyMachines(ctx, controlPlane) 852 g.Expect(err).ToNot(HaveOccurred()) 853 854 g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 855 856 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, 857 "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. 
Skipping remediation") 858 859 removeFinalizer(g, m1) 860 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) 861 }) 862 t.Run("Subsequent remediation of the same machine increase retry count - 3 CP", func(t *testing.T) { 863 g := NewWithT(t) 864 865 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 866 m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) 867 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) 868 869 controlPlane := &internal.ControlPlane{ 870 KCP: &controlplanev1.KubeadmControlPlane{ 871 Spec: controlplanev1.KubeadmControlPlaneSpec{ 872 Replicas: utilpointer.Int32(1), 873 Version: "v1.19.1", 874 }, 875 Status: controlplanev1.KubeadmControlPlaneStatus{ 876 Initialized: false, 877 }, 878 }, 879 Cluster: &clusterv1.Cluster{}, 880 Machines: collections.FromMachines(m1, m2, m3), 881 } 882 883 // First reconcile, remediate machine m1 for the first time 884 r := &KubeadmControlPlaneReconciler{ 885 Client: env.GetClient(), 886 recorder: record.NewFakeRecorder(32), 887 managementCluster: &fakeManagementCluster{ 888 Workload: fakeWorkloadCluster{ 889 EtcdMembersResult: nodes(controlPlane.Machines), 890 }, 891 }, 892 } 893 894 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 895 896 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 897 g.Expect(err).ToNot(HaveOccurred()) 898 899 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 900 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 901 g.Expect(err).ToNot(HaveOccurred()) 902 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 903 g.Expect(remediationData.RetryCount).To(Equal(0)) 904 905 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, 
clusterv1.ConditionSeverityWarning, "") 906 907 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 908 g.Expect(err).ToNot(HaveOccurred()) 909 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 910 911 removeFinalizer(g, m1) 912 g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed()) 913 914 for i := 5; i < 6; i++ { 915 // Simulate the creation of a replacement for m1. 916 mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 917 918 // Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine. 919 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 920 controlPlane.Machines = collections.FromMachines(mi, m2, m3) 921 922 // Reconcile unhealthy replacements for m1. 923 r.managementCluster = &fakeManagementCluster{ 924 Workload: fakeWorkloadCluster{ 925 EtcdMembersResult: nodes(collections.FromMachines(mi, m2, m3)), 926 }, 927 } 928 929 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 930 931 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 932 g.Expect(err).ToNot(HaveOccurred()) 933 934 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 935 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 936 g.Expect(err).ToNot(HaveOccurred()) 937 g.Expect(remediationData.Machine).To(Equal(mi.Name)) 938 g.Expect(remediationData.RetryCount).To(Equal(i - 4)) 939 940 assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 941 942 err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi) 943 
g.Expect(err).ToNot(HaveOccurred()) 944 g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 945 946 removeFinalizer(g, mi) 947 g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed()) 948 } 949 950 g.Expect(env.CleanupAndWait(ctx, m2, m3)).To(Succeed()) 951 }) 952 } 953 954 func TestReconcileUnhealthyMachinesSequences(t *testing.T) { 955 var removeFinalizer = func(g *WithT, m *clusterv1.Machine) { 956 patchHelper, err := patch.NewHelper(m, env.GetClient()) 957 g.Expect(err).ToNot(HaveOccurred()) 958 m.ObjectMeta.Finalizers = nil 959 g.Expect(patchHelper.Patch(ctx, m)).To(Succeed()) 960 } 961 962 t.Run("Remediates the first CP machine having problems to come up", func(t *testing.T) { 963 g := NewWithT(t) 964 965 ns, err := env.CreateNamespace(ctx, "ns1") 966 g.Expect(err).ToNot(HaveOccurred()) 967 defer func() { 968 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 969 }() 970 971 // Control plane not initialized yet, First CP is unhealthy and gets remediated: 972 973 m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 974 975 controlPlane := &internal.ControlPlane{ 976 KCP: &controlplanev1.KubeadmControlPlane{ 977 Spec: controlplanev1.KubeadmControlPlaneSpec{ 978 Replicas: utilpointer.Int32(3), 979 Version: "v1.19.1", 980 }, 981 Status: controlplanev1.KubeadmControlPlaneStatus{ 982 Initialized: false, 983 }, 984 }, 985 Cluster: &clusterv1.Cluster{}, 986 Machines: collections.FromMachines(m1), 987 } 988 989 r := &KubeadmControlPlaneReconciler{ 990 Client: env.GetClient(), 991 recorder: record.NewFakeRecorder(32), 992 managementCluster: &fakeManagementCluster{ 993 Workload: fakeWorkloadCluster{ 994 EtcdMembersResult: nodes(controlPlane.Machines), 995 }, 996 }, 997 } 998 999 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 1000 1001 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1002 g.Expect(err).ToNot(HaveOccurred()) 1003 1004 
g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1005 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1006 g.Expect(err).ToNot(HaveOccurred()) 1007 g.Expect(remediationData.Machine).To(Equal(m1.Name)) 1008 g.Expect(remediationData.RetryCount).To(Equal(0)) 1009 1010 assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1011 1012 err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) 1013 g.Expect(err).ToNot(HaveOccurred()) 1014 g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1015 1016 removeFinalizer(g, m1) 1017 g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) 1018 1019 // Fake scaling up, which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy. 1020 // NOTE: scale up also resets remediation in progress and remediation counts. 
1021 1022 m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1023 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1024 1025 // Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2) 1026 1027 controlPlane.Machines = collections.FromMachines(m2) 1028 r.managementCluster = &fakeManagementCluster{ 1029 Workload: fakeWorkloadCluster{ 1030 EtcdMembersResult: nodes(controlPlane.Machines), 1031 }, 1032 } 1033 1034 ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) 1035 1036 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1037 g.Expect(err).ToNot(HaveOccurred()) 1038 1039 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1040 remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1041 g.Expect(err).ToNot(HaveOccurred()) 1042 g.Expect(remediationData.Machine).To(Equal(m2.Name)) 1043 g.Expect(remediationData.RetryCount).To(Equal(1)) 1044 1045 assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1046 1047 err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m1) 1048 g.Expect(err).ToNot(HaveOccurred()) 1049 g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1050 1051 removeFinalizer(g, m2) 1052 g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) 1053 1054 // Fake scaling up, which creates a remediation machine, which is healthy. 1055 // NOTE: scale up also resets remediation in progress and remediation counts. 
1056 1057 m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1058 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1059 1060 g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) 1061 }) 1062 1063 t.Run("Remediates the second CP machine having problems to come up", func(t *testing.T) { 1064 g := NewWithT(t) 1065 1066 ns, err := env.CreateNamespace(ctx, "ns1") 1067 g.Expect(err).ToNot(HaveOccurred()) 1068 defer func() { 1069 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 1070 }() 1071 1072 // Control plane initialized yet, First CP healthy, second CP is unhealthy and gets remediated: 1073 1074 m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember()) 1075 m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1076 1077 controlPlane := &internal.ControlPlane{ 1078 KCP: &controlplanev1.KubeadmControlPlane{ 1079 Spec: controlplanev1.KubeadmControlPlaneSpec{ 1080 Replicas: utilpointer.Int32(3), 1081 Version: "v1.19.1", 1082 RolloutStrategy: &controlplanev1.RolloutStrategy{ 1083 RollingUpdate: &controlplanev1.RollingUpdate{ 1084 MaxSurge: &intstr.IntOrString{ 1085 IntVal: 1, 1086 }, 1087 }, 1088 }, 1089 }, 1090 Status: controlplanev1.KubeadmControlPlaneStatus{ 1091 Initialized: true, 1092 }, 1093 }, 1094 Cluster: &clusterv1.Cluster{}, 1095 Machines: collections.FromMachines(m1, m2), 1096 } 1097 1098 r := &KubeadmControlPlaneReconciler{ 1099 Client: env.GetClient(), 1100 recorder: record.NewFakeRecorder(32), 1101 managementCluster: &fakeManagementCluster{ 1102 Workload: fakeWorkloadCluster{ 1103 EtcdMembersResult: nodes(controlPlane.Machines), 1104 }, 1105 }, 1106 } 1107 controlPlane.InjectTestManagementCluster(r.managementCluster) 1108 1109 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 1110 1111 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation 
completed, requeue 1112 g.Expect(err).ToNot(HaveOccurred()) 1113 1114 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1115 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1116 g.Expect(err).ToNot(HaveOccurred()) 1117 g.Expect(remediationData.Machine).To(Equal(m2.Name)) 1118 g.Expect(remediationData.RetryCount).To(Equal(0)) 1119 1120 assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1121 1122 err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2) 1123 g.Expect(err).ToNot(HaveOccurred()) 1124 g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1125 1126 removeFinalizer(g, m2) 1127 g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) 1128 1129 // Fake scaling up, which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy. 1130 // NOTE: scale up also resets remediation in progress and remediation counts. 
1131 1132 m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1133 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1134 1135 // Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2) 1136 1137 controlPlane.Machines = collections.FromMachines(m1, m3) 1138 r.managementCluster = &fakeManagementCluster{ 1139 Workload: fakeWorkloadCluster{ 1140 EtcdMembersResult: nodes(controlPlane.Machines), 1141 }, 1142 } 1143 controlPlane.InjectTestManagementCluster(r.managementCluster) 1144 1145 ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) 1146 1147 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1148 g.Expect(err).ToNot(HaveOccurred()) 1149 1150 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1151 remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1152 g.Expect(err).ToNot(HaveOccurred()) 1153 g.Expect(remediationData.Machine).To(Equal(m3.Name)) 1154 g.Expect(remediationData.RetryCount).To(Equal(1)) 1155 1156 assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1157 1158 err = env.Get(ctx, client.ObjectKey{Namespace: m3.Namespace, Name: m3.Name}, m3) 1159 g.Expect(err).ToNot(HaveOccurred()) 1160 g.Expect(m3.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1161 1162 removeFinalizer(g, m3) 1163 g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) 1164 1165 // Fake scaling up, which creates a remediation machine, which is healthy. 1166 // NOTE: scale up also resets remediation in progress and remediation counts. 
1167 1168 m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) 1169 delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) 1170 1171 g.Expect(env.Cleanup(ctx, m1, m4)).To(Succeed()) 1172 }) 1173 1174 t.Run("Remediates only one CP machine in case of multiple failures", func(t *testing.T) { 1175 g := NewWithT(t) 1176 1177 ns, err := env.CreateNamespace(ctx, "ns1") 1178 g.Expect(err).ToNot(HaveOccurred()) 1179 defer func() { 1180 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 1181 }() 1182 1183 // Control plane initialized yet, First CP healthy, second and third CP are unhealthy. second gets remediated: 1184 1185 m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember()) 1186 m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1187 m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) 1188 1189 controlPlane := &internal.ControlPlane{ 1190 KCP: &controlplanev1.KubeadmControlPlane{ 1191 Spec: controlplanev1.KubeadmControlPlaneSpec{ 1192 Replicas: utilpointer.Int32(3), 1193 Version: "v1.19.1", 1194 RolloutStrategy: &controlplanev1.RolloutStrategy{ 1195 RollingUpdate: &controlplanev1.RollingUpdate{ 1196 MaxSurge: &intstr.IntOrString{ 1197 IntVal: 1, 1198 }, 1199 }, 1200 }, 1201 }, 1202 Status: controlplanev1.KubeadmControlPlaneStatus{ 1203 Initialized: true, 1204 }, 1205 }, 1206 Cluster: &clusterv1.Cluster{}, 1207 Machines: collections.FromMachines(m1, m2, m3), 1208 } 1209 1210 r := &KubeadmControlPlaneReconciler{ 1211 Client: env.GetClient(), 1212 recorder: record.NewFakeRecorder(32), 1213 managementCluster: &fakeManagementCluster{ 1214 Workload: fakeWorkloadCluster{ 1215 EtcdMembersResult: nodes(controlPlane.Machines), 1216 }, 1217 }, 1218 } 1219 
controlPlane.InjectTestManagementCluster(r.managementCluster) 1220 1221 ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) 1222 1223 g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue 1224 g.Expect(err).ToNot(HaveOccurred()) 1225 1226 g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) 1227 remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) 1228 g.Expect(err).ToNot(HaveOccurred()) 1229 g.Expect(remediationData.Machine).To(Equal(m2.Name)) 1230 g.Expect(remediationData.RetryCount).To(Equal(0)) 1231 1232 assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 1233 assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") 1234 1235 err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2) 1236 g.Expect(err).ToNot(HaveOccurred()) 1237 g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) 1238 1239 removeFinalizer(g, m2) 1240 g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) 1241 1242 // Check next reconcile does not further remediate 1243 1244 controlPlane.Machines = collections.FromMachines(m1, m3) 1245 r.managementCluster = &fakeManagementCluster{ 1246 Workload: fakeWorkloadCluster{ 1247 EtcdMembersResult: nodes(controlPlane.Machines), 1248 }, 1249 } 1250 1251 ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) 1252 1253 g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped 1254 g.Expect(err).ToNot(HaveOccurred()) 1255 1256 g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) 1257 }) 1258 } 1259 1260 func TestCanSafelyRemoveEtcdMember(t *testing.T) { 1261 g := NewWithT(t) 1262 1263 ns, err := env.CreateNamespace(ctx, "ns1") 1264 
g.Expect(err).ToNot(HaveOccurred()) 1265 defer func() { 1266 g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) 1267 }() 1268 1269 t.Run("Can't safely remediate 1 machine CP", func(t *testing.T) { 1270 g := NewWithT(t) 1271 1272 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1273 1274 controlPlane := &internal.ControlPlane{ 1275 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1276 Replicas: utilpointer.Int32(1), 1277 }}, 1278 Cluster: &clusterv1.Cluster{}, 1279 Machines: collections.FromMachines(m1), 1280 } 1281 1282 r := &KubeadmControlPlaneReconciler{ 1283 Client: env.GetClient(), 1284 recorder: record.NewFakeRecorder(32), 1285 managementCluster: &fakeManagementCluster{ 1286 Workload: fakeWorkloadCluster{ 1287 EtcdMembersResult: nodes(controlPlane.Machines), 1288 }, 1289 }, 1290 } 1291 controlPlane.InjectTestManagementCluster(r.managementCluster) 1292 1293 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1294 g.Expect(ret).To(BeFalse()) 1295 g.Expect(err).ToNot(HaveOccurred()) 1296 1297 g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) 1298 }) 1299 1300 t.Run("Can safely remediate 2 machine CP without additional etcd member failures", func(t *testing.T) { 1301 g := NewWithT(t) 1302 1303 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1304 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember()) 1305 1306 controlPlane := &internal.ControlPlane{ 1307 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1308 Replicas: utilpointer.Int32(3), 1309 }}, 1310 Cluster: &clusterv1.Cluster{}, 1311 Machines: collections.FromMachines(m1, m2), 1312 } 1313 1314 r := &KubeadmControlPlaneReconciler{ 1315 Client: env.GetClient(), 1316 recorder: record.NewFakeRecorder(32), 1317 managementCluster: &fakeManagementCluster{ 1318 Workload: fakeWorkloadCluster{ 1319 EtcdMembersResult: 
nodes(controlPlane.Machines), 1320 }, 1321 }, 1322 } 1323 controlPlane.InjectTestManagementCluster(r.managementCluster) 1324 1325 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1326 g.Expect(ret).To(BeTrue()) 1327 g.Expect(err).ToNot(HaveOccurred()) 1328 1329 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 1330 }) 1331 t.Run("Can safely remediate 2 machines CP when the etcd member being remediated is missing", func(t *testing.T) { 1332 g := NewWithT(t) 1333 1334 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1335 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember()) 1336 1337 controlPlane := &internal.ControlPlane{ 1338 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1339 Replicas: utilpointer.Int32(3), 1340 }}, 1341 Cluster: &clusterv1.Cluster{}, 1342 Machines: collections.FromMachines(m1, m2), 1343 } 1344 1345 members := make([]string, 0, len(controlPlane.Machines)-1) 1346 for _, n := range nodes(controlPlane.Machines) { 1347 if !strings.Contains(n, "m1-mhc-unhealthy-") { 1348 members = append(members, n) 1349 } 1350 } 1351 1352 r := &KubeadmControlPlaneReconciler{ 1353 Client: env.GetClient(), 1354 recorder: record.NewFakeRecorder(32), 1355 managementCluster: &fakeManagementCluster{ 1356 Workload: fakeWorkloadCluster{ 1357 EtcdMembersResult: members, 1358 }, 1359 }, 1360 } 1361 controlPlane.InjectTestManagementCluster(r.managementCluster) 1362 1363 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1364 g.Expect(ret).To(BeTrue()) 1365 g.Expect(err).ToNot(HaveOccurred()) 1366 1367 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 1368 }) 1369 t.Run("Can't safely remediate 2 machines CP with one additional etcd member failure", func(t *testing.T) { 1370 g := NewWithT(t) 1371 1372 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1373 m2 := createMachine(ctx, g, ns.Name, 
"m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 1374 1375 controlPlane := &internal.ControlPlane{ 1376 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1377 Replicas: utilpointer.Int32(3), 1378 }}, 1379 Cluster: &clusterv1.Cluster{}, 1380 Machines: collections.FromMachines(m1, m2), 1381 } 1382 1383 r := &KubeadmControlPlaneReconciler{ 1384 Client: env.GetClient(), 1385 recorder: record.NewFakeRecorder(32), 1386 managementCluster: &fakeManagementCluster{ 1387 Workload: fakeWorkloadCluster{ 1388 EtcdMembersResult: nodes(controlPlane.Machines), 1389 }, 1390 }, 1391 } 1392 controlPlane.InjectTestManagementCluster(r.managementCluster) 1393 1394 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1395 g.Expect(ret).To(BeFalse()) 1396 g.Expect(err).ToNot(HaveOccurred()) 1397 1398 g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) 1399 }) 1400 t.Run("Can safely remediate 3 machines CP without additional etcd member failures", func(t *testing.T) { 1401 g := NewWithT(t) 1402 1403 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1404 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember()) 1405 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) 1406 1407 controlPlane := &internal.ControlPlane{ 1408 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1409 Replicas: utilpointer.Int32(3), 1410 }}, 1411 Cluster: &clusterv1.Cluster{}, 1412 Machines: collections.FromMachines(m1, m2, m3), 1413 } 1414 1415 r := &KubeadmControlPlaneReconciler{ 1416 Client: env.GetClient(), 1417 recorder: record.NewFakeRecorder(32), 1418 managementCluster: &fakeManagementCluster{ 1419 Workload: fakeWorkloadCluster{ 1420 EtcdMembersResult: nodes(controlPlane.Machines), 1421 }, 1422 }, 1423 } 1424 controlPlane.InjectTestManagementCluster(r.managementCluster) 1425 1426 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, 
m1) 1427 g.Expect(ret).To(BeTrue()) 1428 g.Expect(err).ToNot(HaveOccurred()) 1429 1430 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 1431 }) 1432 t.Run("Can safely remediate 3 machines CP when the etcd member being remediated is missing", func(t *testing.T) { 1433 g := NewWithT(t) 1434 1435 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1436 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember()) 1437 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) 1438 1439 controlPlane := &internal.ControlPlane{ 1440 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1441 Replicas: utilpointer.Int32(3), 1442 }}, 1443 Cluster: &clusterv1.Cluster{}, 1444 Machines: collections.FromMachines(m1, m2, m3), 1445 } 1446 1447 members := make([]string, 0, len(controlPlane.Machines)-1) 1448 for _, n := range nodes(controlPlane.Machines) { 1449 if !strings.Contains(n, "m1-mhc-unhealthy-") { 1450 members = append(members, n) 1451 } 1452 } 1453 1454 r := &KubeadmControlPlaneReconciler{ 1455 Client: env.GetClient(), 1456 recorder: record.NewFakeRecorder(32), 1457 managementCluster: &fakeManagementCluster{ 1458 Workload: fakeWorkloadCluster{ 1459 EtcdMembersResult: members, 1460 }, 1461 }, 1462 } 1463 controlPlane.InjectTestManagementCluster(r.managementCluster) 1464 1465 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1466 g.Expect(ret).To(BeTrue()) 1467 g.Expect(err).ToNot(HaveOccurred()) 1468 1469 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 1470 }) 1471 t.Run("Can't safely remediate 3 machines CP with one additional etcd member failure", func(t *testing.T) { 1472 g := NewWithT(t) 1473 1474 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1475 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 1476 m3 := createMachine(ctx, g, ns.Name, 
"m3-etcd-healthy-", withHealthyEtcdMember()) 1477 1478 controlPlane := &internal.ControlPlane{ 1479 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1480 Replicas: utilpointer.Int32(3), 1481 }}, 1482 Cluster: &clusterv1.Cluster{}, 1483 Machines: collections.FromMachines(m1, m2, m3), 1484 } 1485 1486 r := &KubeadmControlPlaneReconciler{ 1487 Client: env.GetClient(), 1488 recorder: record.NewFakeRecorder(32), 1489 managementCluster: &fakeManagementCluster{ 1490 Workload: fakeWorkloadCluster{ 1491 EtcdMembersResult: nodes(controlPlane.Machines), 1492 }, 1493 }, 1494 } 1495 controlPlane.InjectTestManagementCluster(r.managementCluster) 1496 1497 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1498 g.Expect(ret).To(BeFalse()) 1499 g.Expect(err).ToNot(HaveOccurred()) 1500 1501 g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) 1502 }) 1503 t.Run("Can safely remediate 5 machines CP less than 2 additional etcd member failures", func(t *testing.T) { 1504 g := NewWithT(t) 1505 1506 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1507 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 1508 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) 1509 m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) 1510 m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) 1511 1512 controlPlane := &internal.ControlPlane{ 1513 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1514 Replicas: utilpointer.Int32(5), 1515 }}, 1516 Cluster: &clusterv1.Cluster{}, 1517 Machines: collections.FromMachines(m1, m2, m3, m4, m5), 1518 } 1519 1520 r := &KubeadmControlPlaneReconciler{ 1521 Client: env.GetClient(), 1522 recorder: record.NewFakeRecorder(32), 1523 managementCluster: &fakeManagementCluster{ 1524 Workload: fakeWorkloadCluster{ 1525 
EtcdMembersResult: nodes(controlPlane.Machines), 1526 }, 1527 }, 1528 } 1529 controlPlane.InjectTestManagementCluster(r.managementCluster) 1530 1531 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1532 g.Expect(ret).To(BeTrue()) 1533 g.Expect(err).ToNot(HaveOccurred()) 1534 1535 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) 1536 }) 1537 t.Run("Can't safely remediate 5 machines CP with 2 additional etcd member failures", func(t *testing.T) { 1538 g := NewWithT(t) 1539 1540 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1541 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 1542 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) 1543 m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) 1544 m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) 1545 1546 controlPlane := &internal.ControlPlane{ 1547 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1548 Replicas: utilpointer.Int32(7), 1549 }}, 1550 Cluster: &clusterv1.Cluster{}, 1551 Machines: collections.FromMachines(m1, m2, m3, m4, m5), 1552 } 1553 1554 r := &KubeadmControlPlaneReconciler{ 1555 Client: env.GetClient(), 1556 recorder: record.NewFakeRecorder(32), 1557 managementCluster: &fakeManagementCluster{ 1558 Workload: fakeWorkloadCluster{ 1559 EtcdMembersResult: nodes(controlPlane.Machines), 1560 }, 1561 }, 1562 } 1563 controlPlane.InjectTestManagementCluster(r.managementCluster) 1564 1565 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1566 g.Expect(ret).To(BeFalse()) 1567 g.Expect(err).ToNot(HaveOccurred()) 1568 1569 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) 1570 }) 1571 t.Run("Can safely remediate 7 machines CP with less than 3 additional etcd member failures", func(t *testing.T) { 1572 g := NewWithT(t) 1573 1574 m1 := createMachine(ctx, g, 
ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1575 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 1576 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) 1577 m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) 1578 m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) 1579 m6 := createMachine(ctx, g, ns.Name, "m6-etcd-healthy-", withHealthyEtcdMember()) 1580 m7 := createMachine(ctx, g, ns.Name, "m7-etcd-healthy-", withHealthyEtcdMember()) 1581 1582 controlPlane := &internal.ControlPlane{ 1583 KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ 1584 Replicas: utilpointer.Int32(7), 1585 }}, 1586 Cluster: &clusterv1.Cluster{}, 1587 Machines: collections.FromMachines(m1, m2, m3, m4, m5, m6, m7), 1588 } 1589 1590 r := &KubeadmControlPlaneReconciler{ 1591 Client: env.GetClient(), 1592 recorder: record.NewFakeRecorder(32), 1593 managementCluster: &fakeManagementCluster{ 1594 Workload: fakeWorkloadCluster{ 1595 EtcdMembersResult: nodes(controlPlane.Machines), 1596 }, 1597 }, 1598 } 1599 controlPlane.InjectTestManagementCluster(r.managementCluster) 1600 1601 ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) 1602 g.Expect(ret).To(BeTrue()) 1603 g.Expect(err).ToNot(HaveOccurred()) 1604 1605 g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5, m6, m7)).To(Succeed()) 1606 }) 1607 t.Run("Can't safely remediate 7 machines CP with 3 additional etcd member failures", func(t *testing.T) { 1608 g := NewWithT(t) 1609 1610 m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) 1611 m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) 1612 m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) 1613 m4 := createMachine(ctx, g, ns.Name, "m4-etcd-unhealthy-", withUnhealthyEtcdMember()) 1614 m5 := 
createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
		m6 := createMachine(ctx, g, ns.Name, "m6-etcd-healthy-", withHealthyEtcdMember())
		m7 := createMachine(ctx, g, ns.Name, "m7-etcd-healthy-", withHealthyEtcdMember())

		controlPlane := &internal.ControlPlane{
			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
				Replicas: utilpointer.Int32(5),
			}},
			Cluster:  &clusterv1.Cluster{},
			Machines: collections.FromMachines(m1, m2, m3, m4, m5, m6, m7),
		}

		r := &KubeadmControlPlaneReconciler{
			Client:   env.GetClient(),
			recorder: record.NewFakeRecorder(32),
			managementCluster: &fakeManagementCluster{
				Workload: fakeWorkloadCluster{
					EtcdMembersResult: nodes(controlPlane.Machines),
				},
			},
		}
		controlPlane.InjectTestManagementCluster(r.managementCluster)

		// With 3 unhealthy etcd members (m2, m3, m4) besides the machine being
		// remediated, removing m1's member is not safe.
		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
		g.Expect(ret).To(BeFalse())
		g.Expect(err).ToNot(HaveOccurred())

		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5, m6, m7)).To(Succeed())
	})
}

// nodes returns the NodeRef names of the given machines.
// Machines that do not have a NodeRef yet are skipped.
func nodes(machines collections.Machines) []string {
	nodes := make([]string, 0, machines.Len())
	for _, m := range machines {
		if m.Status.NodeRef != nil {
			nodes = append(nodes, m.Status.NodeRef.Name)
		}
	}
	return nodes
}

// machineOption mutates a Machine in place; options are composed by
// createMachine/getDeletingMachine to build test fixtures.
type machineOption func(*clusterv1.Machine)

// withMachineHealthCheckFailed marks the machine as having failed its
// MachineHealthCheck and as waiting for remediation.
func withMachineHealthCheckFailed() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkFalse(machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
	}
}

// withStuckRemediation marks the machine as passing its MachineHealthCheck
// while still carrying the OwnerRemediated condition, i.e. a remediation
// that was requested but never completed.
func withStuckRemediation() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkTrue(machine, clusterv1.MachineHealthCheckSucceededCondition)
		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
	}
}

// withHealthyEtcdMember marks the machine's etcd member as healthy.
func withHealthyEtcdMember() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition)
	}
}

// withUnhealthyEtcdMember marks the machine's etcd member as unhealthy.
func withUnhealthyEtcdMember() machineOption {
	return func(machine *clusterv1.Machine) {
		conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "")
	}
}

// withNodeRef sets the machine's NodeRef to a Node with the given name.
func withNodeRef(ref string) machineOption {
	return func(machine *clusterv1.Machine) {
		machine.Status.NodeRef = &corev1.ObjectReference{
			Kind: "Node",
			Name: ref,
		}
	}
}

// withRemediateForAnnotation records on the machine, via the remediation-for
// annotation, which unhealthy machine it was created to replace.
func withRemediateForAnnotation(remediatedFor string) machineOption {
	return func(machine *clusterv1.Machine) {
		if machine.Annotations == nil {
			machine.Annotations = map[string]string{}
		}
		machine.Annotations[controlplanev1.RemediationForAnnotation] = remediatedFor
	}
}

// withWaitBeforeDeleteFinalizer adds a finalizer so the machine stays in
// deleting until the test explicitly removes the finalizer.
func withWaitBeforeDeleteFinalizer() machineOption {
	return func(machine *clusterv1.Machine) {
		machine.Finalizers = []string{"wait-before-delete"}
	}
}

// createMachine creates a Machine in the test environment, applies the given
// options plus a default NodeRef derived from the generated machine name,
// and patches the result back (so status changes, e.g. conditions, stick).
func createMachine(ctx context.Context, g *WithT, namespace, name string, options ...machineOption) *clusterv1.Machine {
	m := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:    namespace,
			GenerateName: name,
		},
		Spec: clusterv1.MachineSpec{
			ClusterName: "cluster",
			Bootstrap: clusterv1.Bootstrap{
				DataSecretName: utilpointer.String("secret"),
			},
		},
	}
	g.Expect(env.CreateAndWait(ctx, m)).To(Succeed())

	patchHelper, err := patch.NewHelper(m, env.GetClient())
	g.Expect(err).ToNot(HaveOccurred())

	// The NodeRef option is appended last so every machine ends up with a
	// node name matching its (generated) machine name; see nodes().
	for _, opt := range append(options, withNodeRef(fmt.Sprintf("node-%s", m.Name))) {
		opt(m)
	}

	g.Expect(patchHelper.Patch(ctx, m)).To(Succeed())
	return m
}

// getDeletingMachine returns an in-memory Machine (it is NOT created in the
// test environment) that already has a deletion timestamp set.
func getDeletingMachine(namespace, name string, options ...machineOption) *clusterv1.Machine {
	deletionTime := metav1.Now()
	m := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:         namespace,
			Name:              name,
			DeletionTimestamp: &deletionTime,
		},
		Spec: clusterv1.MachineSpec{
			ClusterName: "cluster",
			Bootstrap: clusterv1.Bootstrap{
				DataSecretName: utilpointer.String("secret"),
			},
		},
	}

	for _, opt := range append(options, withNodeRef(fmt.Sprintf("node-%s", m.Name))) {
		opt(m)
	}
	return m
}

// assertMachineCondition polls (up to 10s) until the machine's condition t
// matches the expected status, reason, severity, and message, refreshing the
// machine from the API server on every attempt.
func assertMachineCondition(ctx context.Context, g *WithT, m *clusterv1.Machine, t clusterv1.ConditionType, status corev1.ConditionStatus, reason string, severity clusterv1.ConditionSeverity, message string) {
	g.Eventually(func() error {
		if err := env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m); err != nil {
			return err
		}
		c := conditions.Get(m, t)
		if c == nil {
			return errors.Errorf("condition %q was nil", t)
		}
		if c.Status != status {
			return errors.Errorf("condition %q status %q did not match %q", t, c.Status, status)
		}
		if c.Reason != reason {
			return errors.Errorf("condition %q reason %q did not match %q", t, c.Reason, reason)
		}
		if c.Severity != severity {
			// Fix: report the mismatched severities; previously this printed
			// the (already matching) status values, hiding the real diff.
			return errors.Errorf("condition %q severity %q did not match %q", t, c.Severity, severity)
		}
		if c.Message != message {
			return errors.Errorf("condition %q message %q did not match %q", t, c.Message, message)
		}
		return nil
	}, 10*time.Second).Should(Succeed())
}

// MustMarshalRemediationData marshals r and panics on failure; for use when
// building test fixtures only.
func MustMarshalRemediationData(r *RemediationData) string {
	s, err := r.Marshal()
	if err != nil {
		// Include the underlying error instead of discarding it.
		panic(fmt.Sprintf("failed to marshal remediation data: %v", err))
	}
	return s
}