k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/apimachinery/etcd_failure.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package apimachinery

import (
	"context"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/wait"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/test/e2e/apps"
	"k8s.io/kubernetes/test/e2e/framework"
	e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
	testutils "k8s.io/kubernetes/test/utils"
	imageutils "k8s.io/kubernetes/test/utils/image"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
)

var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {

	f := framework.NewDefaultFramework("etcd-failure")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.BeforeEach(func(ctx context.Context) {
		// This test requires:
		// - SSH
		// - master access
		// ... so the provider check should be identical to the intersection of
		// providers that provide those capabilities.
		e2eskipper.SkipUnlessProviderIs("gce")
		e2eskipper.SkipUnlessSSHKeyPresent()

		err := e2erc.RunRC(ctx, testutils.RCConfig{
			Client:    f.ClientSet,
			Name:      "baz",
			Namespace: f.Namespace.Name,
			Image:     imageutils.GetPauseImageName(),
			Replicas:  1,
		})
		framework.ExpectNoError(err)
	})

	ginkgo.It("should recover from network partition with master", func(ctx context.Context) {
		etcdFailTest(
			ctx,
			f,
			// Drop all inbound TCP traffic to etcd's client port (2379),
			// then delete the same rule to restore connectivity.
			"sudo iptables -A INPUT -p tcp --destination-port 2379 -j DROP",
			"sudo iptables -D INPUT -p tcp --destination-port 2379 -j DROP",
		)
	})

	ginkgo.It("should recover from SIGKILL", func(ctx context.Context) {
		etcdFailTest(
			ctx,
			f,
			// SIGKILL every etcd process; the fix step is a no-op because the
			// master's supervisor (monit) is expected to restart etcd.
			"pgrep etcd | xargs -I {} sudo kill -9 {}",
			"echo 'do nothing. monit should restart etcd.'",
		)
	})
})

func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
	doEtcdFailure(ctx, failCommand, fixCommand)

	checkExistingRCRecovers(ctx, f)

	apps.TestReplicationControllerServeImageOrFail(ctx, f, "basic", imageutils.GetE2EImage(imageutils.Agnhost))
}
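
// Any failure mode expressible as a pair of shell commands runnable on the
// master can plug into etcdFailTest the same way the two cases above do: one
// command to break etcd, one to restore it. The command strings in this
// sketch are hypothetical placeholders, not part of the original test:
//
//	ginkgo.It("should recover from <some etcd failure>", func(ctx context.Context) {
//		etcdFailTest(ctx, f, "<command that breaks etcd>", "<command that restores etcd>")
//	})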

// For this duration, etcd is failed by executing the failCommand on the
// master. After the duration elapses, the fixCommand is executed on the
// master, and the test goes on to assert that etcd and the Kubernetes
// components recover.
const etcdFailureDuration = 20 * time.Second

func doEtcdFailure(ctx context.Context, failCommand, fixCommand string) {
	ginkgo.By("failing etcd")

	masterExec(ctx, failCommand)
	time.Sleep(etcdFailureDuration)
	masterExec(ctx, fixCommand)
}

func masterExec(ctx context.Context, cmd string) {
	host := framework.APIAddress() + ":22"
	result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err, "failed to SSH to host %s on provider %s and run command: %q", host, framework.TestContext.Provider, cmd)
	if result.Code != 0 {
		e2essh.LogResult(result)
		framework.Failf("master exec command returned non-zero")
	}
}

func checkExistingRCRecovers(ctx context.Context, f *framework.Framework) {
	ginkgo.By("assert that the pre-existing replication controller recovers")
	podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name)
	rcSelector := labels.Set{"name": "baz"}.AsSelector()

	ginkgo.By("deleting pods from existing replication controller")
	framework.ExpectNoError(wait.PollWithContext(ctx, time.Millisecond*500, time.Second*60, func(ctx context.Context) (bool, error) {
		options := metav1.ListOptions{LabelSelector: rcSelector.String()}
		pods, err := podClient.List(ctx, options)
		if err != nil {
			framework.Logf("apiserver returned error, as expected before recovery: %v", err)
			return false, nil
		}
		if len(pods.Items) == 0 {
			return false, nil
		}
		for _, pod := range pods.Items {
			err = podClient.Delete(ctx, pod.Name, *metav1.NewDeleteOptions(0))
			framework.ExpectNoError(err, "failed to delete pod %s in namespace: %s", pod.Name, f.Namespace.Name)
		}
		framework.Logf("apiserver has recovered")
		return true, nil
	}))

	ginkgo.By("waiting for replication controller to recover")
	framework.ExpectNoError(wait.PollWithContext(ctx, time.Millisecond*500, time.Second*60, func(ctx context.Context) (bool, error) {
		options := metav1.ListOptions{LabelSelector: rcSelector.String()}
		pods, err := podClient.List(ctx, options)
		framework.ExpectNoError(err, "failed to list pods in namespace: %s, that match label selector: %s", f.Namespace.Name, rcSelector.String())
		for _, pod := range pods.Items {
			if pod.DeletionTimestamp == nil && podutil.IsPodReady(&pod) {
				return true, nil
			}
		}
		return false, nil
	}))
}
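
// Note: wait.PollWithContext, used above, is deprecated in current
// k8s.io/apimachinery releases in favor of wait.PollUntilContextTimeout.
// A behavior-preserving form of the first poll would look like the sketch
// below (immediate=false keeps PollWithContext's one-interval delay before
// the first check); this is illustrative only, not a change the original
// file makes:
//
//	framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 60*time.Second, false,
//		func(ctx context.Context) (bool, error) {
//			pods, err := podClient.List(ctx, metav1.ListOptions{LabelSelector: rcSelector.String()})
//			if err != nil {
//				return false, nil // apiserver may still be failing until etcd recovers
//			}
//			return len(pods.Items) > 0, nil
//		}))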