k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/apimachinery/etcd_failure.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package apimachinery

import (
	"context"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/wait"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/test/e2e/apps"
	"k8s.io/kubernetes/test/e2e/framework"
	e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
	testutils "k8s.io/kubernetes/test/utils"
	imageutils "k8s.io/kubernetes/test/utils/image"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
)

var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {

	f := framework.NewDefaultFramework("etcd-failure")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.BeforeEach(func(ctx context.Context) {
		// This test requires:
		// - SSH
		// - master access
		// ... so the provider check should be identical to the intersection of
		// providers that provide those capabilities.
		e2eskipper.SkipUnlessProviderIs("gce")
		e2eskipper.SkipUnlessSSHKeyPresent()

		err := e2erc.RunRC(ctx, testutils.RCConfig{
			Client:    f.ClientSet,
			Name:      "baz",
			Namespace: f.Namespace.Name,
			Image:     imageutils.GetPauseImageName(),
			Replicas:  1,
		})
		framework.ExpectNoError(err)
	})

	ginkgo.It("should recover from network partition with master", func(ctx context.Context) {
		etcdFailTest(
			ctx,
			f,
			// Drop all inbound TCP traffic to etcd's client port (2379),
			// then delete the same rule to restore connectivity.
			"sudo iptables -A INPUT -p tcp --destination-port 2379 -j DROP",
			"sudo iptables -D INPUT -p tcp --destination-port 2379 -j DROP",
		)
	})

	ginkgo.It("should recover from SIGKILL", func(ctx context.Context) {
		etcdFailTest(
			ctx,
			f,
			// SIGKILL every etcd process; the fix step is a no-op because the
			// master's supervisor (monit) is expected to restart etcd.
			"pgrep etcd | xargs -I {} sudo kill -9 {}",
			"echo 'do nothing. monit should restart etcd.'",
		)
	})
})

func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
	doEtcdFailure(ctx, failCommand, fixCommand)

	checkExistingRCRecovers(ctx, f)

	apps.TestReplicationControllerServeImageOrFail(ctx, f, "basic", imageutils.GetE2EImage(imageutils.Agnhost))
}
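
// Any failure mode expressible as a pair of shell commands runnable on the
// master can plug into etcdFailTest the same way the two cases above do: one
// command to break etcd, one to restore it. The command strings in this
// sketch are hypothetical placeholders, not part of the original test:
//
//	ginkgo.It("should recover from <some etcd failure>", func(ctx context.Context) {
//		etcdFailTest(ctx, f, "<command that breaks etcd>", "<command that restores etcd>")
//	})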

// For this duration, etcd is failed by executing the failCommand on the
// master. After the duration elapses, the fixCommand is executed on the
// master, and the test goes on to assert that etcd and the Kubernetes
// components recover.
const etcdFailureDuration = 20 * time.Second

func doEtcdFailure(ctx context.Context, failCommand, fixCommand string) {
	ginkgo.By("failing etcd")

	masterExec(ctx, failCommand)
	time.Sleep(etcdFailureDuration)
	masterExec(ctx, fixCommand)
}

func masterExec(ctx context.Context, cmd string) {
	host := framework.APIAddress() + ":22"
	result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err, "failed to SSH to host %s on provider %s and run command: %q", host, framework.TestContext.Provider, cmd)
	if result.Code != 0 {
		e2essh.LogResult(result)
		framework.Failf("master exec command returned non-zero")
	}
}

func checkExistingRCRecovers(ctx context.Context, f *framework.Framework) {
	ginkgo.By("assert that the pre-existing replication controller recovers")
	podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name)
	rcSelector := labels.Set{"name": "baz"}.AsSelector()

	ginkgo.By("deleting pods from existing replication controller")
	framework.ExpectNoError(wait.PollWithContext(ctx, time.Millisecond*500, time.Second*60, func(ctx context.Context) (bool, error) {
		options := metav1.ListOptions{LabelSelector: rcSelector.String()}
		pods, err := podClient.List(ctx, options)
		if err != nil {
			framework.Logf("apiserver returned error, as expected before recovery: %v", err)
			return false, nil
		}
		if len(pods.Items) == 0 {
			return false, nil
		}
		for _, pod := range pods.Items {
			err = podClient.Delete(ctx, pod.Name, *metav1.NewDeleteOptions(0))
			framework.ExpectNoError(err, "failed to delete pod %s in namespace: %s", pod.Name, f.Namespace.Name)
		}
		framework.Logf("apiserver has recovered")
		return true, nil
	}))

	ginkgo.By("waiting for replication controller to recover")
	framework.ExpectNoError(wait.PollWithContext(ctx, time.Millisecond*500, time.Second*60, func(ctx context.Context) (bool, error) {
		options := metav1.ListOptions{LabelSelector: rcSelector.String()}
		pods, err := podClient.List(ctx, options)
		framework.ExpectNoError(err, "failed to list pods in namespace: %s, that match label selector: %s", f.Namespace.Name, rcSelector.String())
		for _, pod := range pods.Items {
			if pod.DeletionTimestamp == nil && podutil.IsPodReady(&pod) {
				return true, nil
			}
		}
		return false, nil
	}))
}
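
// Note: wait.PollWithContext, used above, is deprecated in current
// k8s.io/apimachinery releases in favor of wait.PollUntilContextTimeout.
// A behavior-preserving form of the first poll would look like the sketch
// below (immediate=false keeps PollWithContext's one-interval delay before
// the first check); this is illustrative only, not a change the original
// file makes:
//
//	framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 60*time.Second, false,
//		func(ctx context.Context) (bool, error) {
//			pods, err := podClient.List(ctx, metav1.ListOptions{LabelSelector: rcSelector.String()})
//			if err != nil {
//				return false, nil // apiserver may still be failing until etcd recovers
//			}
//			return len(pods.Items) > 0, nil
//		}))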