istio.io/istio@v0.0.0-20240520182934-d79c90f27776/tests/integration/pilot/cni_race_test.go (about)

     1  //go:build integ
     2  // +build integ
     3  
     4  //  Copyright Istio Authors
     5  //
     6  //  Licensed under the Apache License, Version 2.0 (the "License");
     7  //  you may not use this file except in compliance with the License.
     8  //  You may obtain a copy of the License at
     9  //
    10  //      http://www.apache.org/licenses/LICENSE-2.0
    11  //
    12  //  Unless required by applicable law or agreed to in writing, software
    13  //  distributed under the License is distributed on an "AS IS" BASIS,
    14  //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  //  See the License for the specific language governing permissions and
    16  //  limitations under the License.
    17  
    18  package pilot
    19  
    20  import (
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"testing"
    25  	"time"
    26  
    27  	appsv1 "k8s.io/api/apps/v1"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  
    30  	istioKube "istio.io/istio/pkg/kube"
    31  	"istio.io/istio/pkg/test/framework"
    32  	"istio.io/istio/pkg/test/framework/components/cluster"
    33  	"istio.io/istio/pkg/test/framework/components/echo"
    34  	"istio.io/istio/pkg/test/framework/components/echo/common/ports"
    35  	"istio.io/istio/pkg/test/framework/components/echo/deployment"
    36  	"istio.io/istio/pkg/test/framework/components/namespace"
    37  	"istio.io/istio/pkg/test/scopes"
    38  	"istio.io/istio/pkg/test/shell"
    39  	"istio.io/istio/pkg/test/util/retry"
    40  	"istio.io/istio/tools/istio-iptables/pkg/constants"
    41  )
    42  
    43  func TestCNIRaceRepair(t *testing.T) {
    44  	framework.NewTest(t).
    45  		Run(func(t framework.TestContext) {
    46  			if !i.Settings().EnableCNI {
    47  				t.Skip("CNI race condition mitigation is only tested when CNI is enabled.")
    48  			}
    49  			c := t.Clusters().Default()
    50  
    51  			ns := namespace.NewOrFail(t, t, namespace.Config{
    52  				Prefix: "cni-race",
    53  				Inject: true,
    54  			})
    55  
    56  			// Create a echo deployment in the cni-race namespace.
    57  			t.Logf("Deploy an echo instance in namespace %v...", ns.Name())
    58  			deployment.
    59  				New(t, c).
    60  				WithConfig(echo.Config{
    61  					Namespace: ns,
    62  					Ports:     ports.All(),
    63  					Subsets:   []echo.SubsetConfig{{}},
    64  				}).BuildOrFail(t)
    65  
    66  			// To begin with, delete CNI Daemonset to simulate a CNI race condition.
    67  			// Temporarily store CNI DaemonSet, which will be deployed again later.
    68  			t.Log("Delete CNI Daemonset temporarily to simulate race condition")
    69  			cniDaemonSet := getCNIDaemonSet(t, c)
    70  			deleteCNIDaemonset(t, c)
    71  
    72  			// Rollout restart instances in the echo namespace, and wait for a broken instance.
    73  			t.Log("Rollout restart echo instance to get a broken instance")
    74  			rolloutCmd := fmt.Sprintf("kubectl rollout restart deployment -n %s", ns.Name())
    75  			if _, err := shell.Execute(true, rolloutCmd); err != nil {
    76  				t.Fatalf("failed to rollout restart deployments %v", err)
    77  			}
    78  			waitForBrokenPodOrFail(t, c, ns)
    79  
    80  			t.Log("Redeploy CNI and verify repair takes effect by evicting the broken pod")
    81  			// Now bring back CNI Daemonset, and pod in the echo namespace should be repaired.
    82  			deployCNIDaemonset(t, c, cniDaemonSet)
    83  			waitForRepairOrFail(t, c, ns)
    84  		})
    85  }
    86  
    87  func getCNIDaemonSet(ctx framework.TestContext, c cluster.Cluster) *appsv1.DaemonSet {
    88  	cniDaemonSet, err := c.(istioKube.CLIClient).
    89  		Kube().AppsV1().DaemonSets(i.Settings().SystemNamespace).
    90  		Get(context.Background(), "istio-cni-node", metav1.GetOptions{})
    91  	if err != nil {
    92  		ctx.Fatalf("failed to get CNI Daemonset %v", err)
    93  	}
    94  	if cniDaemonSet == nil {
    95  		ctx.Fatal("cannot find CNI Daemonset")
    96  	}
    97  	return cniDaemonSet
    98  }
    99  
   100  func deleteCNIDaemonset(ctx framework.TestContext, c cluster.Cluster) {
   101  	if err := c.(istioKube.CLIClient).
   102  		Kube().AppsV1().DaemonSets(i.Settings().SystemNamespace).
   103  		Delete(context.Background(), "istio-cni-node", metav1.DeleteOptions{}); err != nil {
   104  		ctx.Fatalf("failed to delete CNI Daemonset %v", err)
   105  	}
   106  
   107  	// Wait until the CNI Daemonset pod cannot be fetched anymore
   108  	retry.UntilSuccessOrFail(ctx, func() error {
   109  		scopes.Framework.Infof("Checking if CNI Daemonset pods are deleted...")
   110  		pods, err := c.PodsForSelector(context.TODO(), i.Settings().SystemNamespace, "k8s-app=istio-cni-node")
   111  		if err != nil {
   112  			return err
   113  		}
   114  		if len(pods.Items) > 0 {
   115  			return errors.New("CNI Daemonset pod still exists after deletion")
   116  		}
   117  		return nil
   118  	}, retry.Delay(1*time.Second), retry.Timeout(80*time.Second))
   119  }
   120  
   121  func deployCNIDaemonset(ctx framework.TestContext, c cluster.Cluster, cniDaemonSet *appsv1.DaemonSet) {
   122  	deployDaemonSet := appsv1.DaemonSet{}
   123  	deployDaemonSet.Spec = cniDaemonSet.Spec
   124  	deployDaemonSet.ObjectMeta = metav1.ObjectMeta{
   125  		Name:        cniDaemonSet.ObjectMeta.Name,
   126  		Namespace:   cniDaemonSet.ObjectMeta.Namespace,
   127  		Labels:      cniDaemonSet.ObjectMeta.Labels,
   128  		Annotations: cniDaemonSet.ObjectMeta.Annotations,
   129  	}
   130  	_, err := c.(istioKube.CLIClient).Kube().AppsV1().DaemonSets(i.Settings().SystemNamespace).
   131  		Create(context.Background(), &deployDaemonSet, metav1.CreateOptions{})
   132  	if err != nil {
   133  		ctx.Fatalf("failed to deploy CNI Daemonset %v", err)
   134  	}
   135  }
   136  
   137  func waitForBrokenPodOrFail(t framework.TestContext, cluster cluster.Cluster, ns namespace.Instance) {
   138  	retry.UntilSuccessOrFail(t, func() error {
   139  		pods, err := cluster.Kube().CoreV1().Pods(ns.Name()).List(context.TODO(), metav1.ListOptions{})
   140  		if err != nil {
   141  			return err
   142  		}
   143  		if len(pods.Items) == 0 {
   144  			return fmt.Errorf("still waiting the pod in namespace %v to start", ns.Name())
   145  		}
   146  		// Verify that at least one pod is in broken state due to the race condition.
   147  		for _, p := range pods.Items {
   148  			for _, container := range p.Status.InitContainerStatuses {
   149  				if state := container.LastTerminationState.Terminated; state != nil && state.ExitCode ==
   150  					constants.ValidationErrorCode {
   151  					return nil
   152  				}
   153  			}
   154  		}
   155  		return fmt.Errorf("cannot find any pod with wanted exit code %v", constants.ValidationErrorCode)
   156  	}, retry.Delay(1*time.Second), retry.Timeout(80*time.Second))
   157  }
   158  
   159  func waitForRepairOrFail(t framework.TestContext, cluster cluster.Cluster, ns namespace.Instance) {
   160  	retry.UntilSuccessOrFail(t, func() error {
   161  		pods, err := cluster.Kube().CoreV1().Pods(ns.Name()).List(context.TODO(), metav1.ListOptions{})
   162  		if err != nil {
   163  			return err
   164  		}
   165  		if len(pods.Items) == 0 {
   166  			return errors.New("no pod found")
   167  		}
   168  		// Verify that no pod is broken by the race condition now.
   169  		for _, p := range pods.Items {
   170  			for _, container := range p.Status.InitContainerStatuses {
   171  				if state := container.LastTerminationState.Terminated; state != nil && state.ExitCode ==
   172  					constants.ValidationErrorCode {
   173  					return errors.New("there are still pods in broken state due to CNI race condition")
   174  				}
   175  			}
   176  		}
   177  		return nil
   178  	}, retry.Delay(1*time.Second), retry.Timeout(80*time.Second))
   179  }