istio.io/istio@v0.0.0-20240520182934-d79c90f27776/tests/integration/pilot/cni_race_test.go (about) 1 //go:build integ 2 // +build integ 3 4 // Copyright Istio Authors 5 // 6 // Licensed under the Apache License, Version 2.0 (the "License"); 7 // you may not use this file except in compliance with the License. 8 // You may obtain a copy of the License at 9 // 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 18 package pilot 19 20 import ( 21 "context" 22 "errors" 23 "fmt" 24 "testing" 25 "time" 26 27 appsv1 "k8s.io/api/apps/v1" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 30 istioKube "istio.io/istio/pkg/kube" 31 "istio.io/istio/pkg/test/framework" 32 "istio.io/istio/pkg/test/framework/components/cluster" 33 "istio.io/istio/pkg/test/framework/components/echo" 34 "istio.io/istio/pkg/test/framework/components/echo/common/ports" 35 "istio.io/istio/pkg/test/framework/components/echo/deployment" 36 "istio.io/istio/pkg/test/framework/components/namespace" 37 "istio.io/istio/pkg/test/scopes" 38 "istio.io/istio/pkg/test/shell" 39 "istio.io/istio/pkg/test/util/retry" 40 "istio.io/istio/tools/istio-iptables/pkg/constants" 41 ) 42 43 func TestCNIRaceRepair(t *testing.T) { 44 framework.NewTest(t). 45 Run(func(t framework.TestContext) { 46 if !i.Settings().EnableCNI { 47 t.Skip("CNI race condition mitigation is only tested when CNI is enabled.") 48 } 49 c := t.Clusters().Default() 50 51 ns := namespace.NewOrFail(t, t, namespace.Config{ 52 Prefix: "cni-race", 53 Inject: true, 54 }) 55 56 // Create a echo deployment in the cni-race namespace. 57 t.Logf("Deploy an echo instance in namespace %v...", ns.Name()) 58 deployment. 59 New(t, c). 60 WithConfig(echo.Config{ 61 Namespace: ns, 62 Ports: ports.All(), 63 Subsets: []echo.SubsetConfig{{}}, 64 }).BuildOrFail(t) 65 66 // To begin with, delete CNI Daemonset to simulate a CNI race condition. 67 // Temporarily store CNI DaemonSet, which will be deployed again later. 68 t.Log("Delete CNI Daemonset temporarily to simulate race condition") 69 cniDaemonSet := getCNIDaemonSet(t, c) 70 deleteCNIDaemonset(t, c) 71 72 // Rollout restart instances in the echo namespace, and wait for a broken instance. 73 t.Log("Rollout restart echo instance to get a broken instance") 74 rolloutCmd := fmt.Sprintf("kubectl rollout restart deployment -n %s", ns.Name()) 75 if _, err := shell.Execute(true, rolloutCmd); err != nil { 76 t.Fatalf("failed to rollout restart deployments %v", err) 77 } 78 waitForBrokenPodOrFail(t, c, ns) 79 80 t.Log("Redeploy CNI and verify repair takes effect by evicting the broken pod") 81 // Now bring back CNI Daemonset, and pod in the echo namespace should be repaired. 82 deployCNIDaemonset(t, c, cniDaemonSet) 83 waitForRepairOrFail(t, c, ns) 84 }) 85 } 86 87 func getCNIDaemonSet(ctx framework.TestContext, c cluster.Cluster) *appsv1.DaemonSet { 88 cniDaemonSet, err := c.(istioKube.CLIClient). 89 Kube().AppsV1().DaemonSets(i.Settings().SystemNamespace). 90 Get(context.Background(), "istio-cni-node", metav1.GetOptions{}) 91 if err != nil { 92 ctx.Fatalf("failed to get CNI Daemonset %v", err) 93 } 94 if cniDaemonSet == nil { 95 ctx.Fatal("cannot find CNI Daemonset") 96 } 97 return cniDaemonSet 98 } 99 100 func deleteCNIDaemonset(ctx framework.TestContext, c cluster.Cluster) { 101 if err := c.(istioKube.CLIClient). 102 Kube().AppsV1().DaemonSets(i.Settings().SystemNamespace). 103 Delete(context.Background(), "istio-cni-node", metav1.DeleteOptions{}); err != nil { 104 ctx.Fatalf("failed to delete CNI Daemonset %v", err) 105 } 106 107 // Wait until the CNI Daemonset pod cannot be fetched anymore 108 retry.UntilSuccessOrFail(ctx, func() error { 109 scopes.Framework.Infof("Checking if CNI Daemonset pods are deleted...") 110 pods, err := c.PodsForSelector(context.TODO(), i.Settings().SystemNamespace, "k8s-app=istio-cni-node") 111 if err != nil { 112 return err 113 } 114 if len(pods.Items) > 0 { 115 return errors.New("CNI Daemonset pod still exists after deletion") 116 } 117 return nil 118 }, retry.Delay(1*time.Second), retry.Timeout(80*time.Second)) 119 } 120 121 func deployCNIDaemonset(ctx framework.TestContext, c cluster.Cluster, cniDaemonSet *appsv1.DaemonSet) { 122 deployDaemonSet := appsv1.DaemonSet{} 123 deployDaemonSet.Spec = cniDaemonSet.Spec 124 deployDaemonSet.ObjectMeta = metav1.ObjectMeta{ 125 Name: cniDaemonSet.ObjectMeta.Name, 126 Namespace: cniDaemonSet.ObjectMeta.Namespace, 127 Labels: cniDaemonSet.ObjectMeta.Labels, 128 Annotations: cniDaemonSet.ObjectMeta.Annotations, 129 } 130 _, err := c.(istioKube.CLIClient).Kube().AppsV1().DaemonSets(i.Settings().SystemNamespace). 131 Create(context.Background(), &deployDaemonSet, metav1.CreateOptions{}) 132 if err != nil { 133 ctx.Fatalf("failed to deploy CNI Daemonset %v", err) 134 } 135 } 136 137 func waitForBrokenPodOrFail(t framework.TestContext, cluster cluster.Cluster, ns namespace.Instance) { 138 retry.UntilSuccessOrFail(t, func() error { 139 pods, err := cluster.Kube().CoreV1().Pods(ns.Name()).List(context.TODO(), metav1.ListOptions{}) 140 if err != nil { 141 return err 142 } 143 if len(pods.Items) == 0 { 144 return fmt.Errorf("still waiting the pod in namespace %v to start", ns.Name()) 145 } 146 // Verify that at least one pod is in broken state due to the race condition. 147 for _, p := range pods.Items { 148 for _, container := range p.Status.InitContainerStatuses { 149 if state := container.LastTerminationState.Terminated; state != nil && state.ExitCode == 150 constants.ValidationErrorCode { 151 return nil 152 } 153 } 154 } 155 return fmt.Errorf("cannot find any pod with wanted exit code %v", constants.ValidationErrorCode) 156 }, retry.Delay(1*time.Second), retry.Timeout(80*time.Second)) 157 } 158 159 func waitForRepairOrFail(t framework.TestContext, cluster cluster.Cluster, ns namespace.Instance) { 160 retry.UntilSuccessOrFail(t, func() error { 161 pods, err := cluster.Kube().CoreV1().Pods(ns.Name()).List(context.TODO(), metav1.ListOptions{}) 162 if err != nil { 163 return err 164 } 165 if len(pods.Items) == 0 { 166 return errors.New("no pod found") 167 } 168 // Verify that no pod is broken by the race condition now. 169 for _, p := range pods.Items { 170 for _, container := range p.Status.InitContainerStatuses { 171 if state := container.LastTerminationState.Terminated; state != nil && state.ExitCode == 172 constants.ValidationErrorCode { 173 return errors.New("there are still pods in broken state due to CNI race condition") 174 } 175 } 176 } 177 return nil 178 }, retry.Delay(1*time.Second), retry.Timeout(80*time.Second)) 179 }