k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/cloud/gcp/reboot.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gcp

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/sets"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
	testutils "k8s.io/kubernetes/test/utils"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
)

const (
	// How long a node is allowed to go from "Ready" to "NotReady" after a
	// reboot is issued before the test is considered failed.
	rebootNodeNotReadyTimeout = 2 * time.Minute

	// How long a node is allowed to go from "NotReady" to "Ready" after a
	// reboot is issued and it is found to be "NotReady" before the test is
	// considered failed.
	rebootNodeReadyAgainTimeout = 5 * time.Minute

	// How long pods have to be "ready" after the reboot.
	rebootPodReadyAgainTimeout = 5 * time.Minute
)

var _ = SIGDescribe("Reboot", framework.WithDisruptive(), feature.Reboot, func() {
	var f *framework.Framework

	ginkgo.BeforeEach(func() {
		// These tests require SSH access to the nodes, so the provider check should be
		// identical to the one used for SSH support (the limiting factor is the
		// implementation of util.go's e2essh.GetSigner(...)).

		// Cluster must support node reboot
		e2eskipper.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
	})

	ginkgo.AfterEach(func(ctx context.Context) {
		if ginkgo.CurrentSpecReport().Failed() {
			// Most of the reboot tests just make sure that addon/system pods are running, so dump
			// events for the kube-system namespace on failures
			namespaceName := metav1.NamespaceSystem
			ginkgo.By(fmt.Sprintf("Collecting events from namespace %q.", namespaceName))
			events, err := f.ClientSet.CoreV1().Events(namespaceName).List(ctx, metav1.ListOptions{})
			framework.ExpectNoError(err)

			for _, e := range events.Items {
				framework.Logf("event for %v: %v %v: %v", e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
			}
		}
		// In GKE, our current tunneling setup has the potential to hold on to a broken tunnel (from a
		// rebooted/deleted node) for up to 5 minutes before all tunnels are dropped and recreated. Most tests
		// make use of some proxy feature to verify functionality. So, if a reboot test runs right before a test
		// that tries to get logs, for example, we may get unlucky and try to use a closed tunnel to a node that
		// was recently rebooted.
		// There's no good way to framework.Poll for proxies being closed, so we sleep.
		//
		// TODO(cjcullen) reduce this sleep (#19314)
		if framework.ProviderIs("gke") {
			ginkgo.By("waiting 5 minutes for all dead tunnels to be dropped")
			time.Sleep(5 * time.Minute)
		}
	})

	f = framework.NewDefaultFramework("reboot")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.It("each node by ordering clean reboot and ensure they function upon restart", func(ctx context.Context) {
		// clean shutdown and restart
		// We sleep 10 seconds to give the ssh command time to finish cleanly before the node is rebooted.
		testReboot(ctx, f.ClientSet, "nohup sh -c 'sleep 10 && sudo reboot' >/dev/null 2>&1 &", nil)
	})

	ginkgo.It("each node by ordering unclean reboot and ensure they function upon restart", func(ctx context.Context) {
		// unclean shutdown and restart
		// We sleep 10 seconds to give the ssh command time to finish cleanly before the node is shut down.
		testReboot(ctx, f.ClientSet, "nohup sh -c 'echo 1 | sudo tee /proc/sys/kernel/sysrq && sleep 10 && echo b | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &", nil)
	})

	ginkgo.It("each node by triggering kernel panic and ensure they function upon restart", func(ctx context.Context) {
		// kernel panic
		// We sleep 10 seconds to give the ssh command time to finish cleanly before the kernel panic is triggered.
		testReboot(ctx, f.ClientSet, "nohup sh -c 'echo 1 | sudo tee /proc/sys/kernel/sysrq && sleep 10 && echo c | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &", nil)
	})

	ginkgo.It("each node by switching off the network interface and ensure they function upon switch on", func(ctx context.Context) {
		// switch the network interface off for a while to simulate a network outage
		// We sleep 10 seconds to give the ssh command time to finish cleanly before the network goes down.
		cmd := "nohup sh -c '" +
			"sleep 10; " +
			"echo Shutting down eth0 | sudo tee /dev/kmsg; " +
			"sudo ip link set eth0 down | sudo tee /dev/kmsg; " +
			"sleep 120; " +
			"echo Starting up eth0 | sudo tee /dev/kmsg; " +
			"sudo ip link set eth0 up | sudo tee /dev/kmsg; " +
			"sleep 10; " +
			"echo Retrying starting up eth0 | sudo tee /dev/kmsg; " +
			"sudo ip link set eth0 up | sudo tee /dev/kmsg; " +
			"echo Running dhclient | sudo tee /dev/kmsg; " +
			"sudo dhclient | sudo tee /dev/kmsg; " +
			"echo Starting systemd-networkd | sudo tee /dev/kmsg; " +
			"sudo systemctl restart systemd-networkd | sudo tee /dev/kmsg" +
			"' >/dev/null 2>&1 &"
		testReboot(ctx, f.ClientSet, cmd, nil)
	})

	ginkgo.It("each node by dropping all inbound packets for a while and ensure they function afterwards", func(ctx context.Context) {
		// tell the firewall to drop all inbound packets for a while
		// We sleep 10 seconds to give the ssh command time to finish cleanly before we start dropping inbound packets.
		// We still accept packets sent from localhost to prevent monit from restarting kubelet.
		tmpLogPath := "/tmp/drop-inbound.log"
		testReboot(ctx, f.ClientSet, dropPacketsScript("INPUT", tmpLogPath), catLogHook(ctx, tmpLogPath))
	})

	ginkgo.It("each node by dropping all outbound packets for a while and ensure they function afterwards", func(ctx context.Context) {
		// tell the firewall to drop all outbound packets for a while
		// We sleep 10 seconds to give the ssh command time to finish cleanly before we start dropping outbound packets.
		// We still accept packets sent to localhost to prevent monit from restarting kubelet.
		tmpLogPath := "/tmp/drop-outbound.log"
		testReboot(ctx, f.ClientSet, dropPacketsScript("OUTPUT", tmpLogPath), catLogHook(ctx, tmpLogPath))
	})
})

// testReboot runs rebootCmd on every ready, schedulable node in parallel and fails the
// test if any node does not pass rebootNode. If hook is non-nil, it is invoked against
// all nodes once the reboot checks have finished.
func testReboot(ctx context.Context, c clientset.Interface, rebootCmd string, hook terminationHook) {
	// Get all nodes, and kick off the test on each.
	nodelist, err := e2enode.GetReadySchedulableNodes(ctx, c)
	framework.ExpectNoError(err, "failed to list nodes")
	if hook != nil {
		defer func() {
			framework.Logf("Executing termination hook on nodes")
			hook(framework.TestContext.Provider, nodelist)
		}()
	}
	result := make([]bool, len(nodelist.Items))
	wg := sync.WaitGroup{}
	wg.Add(len(nodelist.Items))

	failed := false
	for ix := range nodelist.Items {
		go func(ix int) {
			defer ginkgo.GinkgoRecover()
			defer wg.Done()
			n := nodelist.Items[ix]
			result[ix] = rebootNode(ctx, c, framework.TestContext.Provider, n.ObjectMeta.Name, rebootCmd)
			if !result[ix] {
				failed = true
			}
		}(ix)
	}

	// Wait for all to finish and check the final result.
	wg.Wait()

	if failed {
		for ix := range nodelist.Items {
			n := nodelist.Items[ix]
			if !result[ix] {
				framework.Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
			}
		}
		framework.Failf("Test failed; at least one node failed to reboot in the time given.")
	}
}

// printStatusAndLogsForNotReadyPods logs the status and container logs of the named
// pods in namespace ns that are not running and ready.
func printStatusAndLogsForNotReadyPods(ctx context.Context, c clientset.Interface, ns string, podNames []string, pods []*v1.Pod) {
	printFn := func(id, log string, err error, previous bool) {
		prefix := "Retrieving log for container"
		if previous {
			prefix = "Retrieving log for the last terminated container"
		}
		if err != nil {
			framework.Logf("%s %s, err: %v:\n%s\n", prefix, id, err, log)
		} else {
			framework.Logf("%s %s:\n%s\n", prefix, id, log)
		}
	}
	podNameSet := sets.NewString(podNames...)
	for _, p := range pods {
		if p.Namespace != ns {
			continue
		}
		if !podNameSet.Has(p.Name) {
			continue
		}
		if ok, _ := testutils.PodRunningReady(p); ok {
			continue
		}
		framework.Logf("Status for not ready pod %s/%s: %+v", p.Namespace, p.Name, p.Status)
		// Print the log of the containers if pod is not running and ready.
		for _, container := range p.Status.ContainerStatuses {
			cIdentifier := fmt.Sprintf("%s/%s/%s", p.Namespace, p.Name, container.Name)
			log, err := e2epod.GetPodLogs(ctx, c, p.Namespace, p.Name, container.Name)
			printFn(cIdentifier, log, err, false)
			// Also get the log from the previous container instance if it has restarted.
			if container.RestartCount > 0 {
				prevLog, prevErr := e2epod.GetPreviousPodLogs(ctx, c, p.Namespace, p.Name, container.Name)
				printFn(cIdentifier, prevLog, prevErr, true)
			}
		}
	}
}

// rebootNode takes the node with the given name on the given provider through the
// following steps using client c:
//   - ensures the node is ready
//   - ensures all pods on the node are running and ready
//   - reboots the node (by executing rebootCmd over ssh)
//   - ensures the node reaches some non-ready state
//   - ensures the node becomes ready again
//   - ensures all pods on the node become running and ready again
//
// It returns true only if all of the steps pass; at the first failed step, it
// returns false and does not run the rest.
func rebootNode(ctx context.Context, c clientset.Interface, provider, name, rebootCmd string) bool {
	// Setup
	ns := metav1.NamespaceSystem
	ps, err := testutils.NewPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector("spec.nodeName", name))
	if err != nil {
		framework.Logf("Couldn't initialize pod store: %v", err)
		return false
	}
	defer ps.Stop()

	// Get the node initially.
	framework.Logf("Getting %s", name)
	node, err := c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		framework.Logf("Couldn't get node %s: %v", name, err)
		return false
	}

	// Node sanity check: ensure it is "ready".
	if !e2enode.WaitForNodeToBeReady(ctx, c, name, framework.NodeReadyInitialTimeout) {
		return false
	}

	// Get all the pods on the node that don't have a liveness probe set.
	// A liveness probe may cause a pod to be restarted during the node reboot, in which case it may not be running.
	pods := ps.List()
	podNames := []string{}
	for _, p := range pods {
		probe := false
		for _, c := range p.Spec.Containers {
			if c.LivenessProbe != nil {
				probe = true
				break
			}
		}
		if !probe {
			podNames = append(podNames, p.ObjectMeta.Name)
		}
	}
	framework.Logf("Node %s has %d assigned pods with no liveness probes: %v", name, len(podNames), podNames)

	// For each pod, we do a sanity check to ensure it's running / healthy
	// or succeeded now, as that's what we'll be checking later.
	if !e2epod.CheckPodsRunningReadyOrSucceeded(ctx, c, ns, podNames, framework.PodReadyBeforeTimeout) {
		printStatusAndLogsForNotReadyPods(ctx, c, ns, podNames, pods)
		return false
	}

	// Reboot the node.
	if err = e2essh.IssueSSHCommand(ctx, rebootCmd, provider, node); err != nil {
		framework.Logf("Error while issuing ssh command: %v", err)
		return false
	}

	// Wait for some kind of "not ready" status.
	if !e2enode.WaitForNodeToBeNotReady(ctx, c, name, rebootNodeNotReadyTimeout) {
		return false
	}

	// Wait for some kind of "ready" status.
	if !e2enode.WaitForNodeToBeReady(ctx, c, name, rebootNodeReadyAgainTimeout) {
		return false
	}

	// Ensure all of the pods that we found on this node before the reboot are
	// running / healthy, or succeeded.
	if !e2epod.CheckPodsRunningReadyOrSucceeded(ctx, c, ns, podNames, rebootPodReadyAgainTimeout) {
		newPods := ps.List()
		printStatusAndLogsForNotReadyPods(ctx, c, ns, podNames, newPods)
		return false
	}

	framework.Logf("Reboot successful on node %s", name)
	return true
}

// terminationHook is invoked against all nodes after the reboot checks have finished.
type terminationHook func(provider string, nodes *v1.NodeList)

// catLogHook returns a terminationHook that dumps and then removes logPath on every node via SSH.
func catLogHook(ctx context.Context, logPath string) terminationHook {
	return func(provider string, nodes *v1.NodeList) {
		for _, n := range nodes.Items {
			cmd := fmt.Sprintf("cat %v && rm %v", logPath, logPath)
			if _, err := e2essh.IssueSSHCommandWithResult(ctx, cmd, provider, &n); err != nil {
				framework.Logf("Error while issuing ssh command: %v", err)
			}
		}
	}
}

// dropPacketsScript returns a shell command that temporarily (for two minutes) inserts
// iptables DROP rules into the given chain while still accepting localhost traffic, and
// writes its output to logPath.
func dropPacketsScript(chainName, logPath string) string {
	return strings.ReplaceAll(fmt.Sprintf(`
		nohup sh -c '
			set -x
			sleep 10
			while true; do sudo iptables -I ${CHAIN} 1 -s 127.0.0.1 -j ACCEPT && break; done
			while true; do sudo iptables -I ${CHAIN} 2 -j DROP && break; done
			date
			sleep 120
			while true; do sudo iptables -D ${CHAIN} -j DROP && break; done
			while true; do sudo iptables -D ${CHAIN} -s 127.0.0.1 -j ACCEPT && break; done
		' >%v 2>&1 &
		`, logPath), "${CHAIN}", chainName)
}