k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/network/loadbalancer.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package network

import (
	"context"
	"fmt"
	"io"
	"net"
	"net/http"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	utilnet "k8s.io/apimachinery/pkg/util/net"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	e2eapps "k8s.io/kubernetes/test/e2e/apps"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2edaemonset "k8s.io/kubernetes/test/e2e/framework/daemonset"
	e2edeployment "k8s.io/kubernetes/test/e2e/framework/deployment"
	e2enetwork "k8s.io/kubernetes/test/e2e/framework/network"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eoutput "k8s.io/kubernetes/test/e2e/framework/pod/output"
	e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
	e2eservice "k8s.io/kubernetes/test/e2e/framework/service"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	"k8s.io/kubernetes/test/e2e/network/common"
	admissionapi "k8s.io/pod-security-admission/api"
	netutils "k8s.io/utils/net"
	"k8s.io/utils/ptr"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

// getInternalIP returns the node's internal IP.
func getInternalIP(node *v1.Node) (string, error) {
	for _, address := range node.Status.Addresses {
		if address.Type == v1.NodeInternalIP && address.Address != "" {
			return address.Address, nil
		}
	}
	return "", fmt.Errorf("couldn't get the internal IP of host %s with addresses %v", node.Name, node.Status.Addresses)
}

// getSubnetPrefix returns a network prefix based on one of the workers'
// InternalIP, adding a /16 or /64 mask depending on the IP family of the node.
// IMPORTANT: This assumes a flat network assigned to the nodes, which is common
// on cloud providers.
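//
// Editor's illustrative example (not part of the upstream file): a worker whose
// InternalIP is 10.64.3.7 yields the prefix 10.64.0.0/16, while an IPv6
// InternalIP such as fd00:10:96::5 yields fd00:10:96::/64. A minimal usage
// sketch, assuming a ready worker node and the framework client used below:
//
//	prefix, err := getSubnetPrefix(ctx, cs)
//	if err == nil && prefix.Contains(netutils.ParseIPSloppy("10.64.200.1")) {
//		framework.Logf("client IP is inside the node subnet %s", prefix.String())
//	}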
func getSubnetPrefix(ctx context.Context, c clientset.Interface) (*net.IPNet, error) {
	node, err := getReadySchedulableWorkerNode(ctx, c)
	if err != nil {
		return nil, fmt.Errorf("error getting a ready schedulable worker Node, err: %w", err)
	}
	internalIP, err := getInternalIP(node)
	if err != nil {
		return nil, fmt.Errorf("error getting Node internal IP, err: %w", err)
	}
	ip := netutils.ParseIPSloppy(internalIP)
	if ip == nil {
		return nil, fmt.Errorf("invalid IP address format: %s", internalIP)
	}

	// if IPv6 return a net.IPNet with IP = ip and mask /64
	cidrMask := net.CIDRMask(64, 128)
	// if IPv4 return a net.IPNet with IP = ip and mask /16
	if netutils.IsIPv4(ip) {
		cidrMask = net.CIDRMask(16, 32)
	}
	return &net.IPNet{IP: ip.Mask(cidrMask), Mask: cidrMask}, nil
}

// getReadySchedulableWorkerNode gets a single worker node which is available for
// running pods on. If there are no such available nodes it will return an error.
func getReadySchedulableWorkerNode(ctx context.Context, c clientset.Interface) (*v1.Node, error) {
	nodes, err := e2enode.GetReadySchedulableNodes(ctx, c)
	if err != nil {
		return nil, err
	}
	for i := range nodes.Items {
		node := nodes.Items[i]
		_, isMaster := node.Labels["node-role.kubernetes.io/master"]
		_, isControlPlane := node.Labels["node-role.kubernetes.io/control-plane"]
		if !isMaster && !isControlPlane {
			return &node, nil
		}
	}
	return nil, fmt.Errorf("there are currently no ready, schedulable worker nodes in the cluster")
}

var _ = common.SIGDescribe("LoadBalancers", feature.LoadBalancer, func() {
	f := framework.NewDefaultFramework("loadbalancers")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	var cs clientset.Interface

	ginkgo.BeforeEach(func(ctx context.Context) {
		cs = f.ClientSet
	})

	ginkgo.AfterEach(func(ctx context.Context) {
		if ginkgo.CurrentSpecReport().Failed() {
			DescribeSvc(f.Namespace.Name)
		}
	})

	f.It("should be able to change the type and ports of a TCP service", f.WithSlow(), func(ctx context.Context) {
		// FIXME: need a better platform-independent timeout
		loadBalancerLagTimeout := e2eservice.LoadBalancerLagTimeoutAWS
		loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs)

		// This test is more monolithic than we'd like because LB turnup can be
		// very slow, so we lumped all the tests into one LB lifecycle.

		serviceName := "mutability-test"
		ns1 := f.Namespace.Name // LB1 in ns1 on TCP
		framework.Logf("namespace for TCP test: %s", ns1)

		ginkgo.By("creating a TCP service " + serviceName + " with type=ClusterIP in namespace " + ns1)
		tcpJig := e2eservice.NewTestJig(cs, ns1, serviceName)
		tcpService, err := tcpJig.CreateTCPService(ctx, nil)
		framework.ExpectNoError(err)

		svcPort := int(tcpService.Spec.Ports[0].Port)
		framework.Logf("service port TCP: %d", svcPort)

		ginkgo.By("creating a pod to be part of the TCP service " + serviceName)
		_, err = tcpJig.Run(ctx, nil)
		framework.ExpectNoError(err)

		execPod := e2epod.CreateExecPodOrFail(ctx, cs, ns1, "execpod", nil)
		err = tcpJig.CheckServiceReachability(ctx, tcpService, execPod)
		framework.ExpectNoError(err)

		// Change the services to NodePort.
162 163 ginkgo.By("changing the TCP service to type=NodePort") 164 tcpService, err = tcpJig.UpdateService(ctx, func(s *v1.Service) { 165 s.Spec.Type = v1.ServiceTypeNodePort 166 }) 167 framework.ExpectNoError(err) 168 tcpNodePort := int(tcpService.Spec.Ports[0].NodePort) 169 framework.Logf("TCP node port: %d", tcpNodePort) 170 171 err = tcpJig.CheckServiceReachability(ctx, tcpService, execPod) 172 framework.ExpectNoError(err) 173 174 // Change the services to LoadBalancer. 175 ginkgo.By("changing the TCP service to type=LoadBalancer") 176 _, err = tcpJig.UpdateService(ctx, func(s *v1.Service) { 177 s.Spec.Type = v1.ServiceTypeLoadBalancer 178 }) 179 framework.ExpectNoError(err) 180 181 ginkgo.By("waiting for the TCP service to have a load balancer") 182 // Wait for the load balancer to be created asynchronously 183 tcpService, err = tcpJig.WaitForLoadBalancer(ctx, loadBalancerCreateTimeout) 184 framework.ExpectNoError(err) 185 if int(tcpService.Spec.Ports[0].NodePort) != tcpNodePort { 186 framework.Failf("TCP Spec.Ports[0].NodePort changed (%d -> %d) when not expected", tcpNodePort, tcpService.Spec.Ports[0].NodePort) 187 } 188 tcpIngressIP := e2eservice.GetIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0]) 189 framework.Logf("TCP load balancer: %s", tcpIngressIP) 190 191 err = tcpJig.CheckServiceReachability(ctx, tcpService, execPod) 192 framework.ExpectNoError(err) 193 194 ginkgo.By("hitting the TCP service's LoadBalancer") 195 e2eservice.TestReachableHTTP(ctx, tcpIngressIP, svcPort, loadBalancerLagTimeout) 196 197 // Change the services' node ports. 198 199 ginkgo.By("changing the TCP service's NodePort") 200 tcpService, err = tcpJig.ChangeServiceNodePort(ctx, tcpNodePort) 201 framework.ExpectNoError(err) 202 tcpNodePortOld := tcpNodePort 203 tcpNodePort = int(tcpService.Spec.Ports[0].NodePort) 204 if tcpNodePort == tcpNodePortOld { 205 framework.Failf("TCP Spec.Ports[0].NodePort (%d) did not change", tcpNodePort) 206 } 207 if e2eservice.GetIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0]) != tcpIngressIP { 208 framework.Failf("TCP Status.LoadBalancer.Ingress changed (%s -> %s) when not expected", tcpIngressIP, e2eservice.GetIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0])) 209 } 210 framework.Logf("TCP node port: %d", tcpNodePort) 211 212 ginkgo.By("hitting the TCP service's LoadBalancer") 213 e2eservice.TestReachableHTTP(ctx, tcpIngressIP, svcPort, loadBalancerLagTimeout) 214 215 // Change the services' main ports. 
216 217 ginkgo.By("changing the TCP service's port") 218 tcpService, err = tcpJig.UpdateService(ctx, func(s *v1.Service) { 219 s.Spec.Ports[0].Port++ 220 }) 221 framework.ExpectNoError(err) 222 svcPortOld := svcPort 223 svcPort = int(tcpService.Spec.Ports[0].Port) 224 if svcPort == svcPortOld { 225 framework.Failf("TCP Spec.Ports[0].Port (%d) did not change", svcPort) 226 } 227 if int(tcpService.Spec.Ports[0].NodePort) != tcpNodePort { 228 framework.Failf("TCP Spec.Ports[0].NodePort (%d) changed", tcpService.Spec.Ports[0].NodePort) 229 } 230 if e2eservice.GetIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0]) != tcpIngressIP { 231 framework.Failf("TCP Status.LoadBalancer.Ingress changed (%s -> %s) when not expected", tcpIngressIP, e2eservice.GetIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0])) 232 } 233 234 framework.Logf("service port TCP: %d", svcPort) 235 236 ginkgo.By("hitting the TCP service's LoadBalancer") 237 e2eservice.TestReachableHTTP(ctx, tcpIngressIP, svcPort, loadBalancerLagTimeout) 238 239 ginkgo.By("Scaling the pods to 0") 240 err = tcpJig.Scale(ctx, 0) 241 framework.ExpectNoError(err) 242 243 ginkgo.By("hitting the TCP service's LoadBalancer with no backends, no answer expected") 244 testNotReachableHTTP(ctx, tcpIngressIP, svcPort, loadBalancerLagTimeout) 245 246 ginkgo.By("Scaling the pods to 1") 247 err = tcpJig.Scale(ctx, 1) 248 framework.ExpectNoError(err) 249 250 ginkgo.By("hitting the TCP service's LoadBalancer") 251 e2eservice.TestReachableHTTP(ctx, tcpIngressIP, svcPort, loadBalancerLagTimeout) 252 253 // Change the services back to ClusterIP. 254 255 ginkgo.By("changing TCP service back to type=ClusterIP") 256 tcpReadback, err := tcpJig.UpdateService(ctx, func(s *v1.Service) { 257 s.Spec.Type = v1.ServiceTypeClusterIP 258 }) 259 framework.ExpectNoError(err) 260 if tcpReadback.Spec.Ports[0].NodePort != 0 { 261 framework.Fail("TCP Spec.Ports[0].NodePort was not cleared") 262 } 263 // Wait for the load balancer to be destroyed asynchronously 264 _, err = tcpJig.WaitForLoadBalancerDestroy(ctx, tcpIngressIP, svcPort, loadBalancerCreateTimeout) 265 framework.ExpectNoError(err) 266 267 ginkgo.By("checking the TCP LoadBalancer is closed") 268 testNotReachableHTTP(ctx, tcpIngressIP, svcPort, loadBalancerLagTimeout) 269 }) 270 271 f.It("should be able to change the type and ports of a UDP service", f.WithSlow(), func(ctx context.Context) { 272 // FIXME: some cloud providers do not support UDP LoadBalancers 273 274 loadBalancerLagTimeout := e2eservice.LoadBalancerLagTimeoutDefault 275 loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs) 276 277 // This test is more monolithic than we'd like because LB turnup can be 278 // very slow, so we lumped all the tests into one LB lifecycle. 

		serviceName := "mutability-test"
		ns2 := f.Namespace.Name // LB2 in ns2 on UDP
		framework.Logf("namespace for UDP test: %s", ns2)

		ginkgo.By("creating a UDP service " + serviceName + " with type=ClusterIP in namespace " + ns2)
		udpJig := e2eservice.NewTestJig(cs, ns2, serviceName)
		udpService, err := udpJig.CreateUDPService(ctx, nil)
		framework.ExpectNoError(err)

		svcPort := int(udpService.Spec.Ports[0].Port)
		framework.Logf("service port UDP: %d", svcPort)

		ginkgo.By("creating a pod to be part of the UDP service " + serviceName)
		_, err = udpJig.Run(ctx, nil)
		framework.ExpectNoError(err)

		execPod := e2epod.CreateExecPodOrFail(ctx, cs, ns2, "execpod", nil)
		err = udpJig.CheckServiceReachability(ctx, udpService, execPod)
		framework.ExpectNoError(err)

		// Change the services to NodePort.

		ginkgo.By("changing the UDP service to type=NodePort")
		udpService, err = udpJig.UpdateService(ctx, func(s *v1.Service) {
			s.Spec.Type = v1.ServiceTypeNodePort
		})
		framework.ExpectNoError(err)
		udpNodePort := int(udpService.Spec.Ports[0].NodePort)
		framework.Logf("UDP node port: %d", udpNodePort)

		err = udpJig.CheckServiceReachability(ctx, udpService, execPod)
		framework.ExpectNoError(err)

		// Change the services to LoadBalancer.
		ginkgo.By("changing the UDP service to type=LoadBalancer")
		_, err = udpJig.UpdateService(ctx, func(s *v1.Service) {
			s.Spec.Type = v1.ServiceTypeLoadBalancer
		})
		framework.ExpectNoError(err)

		var udpIngressIP string
		ginkgo.By("waiting for the UDP service to have a load balancer")
		// Wait for the load balancer to be created asynchronously.
		udpService, err = udpJig.WaitForLoadBalancer(ctx, loadBalancerCreateTimeout)
		framework.ExpectNoError(err)
		if int(udpService.Spec.Ports[0].NodePort) != udpNodePort {
			framework.Failf("UDP Spec.Ports[0].NodePort changed (%d -> %d) when not expected", udpNodePort, udpService.Spec.Ports[0].NodePort)
		}
		udpIngressIP = e2eservice.GetIngressPoint(&udpService.Status.LoadBalancer.Ingress[0])
		framework.Logf("UDP load balancer: %s", udpIngressIP)

		err = udpJig.CheckServiceReachability(ctx, udpService, execPod)
		framework.ExpectNoError(err)

		ginkgo.By("hitting the UDP service's LoadBalancer")
		testReachableUDP(ctx, udpIngressIP, svcPort, loadBalancerLagTimeout)

		// Change the services' node ports.

		ginkgo.By("changing the UDP service's NodePort")
		udpService, err = udpJig.ChangeServiceNodePort(ctx, udpNodePort)
		framework.ExpectNoError(err)
		udpNodePortOld := udpNodePort
		udpNodePort = int(udpService.Spec.Ports[0].NodePort)
		if udpNodePort == udpNodePortOld {
			framework.Failf("UDP Spec.Ports[0].NodePort (%d) did not change", udpNodePort)
		}
		if e2eservice.GetIngressPoint(&udpService.Status.LoadBalancer.Ingress[0]) != udpIngressIP {
			framework.Failf("UDP Status.LoadBalancer.Ingress changed (%s -> %s) when not expected", udpIngressIP, e2eservice.GetIngressPoint(&udpService.Status.LoadBalancer.Ingress[0]))
		}
		framework.Logf("UDP node port: %d", udpNodePort)

		err = udpJig.CheckServiceReachability(ctx, udpService, execPod)
		framework.ExpectNoError(err)

		ginkgo.By("hitting the UDP service's LoadBalancer")
		testReachableUDP(ctx, udpIngressIP, svcPort, loadBalancerLagTimeout)

		// Change the services' main ports.
359 360 ginkgo.By("changing the UDP service's port") 361 udpService, err = udpJig.UpdateService(ctx, func(s *v1.Service) { 362 s.Spec.Ports[0].Port++ 363 }) 364 framework.ExpectNoError(err) 365 svcPortOld := svcPort 366 svcPort = int(udpService.Spec.Ports[0].Port) 367 if svcPort == svcPortOld { 368 framework.Failf("UDP Spec.Ports[0].Port (%d) did not change", svcPort) 369 } 370 if int(udpService.Spec.Ports[0].NodePort) != udpNodePort { 371 framework.Failf("UDP Spec.Ports[0].NodePort (%d) changed", udpService.Spec.Ports[0].NodePort) 372 } 373 if e2eservice.GetIngressPoint(&udpService.Status.LoadBalancer.Ingress[0]) != udpIngressIP { 374 framework.Failf("UDP Status.LoadBalancer.Ingress changed (%s -> %s) when not expected", udpIngressIP, e2eservice.GetIngressPoint(&udpService.Status.LoadBalancer.Ingress[0])) 375 } 376 377 framework.Logf("service port UDP: %d", svcPort) 378 379 ginkgo.By("hitting the UDP service's NodePort") 380 err = udpJig.CheckServiceReachability(ctx, udpService, execPod) 381 framework.ExpectNoError(err) 382 383 ginkgo.By("hitting the UDP service's LoadBalancer") 384 testReachableUDP(ctx, udpIngressIP, svcPort, loadBalancerCreateTimeout) 385 386 ginkgo.By("Scaling the pods to 0") 387 err = udpJig.Scale(ctx, 0) 388 framework.ExpectNoError(err) 389 390 ginkgo.By("checking that the UDP service's LoadBalancer is not reachable") 391 testNotReachableUDP(ctx, udpIngressIP, svcPort, loadBalancerCreateTimeout) 392 393 ginkgo.By("Scaling the pods to 1") 394 err = udpJig.Scale(ctx, 1) 395 framework.ExpectNoError(err) 396 397 ginkgo.By("hitting the UDP service's NodePort") 398 err = udpJig.CheckServiceReachability(ctx, udpService, execPod) 399 framework.ExpectNoError(err) 400 401 ginkgo.By("hitting the UDP service's LoadBalancer") 402 testReachableUDP(ctx, udpIngressIP, svcPort, loadBalancerCreateTimeout) 403 404 // Change the services back to ClusterIP. 
405 406 ginkgo.By("changing UDP service back to type=ClusterIP") 407 udpReadback, err := udpJig.UpdateService(ctx, func(s *v1.Service) { 408 s.Spec.Type = v1.ServiceTypeClusterIP 409 }) 410 framework.ExpectNoError(err) 411 if udpReadback.Spec.Ports[0].NodePort != 0 { 412 framework.Fail("UDP Spec.Ports[0].NodePort was not cleared") 413 } 414 // Wait for the load balancer to be destroyed asynchronously 415 _, err = udpJig.WaitForLoadBalancerDestroy(ctx, udpIngressIP, svcPort, loadBalancerCreateTimeout) 416 framework.ExpectNoError(err) 417 418 ginkgo.By("checking the UDP LoadBalancer is closed") 419 testNotReachableUDP(ctx, udpIngressIP, svcPort, loadBalancerLagTimeout) 420 }) 421 422 f.It("should only allow access from service loadbalancer source ranges", f.WithSlow(), func(ctx context.Context) { 423 loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs) 424 425 namespace := f.Namespace.Name 426 serviceName := "lb-sourcerange" 427 jig := e2eservice.NewTestJig(cs, namespace, serviceName) 428 429 ginkgo.By("Prepare allow source ips") 430 // prepare the exec pods 431 // acceptPod are allowed to access the loadbalancer 432 acceptPod := e2epod.CreateExecPodOrFail(ctx, cs, namespace, "execpod-accept", nil) 433 dropPod := e2epod.CreateExecPodOrFail(ctx, cs, namespace, "execpod-drop", nil) 434 435 ginkgo.By("creating a pod to be part of the service " + serviceName) 436 // This container is an nginx container listening on port 80 437 // See kubernetes/contrib/ingress/echoheaders/nginx.conf for content of response 438 _, err := jig.Run(ctx, nil) 439 framework.ExpectNoError(err) 440 // Make sure acceptPod is running. There are certain chances that pod might be terminated due to unexpected reasons. 441 acceptPod, err = cs.CoreV1().Pods(namespace).Get(ctx, acceptPod.Name, metav1.GetOptions{}) 442 framework.ExpectNoError(err, "Unable to get pod %s", acceptPod.Name) 443 gomega.Expect(acceptPod.Status.Phase).To(gomega.Equal(v1.PodRunning)) 444 gomega.Expect(acceptPod.Status.PodIP).ToNot(gomega.BeEmpty()) 445 446 // Create loadbalancer service with source range from node[0] and podAccept 447 svc, err := jig.CreateTCPService(ctx, func(svc *v1.Service) { 448 svc.Spec.Type = v1.ServiceTypeLoadBalancer 449 svc.Spec.LoadBalancerSourceRanges = []string{acceptPod.Status.PodIP + "/32"} 450 }) 451 framework.ExpectNoError(err) 452 453 ginkgo.DeferCleanup(func(ctx context.Context) { 454 ginkgo.By("Clean up loadbalancer service") 455 e2eservice.WaitForServiceDeletedWithFinalizer(ctx, cs, svc.Namespace, svc.Name) 456 }) 457 458 svc, err = jig.WaitForLoadBalancer(ctx, loadBalancerCreateTimeout) 459 framework.ExpectNoError(err) 460 461 ginkgo.By("check reachability from different sources") 462 svcIP := e2eservice.GetIngressPoint(&svc.Status.LoadBalancer.Ingress[0]) 463 // We should wait until service changes are actually propagated in the cloud-provider, 464 // as this may take significant amount of time, especially in large clusters. 465 // However, the information whether it was already programmed isn't achievable. 466 // So we're resolving it by using loadBalancerCreateTimeout that takes cluster size into account. 467 checkReachabilityFromPod(ctx, true, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP) 468 checkReachabilityFromPod(ctx, false, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP) 469 470 // Make sure dropPod is running. There are certain chances that the pod might be terminated due to unexpected reasons. 
		dropPod, err = cs.CoreV1().Pods(namespace).Get(ctx, dropPod.Name, metav1.GetOptions{})
		framework.ExpectNoError(err, "Unable to get pod %s", dropPod.Name)
		gomega.Expect(dropPod.Status.Phase).To(gomega.Equal(v1.PodRunning))
		gomega.Expect(dropPod.Status.PodIP).ToNot(gomega.BeEmpty())

		ginkgo.By("Update service LoadBalancerSourceRange and check reachability")
		_, err = jig.UpdateService(ctx, func(svc *v1.Service) {
			// only allow access from dropPod
			svc.Spec.LoadBalancerSourceRanges = []string{dropPod.Status.PodIP + "/32"}
		})
		framework.ExpectNoError(err)

		// We should wait until the service changes are actually propagated, as this may take
		// a significant amount of time, especially in large clusters.
		// However, there is no way to observe whether the change has already been programmed.
		// So we resolve it by using loadBalancerCreateTimeout, which takes cluster size into account.
		checkReachabilityFromPod(ctx, false, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
		checkReachabilityFromPod(ctx, true, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)

		ginkgo.By("Delete LoadBalancerSourceRange field and check reachability")
		_, err = jig.UpdateService(ctx, func(svc *v1.Service) {
			svc.Spec.LoadBalancerSourceRanges = nil
		})
		framework.ExpectNoError(err)
		// We should wait until the service changes are actually propagated, as this may take
		// a significant amount of time, especially in large clusters.
		// However, there is no way to observe whether the change has already been programmed.
		// So we resolve it by using loadBalancerCreateTimeout, which takes cluster size into account.
		checkReachabilityFromPod(ctx, true, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
		checkReachabilityFromPod(ctx, true, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)
	})

	// [LinuxOnly]: Windows does not support session affinity.
	f.It("should have session affinity work for LoadBalancer service with Local traffic policy", f.WithSlow(), "[LinuxOnly]", func(ctx context.Context) {
		// FIXME: some cloud providers do not support k8s-compatible affinity

		svc := getServeHostnameService("affinity-lb-esipp")
		svc.Spec.Type = v1.ServiceTypeLoadBalancer
		svc.Spec.ExternalTrafficPolicy = v1.ServiceExternalTrafficPolicyLocal
		execAffinityTestForLBService(ctx, f, cs, svc)
	})

	// [LinuxOnly]: Windows does not support session affinity.
	f.It("should be able to switch session affinity for LoadBalancer service with Local traffic policy", f.WithSlow(), "[LinuxOnly]", func(ctx context.Context) {
		// FIXME: some cloud providers do not support k8s-compatible affinity

		svc := getServeHostnameService("affinity-lb-esipp-transition")
		svc.Spec.Type = v1.ServiceTypeLoadBalancer
		svc.Spec.ExternalTrafficPolicy = v1.ServiceExternalTrafficPolicyLocal
		execAffinityTestForLBServiceWithTransition(ctx, f, cs, svc)
	})

	// [LinuxOnly]: Windows does not support session affinity.
524 f.It("should have session affinity work for LoadBalancer service with Cluster traffic policy", f.WithSlow(), "[LinuxOnly]", func(ctx context.Context) { 525 // FIXME: some cloud providers do not support k8s-compatible affinity 526 527 svc := getServeHostnameService("affinity-lb") 528 svc.Spec.Type = v1.ServiceTypeLoadBalancer 529 svc.Spec.ExternalTrafficPolicy = v1.ServiceExternalTrafficPolicyCluster 530 execAffinityTestForLBService(ctx, f, cs, svc) 531 }) 532 533 // [LinuxOnly]: Windows does not support session affinity. 534 f.It("should be able to switch session affinity for LoadBalancer service with Cluster traffic policy", f.WithSlow(), "[LinuxOnly]", func(ctx context.Context) { 535 // FIXME: some cloud providers do not support k8s-compatible affinity 536 537 svc := getServeHostnameService("affinity-lb-transition") 538 svc.Spec.Type = v1.ServiceTypeLoadBalancer 539 svc.Spec.ExternalTrafficPolicy = v1.ServiceExternalTrafficPolicyCluster 540 execAffinityTestForLBServiceWithTransition(ctx, f, cs, svc) 541 }) 542 543 // This test verifies if service load balancer cleanup finalizer is properly 544 // handled during service lifecycle. 545 // 1. Create service with type=LoadBalancer. Finalizer should be added. 546 // 2. Update service to type=ClusterIP. Finalizer should be removed. 547 // 3. Update service to type=LoadBalancer. Finalizer should be added. 548 // 4. Delete service with type=LoadBalancer. Finalizer should be removed. 549 f.It("should handle load balancer cleanup finalizer for service", f.WithSlow(), func(ctx context.Context) { 550 jig := e2eservice.NewTestJig(cs, f.Namespace.Name, "lb-finalizer") 551 552 ginkgo.By("Create load balancer service") 553 svc, err := jig.CreateTCPService(ctx, func(svc *v1.Service) { 554 svc.Spec.Type = v1.ServiceTypeLoadBalancer 555 }) 556 framework.ExpectNoError(err) 557 558 ginkgo.DeferCleanup(func(ctx context.Context) { 559 ginkgo.By("Check that service can be deleted with finalizer") 560 e2eservice.WaitForServiceDeletedWithFinalizer(ctx, cs, svc.Namespace, svc.Name) 561 }) 562 563 ginkgo.By("Wait for load balancer to serve traffic") 564 svc, err = jig.WaitForLoadBalancer(ctx, e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs)) 565 framework.ExpectNoError(err) 566 567 ginkgo.By("Check if finalizer presents on service with type=LoadBalancer") 568 e2eservice.WaitForServiceUpdatedWithFinalizer(ctx, cs, svc.Namespace, svc.Name, true) 569 570 ginkgo.By("Check if finalizer is removed on service after changed to type=ClusterIP") 571 err = jig.ChangeServiceType(ctx, v1.ServiceTypeClusterIP, e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs)) 572 framework.ExpectNoError(err) 573 e2eservice.WaitForServiceUpdatedWithFinalizer(ctx, cs, svc.Namespace, svc.Name, false) 574 575 ginkgo.By("Check if finalizer is added back to service after changed to type=LoadBalancer") 576 err = jig.ChangeServiceType(ctx, v1.ServiceTypeLoadBalancer, e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs)) 577 framework.ExpectNoError(err) 578 e2eservice.WaitForServiceUpdatedWithFinalizer(ctx, cs, svc.Namespace, svc.Name, true) 579 }) 580 581 f.It("should be able to create LoadBalancer Service without NodePort and change it", f.WithSlow(), func(ctx context.Context) { 582 // FIXME: need a better platform-independent timeout 583 loadBalancerLagTimeout := e2eservice.LoadBalancerLagTimeoutAWS 584 loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs) 585 586 // This test is more monolithic than we'd like because LB turnup can be 587 // 
very slow, so we lumped all the tests into one LB lifecycle. 588 589 serviceName := "reallocate-nodeport-test" 590 ns1 := f.Namespace.Name // LB1 in ns1 on TCP 591 framework.Logf("namespace for TCP test: %s", ns1) 592 593 ginkgo.By("creating a TCP service " + serviceName + " with type=ClusterIP in namespace " + ns1) 594 tcpJig := e2eservice.NewTestJig(cs, ns1, serviceName) 595 tcpService, err := tcpJig.CreateTCPService(ctx, nil) 596 framework.ExpectNoError(err) 597 598 svcPort := int(tcpService.Spec.Ports[0].Port) 599 framework.Logf("service port TCP: %d", svcPort) 600 601 ginkgo.By("creating a pod to be part of the TCP service " + serviceName) 602 _, err = tcpJig.Run(ctx, nil) 603 framework.ExpectNoError(err) 604 605 // Change the services to LoadBalancer. 606 ginkgo.By("changing the TCP service to type=LoadBalancer") 607 _, err = tcpJig.UpdateService(ctx, func(s *v1.Service) { 608 s.Spec.Type = v1.ServiceTypeLoadBalancer 609 s.Spec.AllocateLoadBalancerNodePorts = ptr.To(false) 610 }) 611 framework.ExpectNoError(err) 612 613 ginkgo.By("waiting for the TCP service to have a load balancer") 614 // Wait for the load balancer to be created asynchronously 615 tcpService, err = tcpJig.WaitForLoadBalancer(ctx, loadBalancerCreateTimeout) 616 framework.ExpectNoError(err) 617 if int(tcpService.Spec.Ports[0].NodePort) != 0 { 618 framework.Failf("TCP Spec.Ports[0].NodePort allocated %d when not expected", tcpService.Spec.Ports[0].NodePort) 619 } 620 tcpIngressIP := e2eservice.GetIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0]) 621 framework.Logf("TCP load balancer: %s", tcpIngressIP) 622 623 ginkgo.By("hitting the TCP service's LoadBalancer") 624 e2eservice.TestReachableHTTP(ctx, tcpIngressIP, svcPort, loadBalancerLagTimeout) 625 626 // Change the services' node ports. 
627 628 ginkgo.By("adding a TCP service's NodePort") 629 tcpService, err = tcpJig.UpdateService(ctx, func(s *v1.Service) { 630 s.Spec.AllocateLoadBalancerNodePorts = ptr.To(true) 631 }) 632 framework.ExpectNoError(err) 633 tcpNodePort := int(tcpService.Spec.Ports[0].NodePort) 634 if tcpNodePort == 0 { 635 framework.Failf("TCP Spec.Ports[0].NodePort (%d) not allocated", tcpNodePort) 636 } 637 if e2eservice.GetIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0]) != tcpIngressIP { 638 framework.Failf("TCP Status.LoadBalancer.Ingress changed (%s -> %s) when not expected", tcpIngressIP, e2eservice.GetIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0])) 639 } 640 framework.Logf("TCP node port: %d", tcpNodePort) 641 642 ginkgo.By("hitting the TCP service's LoadBalancer") 643 e2eservice.TestReachableHTTP(ctx, tcpIngressIP, svcPort, loadBalancerLagTimeout) 644 }) 645 646 ginkgo.It("should be able to preserve UDP traffic when server pod cycles for a LoadBalancer service on different nodes", func(ctx context.Context) { 647 // FIXME: some cloud providers do not support UDP LoadBalancers 648 649 ns := f.Namespace.Name 650 nodes, err := e2enode.GetBoundedReadySchedulableNodes(ctx, cs, 2) 651 framework.ExpectNoError(err) 652 if len(nodes.Items) < 2 { 653 e2eskipper.Skipf( 654 "Test requires >= 2 Ready nodes, but there are only %v nodes", 655 len(nodes.Items)) 656 } 657 658 loadBalancerLagTimeout := e2eservice.LoadBalancerLagTimeoutDefault 659 loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs) 660 661 // Create a LoadBalancer service 662 udpJig := e2eservice.NewTestJig(cs, ns, serviceName) 663 ginkgo.By("creating a UDP service " + serviceName + " with type=LoadBalancer in " + ns) 664 _, err = udpJig.CreateUDPService(ctx, func(svc *v1.Service) { 665 svc.Spec.Type = v1.ServiceTypeLoadBalancer 666 svc.Spec.Ports = []v1.ServicePort{ 667 {Port: 80, Name: "udp", Protocol: v1.ProtocolUDP, TargetPort: intstr.FromInt32(80)}, 668 } 669 }) 670 framework.ExpectNoError(err) 671 672 var udpIngressIP string 673 ginkgo.By("waiting for the UDP service to have a load balancer") 674 udpService, err := udpJig.WaitForLoadBalancer(ctx, loadBalancerCreateTimeout) 675 framework.ExpectNoError(err) 676 677 udpIngressIP = e2eservice.GetIngressPoint(&udpService.Status.LoadBalancer.Ingress[0]) 678 framework.Logf("UDP load balancer: %s", udpIngressIP) 679 680 // keep hitting the loadbalancer to check it fails over to the second pod 681 ginkgo.By("hitting the UDP service's LoadBalancer with same source port") 682 stopCh := make(chan struct{}) 683 defer close(stopCh) 684 var mu sync.Mutex 685 hostnames := sets.NewString() 686 go func() { 687 defer ginkgo.GinkgoRecover() 688 port := int(udpService.Spec.Ports[0].Port) 689 laddr, err := net.ResolveUDPAddr("udp", ":54321") 690 if err != nil { 691 framework.Failf("Failed to resolve local address: %v", err) 692 } 693 raddr := net.UDPAddr{IP: netutils.ParseIPSloppy(udpIngressIP), Port: port} 694 695 for { 696 select { 697 case <-stopCh: 698 if len(hostnames) != 2 { 699 framework.Failf("Failed to hit the 2 UDP LoadBalancer backends successfully, got %v", hostnames.List()) 700 } 701 return 702 default: 703 time.Sleep(1 * time.Second) 704 } 705 706 conn, err := net.DialUDP("udp", laddr, &raddr) 707 if err != nil { 708 framework.Logf("Failed to connect to: %s %d", udpIngressIP, port) 709 continue 710 } 711 _ = conn.SetDeadline(time.Now().Add(3 * time.Second)) 712 framework.Logf("Connected successfully to: %s", raddr.String()) 713 _, _ = 
conn.Write([]byte("hostname\n")) 714 buff := make([]byte, 1024) 715 n, _, err := conn.ReadFrom(buff) 716 if err == nil { 717 mu.Lock() 718 hostnames.Insert(string(buff[:n])) 719 mu.Unlock() 720 framework.Logf("Connected successfully to hostname: %s", string(buff[:n])) 721 } 722 _ = conn.Close() 723 } 724 }() 725 726 // Add a backend pod to the service in one node 727 ginkgo.By("creating a backend pod " + podBackend1 + " for the service " + serviceName) 728 serverPod1 := e2epod.NewAgnhostPod(ns, podBackend1, nil, nil, nil, "netexec", fmt.Sprintf("--udp-port=%d", 80)) 729 serverPod1.Labels = udpJig.Labels 730 serverPod1.Spec.Hostname = "hostname1" 731 nodeSelection := e2epod.NodeSelection{Name: nodes.Items[0].Name} 732 e2epod.SetNodeSelection(&serverPod1.Spec, nodeSelection) 733 e2epod.NewPodClient(f).CreateSync(ctx, serverPod1) 734 735 validateEndpointsPortsOrFail(ctx, cs, ns, serviceName, portsByPodName{podBackend1: {80}}) 736 737 // Note that the fact that Endpoints object already exists, does NOT mean 738 // that iptables (or whatever else is used) was already programmed. 739 // Additionally take into account that UDP conntract entries timeout is 740 // 30 seconds by default. 741 // Based on the above check if the pod receives the traffic. 742 ginkgo.By("checking client pod connected to the backend 1 on Node " + nodes.Items[0].Name) 743 if err := wait.PollUntilContextTimeout(ctx, 1*time.Second, loadBalancerLagTimeout, true, func(ctx context.Context) (bool, error) { 744 mu.Lock() 745 defer mu.Unlock() 746 return hostnames.Has(serverPod1.Spec.Hostname), nil 747 }); err != nil { 748 framework.Failf("Failed to connect to backend 1") 749 } 750 751 // Create a second pod 752 ginkgo.By("creating a second backend pod " + podBackend2 + " for the service " + serviceName) 753 serverPod2 := e2epod.NewAgnhostPod(ns, podBackend2, nil, nil, nil, "netexec", fmt.Sprintf("--udp-port=%d", 80)) 754 serverPod2.Labels = udpJig.Labels 755 serverPod2.Spec.Hostname = "hostname2" 756 nodeSelection = e2epod.NodeSelection{Name: nodes.Items[1].Name} 757 e2epod.SetNodeSelection(&serverPod2.Spec, nodeSelection) 758 e2epod.NewPodClient(f).CreateSync(ctx, serverPod2) 759 760 // and delete the first pod 761 framework.Logf("Cleaning up %s pod", podBackend1) 762 e2epod.NewPodClient(f).DeleteSync(ctx, podBackend1, metav1.DeleteOptions{}, e2epod.DefaultPodDeletionTimeout) 763 764 validateEndpointsPortsOrFail(ctx, cs, ns, serviceName, portsByPodName{podBackend2: {80}}) 765 766 // Check that the second pod keeps receiving traffic 767 // UDP conntrack entries timeout is 30 sec by default 768 ginkgo.By("checking client pod connected to the backend 2 on Node " + nodes.Items[1].Name) 769 if err := wait.PollUntilContextTimeout(ctx, 1*time.Second, loadBalancerLagTimeout, true, func(ctx context.Context) (bool, error) { 770 mu.Lock() 771 defer mu.Unlock() 772 return hostnames.Has(serverPod2.Spec.Hostname), nil 773 }); err != nil { 774 framework.Failf("Failed to connect to backend 2") 775 } 776 }) 777 778 ginkgo.It("should be able to preserve UDP traffic when server pod cycles for a LoadBalancer service on the same nodes", func(ctx context.Context) { 779 // FIXME: some cloud providers do not support UDP LoadBalancers 780 781 ns := f.Namespace.Name 782 nodes, err := e2enode.GetBoundedReadySchedulableNodes(ctx, cs, 1) 783 framework.ExpectNoError(err) 784 if len(nodes.Items) < 1 { 785 e2eskipper.Skipf( 786 "Test requires >= 1 Ready nodes, but there are only %d nodes", 787 len(nodes.Items)) 788 } 789 790 loadBalancerLagTimeout := 
e2eservice.LoadBalancerLagTimeoutDefault 791 loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs) 792 793 // Create a LoadBalancer service 794 udpJig := e2eservice.NewTestJig(cs, ns, serviceName) 795 ginkgo.By("creating a UDP service " + serviceName + " with type=LoadBalancer in " + ns) 796 _, err = udpJig.CreateUDPService(ctx, func(svc *v1.Service) { 797 svc.Spec.Type = v1.ServiceTypeLoadBalancer 798 svc.Spec.Ports = []v1.ServicePort{ 799 {Port: 80, Name: "udp", Protocol: v1.ProtocolUDP, TargetPort: intstr.FromInt32(80)}, 800 } 801 }) 802 framework.ExpectNoError(err) 803 804 var udpIngressIP string 805 ginkgo.By("waiting for the UDP service to have a load balancer") 806 udpService, err := udpJig.WaitForLoadBalancer(ctx, loadBalancerCreateTimeout) 807 framework.ExpectNoError(err) 808 809 udpIngressIP = e2eservice.GetIngressPoint(&udpService.Status.LoadBalancer.Ingress[0]) 810 framework.Logf("UDP load balancer: %s", udpIngressIP) 811 812 // keep hitting the loadbalancer to check it fails over to the second pod 813 ginkgo.By("hitting the UDP service's LoadBalancer with same source port") 814 stopCh := make(chan struct{}) 815 defer close(stopCh) 816 var mu sync.Mutex 817 hostnames := sets.NewString() 818 go func() { 819 defer ginkgo.GinkgoRecover() 820 port := int(udpService.Spec.Ports[0].Port) 821 laddr, err := net.ResolveUDPAddr("udp", ":54322") 822 if err != nil { 823 framework.Failf("Failed to resolve local address: %v", err) 824 } 825 raddr := net.UDPAddr{IP: netutils.ParseIPSloppy(udpIngressIP), Port: port} 826 827 for { 828 select { 829 case <-stopCh: 830 if len(hostnames) != 2 { 831 framework.Failf("Failed to hit the 2 UDP LoadBalancer backends successfully, got %v", hostnames.List()) 832 } 833 return 834 default: 835 time.Sleep(1 * time.Second) 836 } 837 838 conn, err := net.DialUDP("udp", laddr, &raddr) 839 if err != nil { 840 framework.Logf("Failed to connect to: %s %d", udpIngressIP, port) 841 continue 842 } 843 _ = conn.SetDeadline(time.Now().Add(3 * time.Second)) 844 framework.Logf("Connected successfully to: %s", raddr.String()) 845 _, _ = conn.Write([]byte("hostname\n")) 846 buff := make([]byte, 1024) 847 n, _, err := conn.ReadFrom(buff) 848 if err == nil { 849 mu.Lock() 850 hostnames.Insert(string(buff[:n])) 851 mu.Unlock() 852 framework.Logf("Connected successfully to hostname: %s", string(buff[:n])) 853 } 854 _ = conn.Close() 855 } 856 }() 857 858 // Add a backend pod to the service in one node 859 ginkgo.By("creating a backend pod " + podBackend1 + " for the service " + serviceName) 860 serverPod1 := e2epod.NewAgnhostPod(ns, podBackend1, nil, nil, nil, "netexec", fmt.Sprintf("--udp-port=%d", 80)) 861 serverPod1.Labels = udpJig.Labels 862 serverPod1.Spec.Hostname = "hostname1" 863 nodeSelection := e2epod.NodeSelection{Name: nodes.Items[0].Name} 864 e2epod.SetNodeSelection(&serverPod1.Spec, nodeSelection) 865 e2epod.NewPodClient(f).CreateSync(ctx, serverPod1) 866 867 validateEndpointsPortsOrFail(ctx, cs, ns, serviceName, portsByPodName{podBackend1: {80}}) 868 869 // Note that the fact that Endpoints object already exists, does NOT mean 870 // that iptables (or whatever else is used) was already programmed. 871 // Additionally take into account that UDP conntract entries timeout is 872 // 30 seconds by default. 873 // Based on the above check if the pod receives the traffic. 
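		// Editor's note (illustrative sketch, not part of the upstream test): the client
		// goroutine above pins its local UDP port, so every request reuses the same
		// 5-tuple and therefore the same conntrack entry; the test then verifies that
		// traffic re-steers to the surviving backend after the first pod is deleted.
		// The fixed-source-port dial it relies on looks roughly like:
		//
		//	laddr, _ := net.ResolveUDPAddr("udp", ":54322") // fixed client source port
		//	raddr := &net.UDPAddr{IP: netutils.ParseIPSloppy(udpIngressIP), Port: 80}
		//	if conn, err := net.DialUDP("udp", laddr, raddr); err == nil {
		//		_, _ = conn.Write([]byte("hostname\n")) // agnhost netexec replies with its hostname
		//		_ = conn.Close()
		//	}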
874 ginkgo.By("checking client pod connected to the backend 1 on Node " + nodes.Items[0].Name) 875 if err := wait.PollUntilContextTimeout(ctx, 1*time.Second, loadBalancerLagTimeout, true, func(ctx context.Context) (bool, error) { 876 mu.Lock() 877 defer mu.Unlock() 878 return hostnames.Has(serverPod1.Spec.Hostname), nil 879 }); err != nil { 880 framework.Failf("Failed to connect to backend 1") 881 } 882 883 // Create a second pod on the same node 884 ginkgo.By("creating a second backend pod " + podBackend2 + " for the service " + serviceName) 885 serverPod2 := e2epod.NewAgnhostPod(ns, podBackend2, nil, nil, nil, "netexec", fmt.Sprintf("--udp-port=%d", 80)) 886 serverPod2.Labels = udpJig.Labels 887 serverPod2.Spec.Hostname = "hostname2" 888 // use the same node as previous pod 889 e2epod.SetNodeSelection(&serverPod2.Spec, nodeSelection) 890 e2epod.NewPodClient(f).CreateSync(ctx, serverPod2) 891 892 // and delete the first pod 893 framework.Logf("Cleaning up %s pod", podBackend1) 894 e2epod.NewPodClient(f).DeleteSync(ctx, podBackend1, metav1.DeleteOptions{}, e2epod.DefaultPodDeletionTimeout) 895 896 validateEndpointsPortsOrFail(ctx, cs, ns, serviceName, portsByPodName{podBackend2: {80}}) 897 898 // Check that the second pod keeps receiving traffic 899 // UDP conntrack entries timeout is 30 sec by default 900 ginkgo.By("checking client pod connected to the backend 2 on Node " + nodes.Items[0].Name) 901 if err := wait.PollUntilContextTimeout(ctx, 1*time.Second, loadBalancerLagTimeout, true, func(ctx context.Context) (bool, error) { 902 mu.Lock() 903 defer mu.Unlock() 904 return hostnames.Has(serverPod2.Spec.Hostname), nil 905 }); err != nil { 906 framework.Failf("Failed to connect to backend 2") 907 } 908 }) 909 910 f.It("should not have connectivity disruption during rolling update with externalTrafficPolicy=Cluster", f.WithSlow(), func(ctx context.Context) { 911 // We start with a low but reasonable threshold to analyze the results. 912 // The goal is to achieve 99% minimum success rate. 913 // TODO: We should do incremental steps toward the goal. 914 minSuccessRate := 0.95 915 916 testRollingUpdateLBConnectivityDisruption(ctx, f, v1.ServiceExternalTrafficPolicyTypeCluster, minSuccessRate) 917 }) 918 919 f.It("should not have connectivity disruption during rolling update with externalTrafficPolicy=Local", f.WithSlow(), func(ctx context.Context) { 920 // We start with a low but reasonable threshold to analyze the results. 921 // The goal is to achieve 99% minimum success rate. 922 // TODO: We should do incremental steps toward the goal. 
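		// Editor's note (illustrative sketch, not part of the upstream file): the helper
		// invoked below keeps polling the load balancer from a goroutine during the
		// DaemonSet rolling update and tallies totalRequests, networkErrors and
		// httpErrors with atomic counters (see testRollingUpdateLBConnectivityDisruption
		// further down). Conceptually it then asserts something like:
		//
		//	success := float64(total-netErrs-httpErrs) / float64(total)
		//	if success < minSuccessRate { /* fail the spec */ }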
923 minSuccessRate := 0.95 924 925 testRollingUpdateLBConnectivityDisruption(ctx, f, v1.ServiceExternalTrafficPolicyTypeLocal, minSuccessRate) 926 }) 927 }) 928 929 var _ = common.SIGDescribe("LoadBalancers ExternalTrafficPolicy: Local", feature.LoadBalancer, framework.WithSlow(), func() { 930 f := framework.NewDefaultFramework("esipp") 931 f.NamespacePodSecurityLevel = admissionapi.LevelBaseline 932 var loadBalancerCreateTimeout time.Duration 933 934 var cs clientset.Interface 935 var subnetPrefix *net.IPNet 936 var err error 937 938 ginkgo.BeforeEach(func(ctx context.Context) { 939 cs = f.ClientSet 940 loadBalancerCreateTimeout = e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs) 941 subnetPrefix, err = getSubnetPrefix(ctx, cs) 942 framework.ExpectNoError(err) 943 }) 944 945 ginkgo.AfterEach(func(ctx context.Context) { 946 if ginkgo.CurrentSpecReport().Failed() { 947 DescribeSvc(f.Namespace.Name) 948 } 949 }) 950 951 ginkgo.It("should work for type=LoadBalancer", func(ctx context.Context) { 952 namespace := f.Namespace.Name 953 serviceName := "external-local-lb" 954 jig := e2eservice.NewTestJig(cs, namespace, serviceName) 955 956 svc, err := jig.CreateOnlyLocalLoadBalancerService(ctx, loadBalancerCreateTimeout, true, nil) 957 framework.ExpectNoError(err) 958 healthCheckNodePort := int(svc.Spec.HealthCheckNodePort) 959 if healthCheckNodePort == 0 { 960 framework.Failf("Service HealthCheck NodePort was not allocated") 961 } 962 ginkgo.DeferCleanup(func(ctx context.Context) { 963 err = jig.ChangeServiceType(ctx, v1.ServiceTypeClusterIP, loadBalancerCreateTimeout) 964 framework.ExpectNoError(err) 965 966 // Make sure we didn't leak the health check node port. 967 const threshold = 2 968 nodes, err := getEndpointNodesWithInternalIP(ctx, jig) 969 framework.ExpectNoError(err) 970 config := e2enetwork.NewNetworkingTestConfig(ctx, f) 971 for _, internalIP := range nodes { 972 err := testHTTPHealthCheckNodePortFromTestContainer(ctx, 973 config, 974 internalIP, 975 healthCheckNodePort, 976 e2eservice.KubeProxyLagTimeout, 977 false, 978 threshold) 979 framework.ExpectNoError(err) 980 } 981 err = cs.CoreV1().Services(svc.Namespace).Delete(ctx, svc.Name, metav1.DeleteOptions{}) 982 framework.ExpectNoError(err) 983 }) 984 985 // FIXME: figure out the actual expected semantics for 986 // "ExternalTrafficPolicy: Local" + "IPMode: Proxy". 
987 // https://issues.k8s.io/123714 988 ingress := &svc.Status.LoadBalancer.Ingress[0] 989 if ingress.IP == "" || (ingress.IPMode != nil && *ingress.IPMode == v1.LoadBalancerIPModeProxy) { 990 e2eskipper.Skipf("LoadBalancer uses 'Proxy' IPMode") 991 } 992 993 svcTCPPort := int(svc.Spec.Ports[0].Port) 994 ingressIP := e2eservice.GetIngressPoint(&svc.Status.LoadBalancer.Ingress[0]) 995 996 ginkgo.By("reading clientIP using the TCP service's service port via its external VIP") 997 clientIPPort, err := GetHTTPContent(ingressIP, svcTCPPort, e2eservice.KubeProxyLagTimeout, "/clientip") 998 framework.ExpectNoError(err) 999 framework.Logf("ClientIP detected by target pod using VIP:SvcPort is %s", clientIPPort) 1000 1001 ginkgo.By("checking if Source IP is preserved") 1002 // The clientIPPort returned from GetHTTPContent is in this format: x.x.x.x:port or [xx:xx:xx::x]:port 1003 host, _, err := net.SplitHostPort(clientIPPort) 1004 if err != nil { 1005 framework.Failf("SplitHostPort returned unexpected error: %q", clientIPPort) 1006 } 1007 ip := netutils.ParseIPSloppy(host) 1008 if ip == nil { 1009 framework.Failf("Invalid client IP address format: %q", host) 1010 } 1011 if subnetPrefix.Contains(ip) { 1012 framework.Failf("Source IP was NOT preserved") 1013 } 1014 }) 1015 1016 ginkgo.It("should work for type=NodePort", func(ctx context.Context) { 1017 namespace := f.Namespace.Name 1018 serviceName := "external-local-nodeport" 1019 jig := e2eservice.NewTestJig(cs, namespace, serviceName) 1020 1021 svc, err := jig.CreateOnlyLocalNodePortService(ctx, true) 1022 framework.ExpectNoError(err) 1023 ginkgo.DeferCleanup(func(ctx context.Context) { 1024 err := cs.CoreV1().Services(svc.Namespace).Delete(ctx, svc.Name, metav1.DeleteOptions{}) 1025 framework.ExpectNoError(err) 1026 }) 1027 1028 tcpNodePort := int(svc.Spec.Ports[0].NodePort) 1029 1030 endpointsNodeMap, err := getEndpointNodesWithInternalIP(ctx, jig) 1031 framework.ExpectNoError(err) 1032 1033 dialCmd := "clientip" 1034 config := e2enetwork.NewNetworkingTestConfig(ctx, f) 1035 1036 for nodeName, nodeIP := range endpointsNodeMap { 1037 ginkgo.By(fmt.Sprintf("reading clientIP using the TCP service's NodePort, on node %v: %v:%v/%v", nodeName, nodeIP, tcpNodePort, dialCmd)) 1038 clientIP, err := GetHTTPContentFromTestContainer(ctx, config, nodeIP, tcpNodePort, e2eservice.KubeProxyLagTimeout, dialCmd) 1039 framework.ExpectNoError(err) 1040 framework.Logf("ClientIP detected by target pod using NodePort is %s, the ip of test container is %s", clientIP, config.TestContainerPod.Status.PodIP) 1041 // the clientIP returned by agnhost contains port 1042 if !strings.HasPrefix(clientIP, config.TestContainerPod.Status.PodIP) { 1043 framework.Failf("Source IP was NOT preserved") 1044 } 1045 } 1046 }) 1047 1048 ginkgo.It("should only target nodes with endpoints", func(ctx context.Context) { 1049 namespace := f.Namespace.Name 1050 serviceName := "external-local-nodes" 1051 jig := e2eservice.NewTestJig(cs, namespace, serviceName) 1052 nodes, err := e2enode.GetBoundedReadySchedulableNodes(ctx, cs, e2eservice.MaxNodesForEndpointsTests) 1053 framework.ExpectNoError(err) 1054 1055 svc, err := jig.CreateOnlyLocalLoadBalancerService(ctx, loadBalancerCreateTimeout, false, 1056 func(svc *v1.Service) { 1057 // Change service port to avoid collision with opened hostPorts 1058 // in other tests that run in parallel. 
1059 if len(svc.Spec.Ports) != 0 { 1060 svc.Spec.Ports[0].TargetPort = intstr.FromInt32(svc.Spec.Ports[0].Port) 1061 svc.Spec.Ports[0].Port = 8081 1062 } 1063 1064 }) 1065 framework.ExpectNoError(err) 1066 ginkgo.DeferCleanup(func(ctx context.Context) { 1067 err = jig.ChangeServiceType(ctx, v1.ServiceTypeClusterIP, loadBalancerCreateTimeout) 1068 framework.ExpectNoError(err) 1069 err := cs.CoreV1().Services(svc.Namespace).Delete(ctx, svc.Name, metav1.DeleteOptions{}) 1070 framework.ExpectNoError(err) 1071 }) 1072 1073 healthCheckNodePort := int(svc.Spec.HealthCheckNodePort) 1074 if healthCheckNodePort == 0 { 1075 framework.Failf("Service HealthCheck NodePort was not allocated") 1076 } 1077 1078 ips := e2enode.CollectAddresses(nodes, v1.NodeInternalIP) 1079 1080 ingressIP := e2eservice.GetIngressPoint(&svc.Status.LoadBalancer.Ingress[0]) 1081 svcTCPPort := int(svc.Spec.Ports[0].Port) 1082 1083 const threshold = 2 1084 config := e2enetwork.NewNetworkingTestConfig(ctx, f) 1085 for i := 0; i < len(nodes.Items); i++ { 1086 endpointNodeName := nodes.Items[i].Name 1087 1088 ginkgo.By("creating a pod to be part of the service " + serviceName + " on node " + endpointNodeName) 1089 _, err = jig.Run(ctx, func(rc *v1.ReplicationController) { 1090 rc.Name = serviceName 1091 if endpointNodeName != "" { 1092 rc.Spec.Template.Spec.NodeName = endpointNodeName 1093 } 1094 }) 1095 framework.ExpectNoError(err) 1096 1097 ginkgo.By(fmt.Sprintf("waiting for service endpoint on node %v", endpointNodeName)) 1098 err = jig.WaitForEndpointOnNode(ctx, endpointNodeName) 1099 framework.ExpectNoError(err) 1100 1101 // HealthCheck should pass only on the node where num(endpoints) > 0 1102 // All other nodes should fail the healthcheck on the service healthCheckNodePort 1103 for n, internalIP := range ips { 1104 // Make sure the loadbalancer picked up the health check change. 1105 // Confirm traffic can reach backend through LB before checking healthcheck nodeport. 1106 e2eservice.TestReachableHTTP(ctx, ingressIP, svcTCPPort, e2eservice.KubeProxyLagTimeout) 1107 expectedSuccess := nodes.Items[n].Name == endpointNodeName 1108 port := strconv.Itoa(healthCheckNodePort) 1109 ipPort := net.JoinHostPort(internalIP, port) 1110 framework.Logf("Health checking %s, http://%s/healthz, expectedSuccess %v", nodes.Items[n].Name, ipPort, expectedSuccess) 1111 err := testHTTPHealthCheckNodePortFromTestContainer(ctx, 1112 config, 1113 internalIP, 1114 healthCheckNodePort, 1115 e2eservice.KubeProxyEndpointLagTimeout, 1116 expectedSuccess, 1117 threshold) 1118 framework.ExpectNoError(err) 1119 } 1120 framework.ExpectNoError(e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, namespace, serviceName)) 1121 } 1122 }) 1123 1124 ginkgo.It("should work from pods", func(ctx context.Context) { 1125 var err error 1126 namespace := f.Namespace.Name 1127 serviceName := "external-local-pods" 1128 jig := e2eservice.NewTestJig(cs, namespace, serviceName) 1129 1130 svc, err := jig.CreateOnlyLocalLoadBalancerService(ctx, loadBalancerCreateTimeout, true, nil) 1131 framework.ExpectNoError(err) 1132 ginkgo.DeferCleanup(func(ctx context.Context) { 1133 err = jig.ChangeServiceType(ctx, v1.ServiceTypeClusterIP, loadBalancerCreateTimeout) 1134 framework.ExpectNoError(err) 1135 err := cs.CoreV1().Services(svc.Namespace).Delete(ctx, svc.Name, metav1.DeleteOptions{}) 1136 framework.ExpectNoError(err) 1137 }) 1138 1139 // FIXME: figure out the actual expected semantics for 1140 // "ExternalTrafficPolicy: Local" + "IPMode: Proxy". 
1141 // https://issues.k8s.io/123714 1142 ingress := &svc.Status.LoadBalancer.Ingress[0] 1143 if ingress.IP == "" || (ingress.IPMode != nil && *ingress.IPMode == v1.LoadBalancerIPModeProxy) { 1144 e2eskipper.Skipf("LoadBalancer uses 'Proxy' IPMode") 1145 } 1146 1147 ingressIP := e2eservice.GetIngressPoint(&svc.Status.LoadBalancer.Ingress[0]) 1148 port := strconv.Itoa(int(svc.Spec.Ports[0].Port)) 1149 ipPort := net.JoinHostPort(ingressIP, port) 1150 path := fmt.Sprintf("%s/clientip", ipPort) 1151 1152 ginkgo.By("Creating pause pod deployment to make sure, pausePods are in desired state") 1153 deployment := createPausePodDeployment(ctx, cs, "pause-pod-deployment", namespace, 1) 1154 framework.ExpectNoError(e2edeployment.WaitForDeploymentComplete(cs, deployment), "Failed to complete pause pod deployment") 1155 1156 ginkgo.DeferCleanup(func(ctx context.Context) { 1157 framework.Logf("Deleting deployment") 1158 err = cs.AppsV1().Deployments(namespace).Delete(ctx, deployment.Name, metav1.DeleteOptions{}) 1159 framework.ExpectNoError(err, "Failed to delete deployment %s", deployment.Name) 1160 }) 1161 1162 deployment, err = cs.AppsV1().Deployments(namespace).Get(ctx, deployment.Name, metav1.GetOptions{}) 1163 framework.ExpectNoError(err, "Error in retrieving pause pod deployment") 1164 labelSelector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector) 1165 framework.ExpectNoError(err, "Error in setting LabelSelector as selector from deployment") 1166 1167 pausePods, err := cs.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector.String()}) 1168 framework.ExpectNoError(err, "Error in listing pods associated with pause pod deployments") 1169 1170 pausePod := pausePods.Items[0] 1171 framework.Logf("Waiting up to %v curl %v", e2eservice.KubeProxyLagTimeout, path) 1172 cmd := fmt.Sprintf(`curl -q -s --connect-timeout 30 %v`, path) 1173 1174 var srcIP string 1175 loadBalancerPropagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(ctx, cs) 1176 ginkgo.By(fmt.Sprintf("Hitting external lb %v from pod %v on node %v", ingressIP, pausePod.Name, pausePod.Spec.NodeName)) 1177 if pollErr := wait.PollUntilContextTimeout(ctx, framework.Poll, loadBalancerPropagationTimeout, true, func(ctx context.Context) (bool, error) { 1178 stdout, err := e2eoutput.RunHostCmd(pausePod.Namespace, pausePod.Name, cmd) 1179 if err != nil { 1180 framework.Logf("got err: %v, retry until timeout", err) 1181 return false, nil 1182 } 1183 srcIP = strings.TrimSpace(strings.Split(stdout, ":")[0]) 1184 return srcIP == pausePod.Status.PodIP, nil 1185 }); pollErr != nil { 1186 framework.Failf("Source IP not preserved from %v, expected '%v' got '%v'", pausePod.Name, pausePod.Status.PodIP, srcIP) 1187 } 1188 }) 1189 1190 ginkgo.It("should handle updates to ExternalTrafficPolicy field", func(ctx context.Context) { 1191 namespace := f.Namespace.Name 1192 serviceName := "external-local-update" 1193 jig := e2eservice.NewTestJig(cs, namespace, serviceName) 1194 1195 nodes, err := e2enode.GetBoundedReadySchedulableNodes(ctx, cs, e2eservice.MaxNodesForEndpointsTests) 1196 framework.ExpectNoError(err) 1197 if len(nodes.Items) < 2 { 1198 framework.Failf("Need at least 2 nodes to verify source ip from a node without endpoint") 1199 } 1200 1201 svc, err := jig.CreateOnlyLocalLoadBalancerService(ctx, loadBalancerCreateTimeout, true, nil) 1202 framework.ExpectNoError(err) 1203 ginkgo.DeferCleanup(func(ctx context.Context) { 1204 err = jig.ChangeServiceType(ctx, v1.ServiceTypeClusterIP, 
loadBalancerCreateTimeout) 1205 framework.ExpectNoError(err) 1206 err := cs.CoreV1().Services(svc.Namespace).Delete(ctx, svc.Name, metav1.DeleteOptions{}) 1207 framework.ExpectNoError(err) 1208 }) 1209 1210 // save the health check node port because it disappears when Local traffic policy is turned off. 1211 healthCheckNodePort := int(svc.Spec.HealthCheckNodePort) 1212 1213 ginkgo.By("changing ExternalTrafficPolicy to Cluster") 1214 svc, err = jig.UpdateService(ctx, func(svc *v1.Service) { 1215 svc.Spec.ExternalTrafficPolicy = v1.ServiceExternalTrafficPolicyCluster 1216 }) 1217 framework.ExpectNoError(err) 1218 if svc.Spec.HealthCheckNodePort > 0 { 1219 framework.Failf("Service HealthCheck NodePort still present") 1220 } 1221 1222 epNodes, err := jig.ListNodesWithEndpoint(ctx) 1223 framework.ExpectNoError(err) 1224 // map from name of nodes with endpoint to internal ip 1225 // it is assumed that there is only a single node with the endpoint 1226 endpointNodeMap := make(map[string]string) 1227 // map from name of nodes without endpoint to internal ip 1228 noEndpointNodeMap := make(map[string]string) 1229 for _, node := range epNodes { 1230 ips := e2enode.GetAddresses(&node, v1.NodeInternalIP) 1231 if len(ips) < 1 { 1232 framework.Failf("No internal ip found for node %s", node.Name) 1233 } 1234 endpointNodeMap[node.Name] = ips[0] 1235 } 1236 for _, n := range nodes.Items { 1237 ips := e2enode.GetAddresses(&n, v1.NodeInternalIP) 1238 if len(ips) < 1 { 1239 framework.Failf("No internal ip found for node %s", n.Name) 1240 } 1241 if _, ok := endpointNodeMap[n.Name]; !ok { 1242 noEndpointNodeMap[n.Name] = ips[0] 1243 } 1244 } 1245 gomega.Expect(endpointNodeMap).ToNot(gomega.BeEmpty()) 1246 gomega.Expect(noEndpointNodeMap).ToNot(gomega.BeEmpty()) 1247 1248 svcTCPPort := int(svc.Spec.Ports[0].Port) 1249 svcNodePort := int(svc.Spec.Ports[0].NodePort) 1250 ingressIP := e2eservice.GetIngressPoint(&svc.Status.LoadBalancer.Ingress[0]) 1251 path := "/clientip" 1252 dialCmd := "clientip" 1253 1254 config := e2enetwork.NewNetworkingTestConfig(ctx, f) 1255 1256 ginkgo.By(fmt.Sprintf("endpoints present on nodes %v, absent on nodes %v", endpointNodeMap, noEndpointNodeMap)) 1257 for nodeName, nodeIP := range noEndpointNodeMap { 1258 ginkgo.By(fmt.Sprintf("Checking %v (%v:%v/%v) proxies to endpoints on another node", nodeName, nodeIP[0], svcNodePort, dialCmd)) 1259 _, err := GetHTTPContentFromTestContainer(ctx, config, nodeIP, svcNodePort, e2eservice.KubeProxyLagTimeout, dialCmd) 1260 framework.ExpectNoError(err, "Could not reach HTTP service through %v:%v/%v after %v", nodeIP, svcNodePort, dialCmd, e2eservice.KubeProxyLagTimeout) 1261 } 1262 1263 for nodeName, nodeIP := range endpointNodeMap { 1264 ginkgo.By(fmt.Sprintf("checking kube-proxy health check fails on node with endpoint (%s), public IP %s", nodeName, nodeIP)) 1265 var body string 1266 pollFn := func(ctx context.Context) (bool, error) { 1267 // we expect connection failure here, but not other errors 1268 resp, err := config.GetResponseFromTestContainer(ctx, 1269 "http", 1270 "healthz", 1271 nodeIP, 1272 healthCheckNodePort) 1273 if err != nil { 1274 return false, nil 1275 } 1276 if len(resp.Errors) > 0 { 1277 return true, nil 1278 } 1279 if len(resp.Responses) > 0 { 1280 body = resp.Responses[0] 1281 } 1282 return false, nil 1283 } 1284 if pollErr := wait.PollUntilContextTimeout(ctx, framework.Poll, e2eservice.TestTimeout, true, pollFn); pollErr != nil { 1285 framework.Failf("Kube-proxy still exposing health check on node %v:%v, after traffic policy 
		// Poll till kube-proxy re-adds the MASQUERADE rule on the node.
		ginkgo.By(fmt.Sprintf("checking source ip is NOT preserved through loadbalancer %v", ingressIP))
		var clientIP string
		pollErr := wait.PollUntilContextTimeout(ctx, framework.Poll, 3*e2eservice.KubeProxyLagTimeout, true, func(ctx context.Context) (bool, error) {
			clientIPPort, err := GetHTTPContent(ingressIP, svcTCPPort, e2eservice.KubeProxyLagTimeout, path)
			if err != nil {
				return false, nil
			}
			// The clientIPPort returned from GetHTTPContent is in this format: x.x.x.x:port or [xx:xx:xx::x]:port
			host, _, err := net.SplitHostPort(clientIPPort)
			if err != nil {
				framework.Logf("SplitHostPort returned error %v for %q", err, clientIPPort)
				return false, nil
			}
			ip := netutils.ParseIPSloppy(host)
			if ip == nil {
				framework.Logf("Invalid client IP address format: %q", host)
				return false, nil
			}
			// Remember the last observed client IP so the failure message below is meaningful.
			clientIP = host
			if subnetPrefix.Contains(ip) {
				return true, nil
			}
			return false, nil
		})
		if pollErr != nil {
			framework.Failf("Source IP WAS preserved with Cluster traffic policy. Got %v, expected a cluster IP.", clientIP)
		}

		// TODO: We need to attempt to create another service with the previously
		// allocated healthcheck nodePort. If the health check nodePort has been
		// freed, the new service creation will succeed, upon which we cleanup.
		// If the health check nodePort has NOT been freed, the new service
		// creation will fail.

		ginkgo.By("setting ExternalTrafficPolicy back to Local")
		svc, err = jig.UpdateService(ctx, func(svc *v1.Service) {
			svc.Spec.ExternalTrafficPolicy = v1.ServiceExternalTrafficPolicyLocal
			// Request the same healthCheckNodePort as before, to test the user-requested allocation path.
			svc.Spec.HealthCheckNodePort = int32(healthCheckNodePort)
		})
		framework.ExpectNoError(err)
		loadBalancerPropagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(ctx, cs)
		pollErr = wait.PollUntilContextTimeout(ctx, framework.PollShortTimeout, loadBalancerPropagationTimeout, true, func(ctx context.Context) (bool, error) {
			clientIPPort, err := GetHTTPContent(ingressIP, svcTCPPort, e2eservice.KubeProxyLagTimeout, path)
			if err != nil {
				return false, nil
			}
			ginkgo.By(fmt.Sprintf("Endpoint %v:%v%v returned client ip %v", ingressIP, svcTCPPort, path, clientIPPort))
			// The clientIPPort returned from GetHTTPContent is in this format: x.x.x.x:port or [xx:xx:xx::x]:port
			host, _, err := net.SplitHostPort(clientIPPort)
			if err != nil {
				framework.Logf("SplitHostPort returned error %v for %q", err, clientIPPort)
				return false, nil
			}
			ip := netutils.ParseIPSloppy(host)
			if ip == nil {
				framework.Logf("Invalid client IP address format: %q", host)
				return false, nil
			}
			// Remember the last observed client IP so the failure message below is meaningful.
			clientIP = host
			if !subnetPrefix.Contains(ip) {
				return true, nil
			}
			return false, nil
		})
		if pollErr != nil {
			framework.Failf("Source IP (%v) is not the client IP after ExternalTrafficPolicy was set back to Local, expected a public IP.", clientIP)
		}
	})
})

// testRollingUpdateLBConnectivityDisruption verifies that connectivity through a
// LoadBalancer Service stays above minSuccessRate while the backing DaemonSet is
// rolled several times. It continuously issues HTTP requests against the load
// balancer from a background goroutine and checks the success rate after every
// rolling update.
func testRollingUpdateLBConnectivityDisruption(ctx context.Context, f *framework.Framework, externalTrafficPolicy v1.ServiceExternalTrafficPolicyType, minSuccessRate float64) {
	cs := f.ClientSet
	ns := f.Namespace.Name
	name := "test-lb-rolling-update"
	labels := map[string]string{"name": name}
	gracePeriod := int64(60)
	maxUnavailable := intstr.FromString("10%")
	ds := e2edaemonset.NewDaemonSet(name, e2eapps.AgnhostImage, labels, nil, nil,
		[]v1.ContainerPort{
			{ContainerPort: 80},
		},
		"netexec", "--http-port=80", fmt.Sprintf("--delay-shutdown=%d", gracePeriod),
	)
	ds.Spec.UpdateStrategy = appsv1.DaemonSetUpdateStrategy{
		Type: appsv1.RollingUpdateDaemonSetStrategyType,
		RollingUpdate: &appsv1.RollingUpdateDaemonSet{
			MaxUnavailable: &maxUnavailable,
		},
	}
	ds.Spec.Template.Labels = labels
	ds.Spec.Template.Spec.TerminationGracePeriodSeconds = ptr.To(gracePeriod)

	nodeNames := e2edaemonset.SchedulableNodes(ctx, cs, ds)
	e2eskipper.SkipUnlessAtLeast(len(nodeNames), 2, "load-balancer rolling update test requires at least 2 schedulable nodes for the DaemonSet")
	if len(nodeNames) > 25 {
		e2eskipper.Skipf("load-balancer rolling update test skipped for large environments with more than 25 nodes")
	}

	ginkgo.By(fmt.Sprintf("Creating DaemonSet %q", name))
	ds, err := cs.AppsV1().DaemonSets(ns).Create(context.TODO(), ds, metav1.CreateOptions{})
	framework.ExpectNoError(err)

	ginkgo.By("Checking that daemon pods launch on every schedulable node of the cluster")
	creationTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs)
	err = wait.PollUntilContextTimeout(ctx, framework.Poll, creationTimeout, true, e2edaemonset.CheckDaemonPodOnNodes(f, ds, nodeNames))
	framework.ExpectNoError(err, "error waiting for daemon pods to start")
	err = e2edaemonset.CheckDaemonStatus(ctx, f, name)
	framework.ExpectNoError(err)

	ginkgo.By(fmt.Sprintf("Creating a service %s with type=LoadBalancer externalTrafficPolicy=%s in namespace %s", name, externalTrafficPolicy, ns))
	jig := e2eservice.NewTestJig(cs, ns, name)
	jig.Labels = labels
	service, err := jig.CreateLoadBalancerService(ctx, creationTimeout, func(svc *v1.Service) {
		svc.Spec.ExternalTrafficPolicy = externalTrafficPolicy
	})
	framework.ExpectNoError(err)

	lbNameOrAddress := e2eservice.GetIngressPoint(&service.Status.LoadBalancer.Ingress[0])
	svcPort := int(service.Spec.Ports[0].Port)

	ginkgo.By("Hitting the DaemonSet's pods through the service's load balancer")
	// FIXME: need a better platform-independent timeout
	timeout := e2eservice.LoadBalancerLagTimeoutAWS
	e2eservice.TestReachableHTTP(ctx, lbNameOrAddress, svcPort, timeout)
	ginkgo.By("Starting a goroutine to continuously hit the DaemonSet's pods through the service's load balancer")
	var totalRequests uint64 = 0
	var networkErrors uint64 = 0
	var httpErrors uint64 = 0
	done := make(chan struct{})
	defer close(done)
	go func() {
		defer ginkgo.GinkgoRecover()

		wait.Until(func() {
			atomic.AddUint64(&totalRequests, 1)
			// Disable keep-alives so every request opens a fresh connection through
			// the load balancer instead of reusing a connection pinned to one backend.
			client := &http.Client{
				Transport: utilnet.SetTransportDefaults(&http.Transport{
					DisableKeepAlives: true,
				}),
				Timeout: 5 * time.Second,
			}
			ipPort := net.JoinHostPort(lbNameOrAddress, strconv.Itoa(svcPort))
			msg := "hello"
			url := fmt.Sprintf("http://%s/echo?msg=%s", ipPort, msg)
			resp, err := client.Get(url)
			if err != nil {
				framework.Logf("Got error testing for reachability of %s: %v", url, err)
				atomic.AddUint64(&networkErrors, 1)
				return
			}
			defer func() { _ = resp.Body.Close() }()
			if resp.StatusCode != http.StatusOK {
				framework.Logf("Got bad status code: %d", resp.StatusCode)
				atomic.AddUint64(&httpErrors, 1)
				return
			}
			body, err := io.ReadAll(resp.Body)
			if err != nil {
				framework.Logf("Got error reading HTTP body: %v", err)
				atomic.AddUint64(&httpErrors, 1)
				return
			}
			if string(body) != msg {
				framework.Logf("Response body %q does not match the expected %q", string(body), msg)
				atomic.AddUint64(&httpErrors, 1)
				return
			}
		}, time.Duration(0), done)
	}()
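	// The success rate is evaluated per rolling update rather than cumulatively:
	// each iteration below subtracts the counter values recorded at the end of the
	// previous iteration, so a single disruptive update cannot be averaged away by
	// earlier healthy ones. As an illustration, 1000 requests in one window with 3
	// network errors and 2 HTTP errors yield (1000-5)/1000 = 99.5% for that window.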
	ginkgo.By("Triggering DaemonSet rolling update several times")
	var previousTotalRequests uint64 = 0
	var previousNetworkErrors uint64 = 0
	var previousHTTPErrors uint64 = 0
	for i := 1; i <= 5; i++ {
		framework.Logf("Update daemon pods environment: [{\"name\":\"VERSION\",\"value\":\"%d\"}]", i)
		patch := fmt.Sprintf(`{"spec":{"template":{"spec":{"containers":[{"name":"%s","env":[{"name":"VERSION","value":"%d"}]}]}}}}`, ds.Spec.Template.Spec.Containers[0].Name, i)
		ds, err = cs.AppsV1().DaemonSets(ns).Patch(context.TODO(), name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{})
		framework.ExpectNoError(err)

		framework.Logf("Check that daemon pods are available on every node of the cluster with the updated environment.")
		err = wait.PollUntilContextTimeout(ctx, framework.Poll, creationTimeout, true, func(ctx context.Context) (bool, error) {
			podList, err := cs.CoreV1().Pods(ds.Namespace).List(ctx, metav1.ListOptions{})
			if err != nil {
				return false, err
			}
			pods := podList.Items

			readyPods := 0
			for _, pod := range pods {
				if !metav1.IsControlledBy(&pod, ds) {
					continue
				}
				if pod.DeletionTimestamp != nil {
					continue
				}
				podVersion := ""
				for _, env := range pod.Spec.Containers[0].Env {
					if env.Name == "VERSION" {
						podVersion = env.Value
						break
					}
				}
				if podVersion != fmt.Sprintf("%d", i) {
					continue
				}
				podReady := podutil.IsPodAvailable(&pod, ds.Spec.MinReadySeconds, metav1.Now())
				if !podReady {
					continue
				}
				readyPods += 1
			}
			framework.Logf("Number of running nodes: %d, number of updated ready pods: %d in daemonset %s", len(nodeNames), readyPods, ds.Name)
			return readyPods == len(nodeNames), nil
		})
		framework.ExpectNoError(err, "error waiting for daemon pods to be ready")

		// assert that the HTTP requests success rate is above the acceptable threshold after this rolling update
		currentTotalRequests := atomic.LoadUint64(&totalRequests)
		currentNetworkErrors := atomic.LoadUint64(&networkErrors)
		currentHTTPErrors := atomic.LoadUint64(&httpErrors)

		partialTotalRequests := currentTotalRequests - previousTotalRequests
		partialNetworkErrors := currentNetworkErrors - previousNetworkErrors
		partialHTTPErrors := currentHTTPErrors - previousHTTPErrors
		partialSuccessRate := (float64(partialTotalRequests) - float64(partialNetworkErrors+partialHTTPErrors)) / float64(partialTotalRequests)

		framework.Logf("Load Balancer total HTTP requests: %d", partialTotalRequests)
		framework.Logf("Network errors: %d", partialNetworkErrors)
		framework.Logf("HTTP errors: %d", partialHTTPErrors)
		framework.Logf("Success rate: %.2f%%", partialSuccessRate*100)
		if partialSuccessRate < minSuccessRate {
			framework.Failf("Encountered too many errors when doing HTTP requests to the load balancer address. Success rate is %.2f%%, and the minimum allowed threshold is %.2f%%.", partialSuccessRate*100, minSuccessRate*100)
		}

		previousTotalRequests = currentTotalRequests
		previousNetworkErrors = currentNetworkErrors
		previousHTTPErrors = currentHTTPErrors
	}

	// assert that the load balancer address is still reachable after the rolling updates are finished
	e2eservice.TestReachableHTTP(ctx, lbNameOrAddress, svcPort, timeout)
}
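// Illustrative usage (a sketch, not part of this file's specs): the helper above
// is meant to be driven from a Ginkgo spec that picks the traffic policy and the
// minimum acceptable success rate for the environment under test. The spec text
// and the 0.95 threshold below are assumptions for the example only.
//
//	f.It("should keep most requests succeeding during a DaemonSet rolling update", f.WithSlow(), func(ctx context.Context) {
//		testRollingUpdateLBConnectivityDisruption(ctx, f, v1.ServiceExternalTrafficPolicyLocal, 0.95)
//	})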