github.com/cilium/cilium@v1.16.2/test/k8s/fqdn.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package k8sTest
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"net"
    10  
    11  	. "github.com/onsi/gomega"
    12  
    13  	. "github.com/cilium/cilium/test/ginkgo-ext"
    14  	"github.com/cilium/cilium/test/helpers"
    15  )
    16  
    17  // The 5.4 CI job is intended to catch BPF complexity regressions and as such
    18  // doesn't need to execute this test suite.
    19  var _ = SkipDescribeIf(helpers.RunsOn54Kernel, "K8sAgentFQDNTest", func() {
    20  	var (
    21  		kubectl *helpers.Kubectl
    22  
    23  		demoManifest   = ""
    24  		ciliumFilename string
    25  
    26  		apps    = []string{helpers.App2, helpers.App3}
    27  		appPods map[string]string
    28  
    29  		// The IPs are updated in BeforeAll
    30  		worldTarget          = "vagrant-cache.ci.cilium.io"
    31  		worldTargetIP        = "147.75.38.95"
    32  		worldInvalidTarget   = "cilium.io"
    33  		worldInvalidTargetIP = "104.198.14.52"
    34  	)
    35  
    36  	BeforeAll(func() {
    37  		// In case the IPs changed from above, update them here
    38  		var lookupErr error
    39  		err := helpers.WithTimeout(func() bool {
    40  			addrs, err2 := net.LookupHost(worldTarget)
    41  			if err2 != nil {
    42  				lookupErr = fmt.Errorf("error looking up target domain: %w", err2)
    43  				return false
    44  			}
    45  			worldTargetIP = addrs[0]
    46  			return true
    47  		}, fmt.Sprintf("Could not get %s IP", worldTarget), &helpers.TimeoutConfig{Timeout: helpers.HelperTimeout})
    48  		Expect(err).Should(BeNil(), "Error obtaining IP for test: %s", lookupErr)
    49  
    50  		lookupErr = nil
    51  		err = helpers.WithTimeout(func() bool {
    52  			addrs, err2 := net.LookupHost(worldInvalidTarget)
    53  			if err2 != nil {
    54  				lookupErr = fmt.Errorf("error looking up target domain: %w", err2)
    55  				return false
    56  			}
    57  			worldInvalidTargetIP = addrs[0]
    58  			return true
    59  		}, fmt.Sprintf("Could not get %s IP", worldInvalidTarget), &helpers.TimeoutConfig{Timeout: helpers.HelperTimeout})
    60  		Expect(err).Should(BeNil(), "Error obtaining IP for test: %s", lookupErr)
    61  
    62  		kubectl = helpers.CreateKubectl(helpers.K8s1VMName(), logger)
    63  		demoManifest = helpers.ManifestGet(kubectl.BasePath(), "demo.yaml")
    64  
    65  		ciliumFilename = helpers.TimestampFilename("cilium.yaml")
    66  		DeployCiliumAndDNS(kubectl, ciliumFilename)
    67  
    68  		By("Applying demo manifest")
    69  		res := kubectl.ApplyDefault(demoManifest)
    70  		res.ExpectSuccess("Demo config cannot be deployed")
    71  
    72  		err = kubectl.WaitforPods(helpers.DefaultNamespace, "-l zgroup=testapp", helpers.HelperTimeout)
    73  		Expect(err).Should(BeNil(), "Testapp is not ready after timeout")
    74  
    75  		appPods = helpers.GetAppPods(apps, helpers.DefaultNamespace, kubectl, "id")
    76  
    77  		// Validate that coredns is reachable from test pods
    78  		err = kubectl.NslookupInPod(helpers.DefaultNamespace, appPods[helpers.App2], "kube-dns.kube-system.svc.cluster.local")
    79  		Expect(err).Should(BeNil(), "Error reaching kube-dns before test: %s", err)
    80  	})
    81  
    82  	AfterFailed(func() {
    83  		kubectl.CiliumReport("cilium-dbg service list", "cilium-dbg endpoint list")
    84  	})
    85  
    86  	AfterAll(func() {
    87  		_ = kubectl.Delete(demoManifest)
    88  		ExpectAllPodsTerminated(kubectl)
    89  
    90  		UninstallCiliumFromManifest(kubectl, ciliumFilename)
    91  		kubectl.CloseSSHClient()
    92  	})
    93  
    94  	AfterEach(func() {
    95  		_ = kubectl.Exec(fmt.Sprintf("%s delete --all cnp", helpers.KubectlCmd))
    96  	})
    97  
    98  	It("Restart Cilium validate that FQDN is still working", func() {
    99  		// Test functionality:
   100  		// - When Cilium is running) Connectivity from App2 application can
   101  		// connect to DNS because dns-proxy filter the DNS request. If the
   102  		// connection is made correctly the IP is whitelisted by the FQDN rule
   103  		// until the DNS TTL expires.
   104  		// - When Cilium is not running) The DNS-proxy is not working, so the IP
   105  		// connectivity to an existing IP that was queried before will work,
   106  		// meanwhile connections using new DNS request will fail.
   107  		// - On restart) Cilium will restore the IPS that were white-listted in
   108  		// the FQDN and connection will work as normal.
   109  
   110  		ciliumPodK8s1, err := kubectl.GetCiliumPodOnNode(helpers.K8s1)
   111  		Expect(err).Should(BeNil(), "Cannot get cilium pod on k8s1")
   112  		monitorRes1, monitorCancel1 := kubectl.MonitorStart(ciliumPodK8s1)
   113  		ciliumPodK8s2, err := kubectl.GetCiliumPodOnNode(helpers.K8s2)
   114  		Expect(err).Should(BeNil(), "Cannot get cilium pod on k8s2")
   115  		monitorRes2, monitorCancel2 := kubectl.MonitorStart(ciliumPodK8s2)
   116  		defer func() {
   117  			monitorCancel1()
   118  			monitorCancel2()
   119  			helpers.WriteToReportFile(monitorRes1.CombineOutput().Bytes(), "fqdn-restart-cilium-monitor-k8s1.log")
   120  			helpers.WriteToReportFile(monitorRes2.CombineOutput().Bytes(), "fqdn-restart-cilium-monitor-k8s2.log")
   121  		}()
   122  
   123  		connectivityTest := func() {
   124  			By("Testing that connection from %q to %q should work",
   125  				appPods[helpers.App2], worldTarget)
   126  			res := kubectl.ExecPodCmd(
   127  				helpers.DefaultNamespace, appPods[helpers.App2],
   128  				helpers.CurlFail(worldTarget))
   129  			ExpectWithOffset(1, res).To(helpers.CMDSuccess(), "%q cannot curl to %q",
   130  				appPods[helpers.App2], worldTarget)
   131  
   132  			By("Testing that connection from %q to %q shouldn't work",
   133  				appPods[helpers.App2], worldInvalidTarget)
   134  			res = kubectl.ExecPodCmd(
   135  				helpers.DefaultNamespace, appPods[helpers.App2],
   136  				helpers.CurlFail(worldInvalidTarget))
   137  			ExpectWithOffset(1, res).ShouldNot(helpers.CMDSuccess(),
   138  				"%q can curl to %q when it should fail", appPods[helpers.App2], worldInvalidTarget)
   139  
   140  			By("Testing that connection from %q to %q works",
   141  				appPods[helpers.App2], worldTargetIP)
   142  			res = kubectl.ExecPodCmd(
   143  				helpers.DefaultNamespace, appPods[helpers.App2],
   144  				helpers.CurlFail(worldTargetIP))
   145  			res.ExpectSuccess("%q cannot curl to %q during restart", helpers.App2, worldTargetIP)
   146  
   147  			By("Testing that connection from %q to %q should not work",
   148  				appPods[helpers.App2], worldInvalidTargetIP)
   149  			res = kubectl.ExecPodCmd(
   150  				helpers.DefaultNamespace, appPods[helpers.App2],
   151  				helpers.CurlFail(worldInvalidTargetIP))
   152  			res.ExpectFail("%q can connect when it should not work", helpers.App2)
   153  		}
   154  
   155  		fqndProxyPolicy := helpers.ManifestGet(kubectl.BasePath(), "fqdn-proxy-policy.yaml")
   156  
   157  		_, err = kubectl.CiliumPolicyAction(
   158  			helpers.DefaultNamespace, fqndProxyPolicy,
   159  			helpers.KubectlApply, helpers.HelperTimeout)
   160  		Expect(err).To(BeNil(), "Cannot install fqdn proxy policy")
   161  
   162  		connectivityTest()
   163  
   164  		// Collect numeric identity of worldTargetIP and the worldTarget selector
   165  		ctx, cancel := context.WithCancel(context.Background())
   166  		defer cancel()
   167  		cmdTargetIdentity := fmt.Sprintf(`cilium-dbg ip list -o json | jq -r '.[] | select(.cidr == "%s/32") | .identity'`, worldTargetIP)
   168  		worldTargetIdentityBefore, err := kubectl.CiliumExecContext(ctx, ciliumPodK8s1, cmdTargetIdentity).IntOutput()
   169  		Expect(err).To(BeNil(), "Identity of IP %s for ToFQDN selector %s not found in IPCache", worldTargetIP, worldTarget)
   170  		Expect(worldTargetIdentityBefore).NotTo(BeZero())
   171  
   172  		cmdTargetSelector := fmt.Sprintf(`cilium-dbg policy selectors list -o json | jq '.[] | select(.selector | test("%s")) | .identities[] | .'`, worldTarget)
   173  		worldTargetSelectorBefore, err := kubectl.CiliumExecContext(ctx, ciliumPodK8s1, cmdTargetSelector).IntOutput()
   174  		Expect(err).To(BeNil(), "ToFQDN selector %s does not seem to select a numeric identity", worldTarget)
   175  		Expect(worldTargetSelectorBefore).NotTo(BeZero())
   176  
   177  		Expect(worldTargetIdentityBefore).To(Equal(worldTargetSelectorBefore), "Identity selected by ToFQDN selector %s does not match identity of IP %s", worldTarget, worldTargetIP)
   178  
   179  		By("restarting cilium pods")
   180  
   181  		// kill pid 1 in each cilium pod
   182  		cmd := fmt.Sprintf("%[1]s get pods -l k8s-app=cilium -n %[2]s |  tail -n +2 | cut -d ' ' -f 1 | xargs -I{} %[1]s exec -n %[2]s {} -- kill 1",
   183  			helpers.KubectlCmd, helpers.CiliumNamespace)
   184  		quit, run := kubectl.RepeatCommandInBackground(cmd)
   185  		channelClosed := false
   186  		defer func() {
   187  			if !channelClosed {
   188  				close(quit)
   189  			}
   190  		}()
   191  		<-run // waiting for first run to finish
   192  
   193  		By("Testing connectivity when cilium is restoring using IPS without DNS")
   194  		res := kubectl.ExecPodCmd(
   195  			helpers.DefaultNamespace, appPods[helpers.App2],
   196  			helpers.CurlFail(worldTargetIP))
   197  		res.ExpectSuccess("%q cannot curl to %q during restart", helpers.App2, worldTargetIP)
   198  
   199  		res = kubectl.ExecPodCmd(
   200  			helpers.DefaultNamespace, appPods[helpers.App2],
   201  			helpers.CurlFail(worldInvalidTargetIP))
   202  		res.ExpectFail("%q can connect when it should not work", helpers.App2)
   203  
   204  		// This test is failing consistently in quarantine, see #11213. Disable it for now
   205  		// to verify the rest of the test is running stable in quarantine. Once this is the
   206  		// case we could move the rest of the test out of quarantine and quarantine only
   207  		// this part.
   208  		if false {
   209  			// Re-run connectivity test while Cilium is still restarting. This should succeed as the same
   210  			// DNS names were used in a connectivity test before the restart.
   211  			connectivityTest()
   212  		}
   213  
   214  		channelClosed = true
   215  		close(quit)
   216  
   217  		ExpectAllPodsTerminated(kubectl)
   218  		ExpectCiliumReady(kubectl)
   219  
   220  		// Restart monitoring after Cilium restart
   221  		monitorRes1After, monitorCancel1After := kubectl.MonitorStart(ciliumPodK8s1)
   222  		monitorRes2After, monitorCancel2After := kubectl.MonitorStart(ciliumPodK8s2)
   223  		defer func() {
   224  			monitorCancel1After()
   225  			monitorCancel2After()
   226  			helpers.WriteToReportFile(monitorRes1After.CombineOutput().Bytes(), "fqdn-after-restart-cilium-monitor-k8s1.log")
   227  			helpers.WriteToReportFile(monitorRes2After.CombineOutput().Bytes(), "fqdn-after-restart-cilium-monitor-k8s2.log")
   228  		}()
   229  
   230  		// @TODO This endpoint ready call SHOULD NOT be here
   231  		// Here some packets can be lost due to two different scenarios:
   232  		//
   233  		// 1) On restore the endpoint/fqdn policies, the identity ID for the
   234  		// CIDRSet can be different, so if one endpoint start to regenerate and
   235  		// other still have the old identity things can mess around and some
   236  		// IPs are not white listed correctly. To prevent this, a restore for
   237  		// local-identities will be added in the future.
   238  		//
   239  		// 2) On restore, the Kubernetes watcher is sending the CNP back to
   240  		// Cilium, and before the endoint is restored the CNP can be applied
   241  		// without the ToCIDRSet, this means that there is no TOCIDR rule in
   242  		// the cilium policy and traffic will be drop.
   243  
   244  		// As mentioned above, these endpoints ready should not be there, the only
   245  		// reason to have this piece of code here is to reduce a flaky test.
   246  		err = kubectl.CiliumEndpointWaitReady()
   247  		Expect(err).To(BeNil(), "Endpoints are not ready after Cilium restarts")
   248  
   249  		// Collect numeric identity of worldTargetIP and the worldTarget selector after restart
   250  		worldTargetIdentityAfter, err := kubectl.CiliumExecContext(ctx, ciliumPodK8s1, cmdTargetIdentity).IntOutput()
   251  		Expect(err).To(BeNil(), "Identity of IP %s for ToFQDN selector %s not found in IPCache after restart", worldTargetIP, worldTarget)
   252  		Expect(worldTargetIdentityAfter).NotTo(BeZero())
   253  
   254  		worldTargetSelectorAfter, err := kubectl.CiliumExecContext(ctx, ciliumPodK8s1, cmdTargetSelector).IntOutput()
   255  		Expect(err).To(BeNil(), "ToFQDN selector %s does not seem to select a numeric identity after restart", worldTarget)
   256  		Expect(worldTargetSelectorAfter).NotTo(BeZero())
   257  
   258  		Expect(worldTargetIdentityAfter).To(Equal(worldTargetSelectorAfter), "Identity selected by ToFQDN selector %s does not match identity of IP %s after restart", worldTarget, worldTargetIP)
   259  		Expect(worldTargetIdentityAfter).To(Equal(worldTargetIdentityBefore), "Identity IP %s changed after restart", worldTargetIP)
   260  
   261  		By("Testing connectivity when cilium is *restored* using IPS without DNS")
   262  		res = kubectl.ExecPodCmd(
   263  			helpers.DefaultNamespace, appPods[helpers.App2],
   264  			helpers.CurlFail(worldTargetIP))
   265  		res.ExpectSuccess("%q cannot curl to %q after restart", helpers.App2, worldTargetIP)
   266  
   267  		res = kubectl.ExecPodCmd(
   268  			helpers.DefaultNamespace, appPods[helpers.App2],
   269  			helpers.CurlFail(worldInvalidTargetIP))
   270  		res.ExpectFail("%q can connect when it should not work", helpers.App2)
   271  
   272  		By("Testing connectivity using DNS request when cilium is restored correctly")
   273  		connectivityTest()
   274  	})
   275  
   276  	SkipItIf(helpers.RunsOnAKS, "Validate that multiple specs are working correctly", func() {
   277  		// To make sure that UUID in multiple specs are plumbed correctly to
   278  		// Cilium Policy
   279  		fqdnPolicy := helpers.ManifestGet(kubectl.BasePath(), "fqdn-proxy-multiple-specs.yaml")
   280  		world1Target := worldTarget
   281  		world2Target := worldInvalidTarget
   282  
   283  		_, err := kubectl.CiliumPolicyAction(
   284  			helpers.DefaultNamespace, fqdnPolicy,
   285  			helpers.KubectlApply, helpers.HelperTimeout)
   286  		Expect(err).To(BeNil(), "Cannot install fqdn proxy policy")
   287  
   288  		By("Validating APP2 policy connectivity")
   289  		res := kubectl.ExecPodCmd(
   290  			helpers.DefaultNamespace, appPods[helpers.App2],
   291  			helpers.CurlFail("--retry 5 "+world1Target))
   292  		res.ExpectSuccess("Can't connect to a valid target when it should work")
   293  
   294  		res = kubectl.ExecPodCmd(
   295  			helpers.DefaultNamespace, appPods[helpers.App2],
   296  			helpers.CurlFail(world2Target))
   297  		res.ExpectFail("Can connect to a valid target when it should NOT work")
   298  
   299  		By("Validating APP3 policy connectivity")
   300  
   301  		res = kubectl.ExecPodCmd(
   302  			helpers.DefaultNamespace, appPods[helpers.App3],
   303  			helpers.CurlWithRetries(world2Target, 5, true))
   304  		res.ExpectSuccess("Can't connect to a valid target when it should work")
   305  
   306  		res = kubectl.ExecPodCmd(
   307  			helpers.DefaultNamespace, appPods[helpers.App3],
   308  			helpers.CurlFail(world1Target))
   309  		res.ExpectFail("Can connect to a valid target when it should NOT work")
   310  	})
   311  
   312  	SkipItIf(helpers.RunsOnAKS, "Validate that FQDN policy continues to work after being updated", func() {
   313  		// To make sure that UUID in multiple specs are plumbed correctly to
   314  		// Cilium Policy
   315  		fqdnPolicy := helpers.ManifestGet(kubectl.BasePath(), "fqdn-proxy-multiple-specs.yaml")
   316  		world1Target := worldTarget
   317  		world2Target := worldInvalidTarget
   318  
   319  		_, err := kubectl.CiliumPolicyAction(
   320  			helpers.DefaultNamespace, fqdnPolicy,
   321  			helpers.KubectlApply, helpers.HelperTimeout)
   322  		Expect(err).To(BeNil(), "Cannot install fqdn proxy policy")
   323  
   324  		By("Validating APP2 policy connectivity")
   325  		res := kubectl.ExecPodCmd(
   326  			helpers.DefaultNamespace, appPods[helpers.App2],
   327  			helpers.CurlFail("--retry 5 "+world1Target))
   328  		res.ExpectSuccess("Can't connect to a valid target when it should work")
   329  
   330  		res = kubectl.ExecPodCmd(
   331  			helpers.DefaultNamespace, appPods[helpers.App2],
   332  			helpers.CurlFail(world2Target))
   333  		res.ExpectFail("Can connect to a valid target when it should NOT work")
   334  
   335  		By("Updating the policy to include an extra FQDN allow statement")
   336  		fqdnPolicy2 := helpers.ManifestGet(kubectl.BasePath(), "fqdn-proxy-multiple-specs-v2.yaml")
   337  		_, err = kubectl.CiliumPolicyAction(
   338  			helpers.DefaultNamespace, fqdnPolicy2,
   339  			helpers.KubectlApply, helpers.HelperTimeout)
   340  		Expect(err).To(BeNil(), "Cannot install fqdn proxy policy")
   341  
   342  		By("Validating APP2 policy connectivity after policy change")
   343  		res = kubectl.ExecPodCmd(
   344  			helpers.DefaultNamespace, appPods[helpers.App2],
   345  			helpers.CurlFail("--retry 5 "+world1Target))
   346  		res.ExpectSuccess("Can't connect to a valid target when it should work")
   347  
   348  		res = kubectl.ExecPodCmd(
   349  			helpers.DefaultNamespace, appPods[helpers.App2],
   350  			helpers.CurlFail(world2Target))
   351  		res.ExpectFail("Can connect to a valid target when it should NOT work")
   352  	})
   353  
   354  })