github.com/cilium/cilium@v1.16.2/pkg/datapath/iptables/iptables.go

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package iptables
     5  
     6  import (
     7  	"bufio"
     8  	"context"
     9  	"fmt"
    10  	"net"
    11  	"net/netip"
    12  	"os"
    13  	"regexp"
    14  	"strconv"
    15  	"strings"
    16  
    17  	"github.com/blang/semver/v4"
    18  	"github.com/cilium/hive/cell"
    19  	"github.com/cilium/hive/job"
    20  	"github.com/cilium/statedb"
    21  	"github.com/mattn/go-shellwords"
    22  	"github.com/sirupsen/logrus"
    23  	"github.com/vishvananda/netlink"
    24  
    25  	"github.com/cilium/cilium/daemon/cmd/cni"
    26  	"github.com/cilium/cilium/pkg/byteorder"
    27  	"github.com/cilium/cilium/pkg/cidr"
    28  	"github.com/cilium/cilium/pkg/command/exec"
    29  	"github.com/cilium/cilium/pkg/datapath/iptables/ipset"
    30  	"github.com/cilium/cilium/pkg/datapath/linux/linux_defaults"
    31  	"github.com/cilium/cilium/pkg/datapath/linux/modules"
    32  	"github.com/cilium/cilium/pkg/datapath/linux/route"
    33  	"github.com/cilium/cilium/pkg/datapath/linux/sysctl"
    34  	"github.com/cilium/cilium/pkg/datapath/tables"
    35  	"github.com/cilium/cilium/pkg/defaults"
    36  	"github.com/cilium/cilium/pkg/fqdn/proxy/ipfamily"
    37  	ipamOption "github.com/cilium/cilium/pkg/ipam/option"
    38  	lb "github.com/cilium/cilium/pkg/loadbalancer"
    39  	"github.com/cilium/cilium/pkg/lock"
    40  	"github.com/cilium/cilium/pkg/logging/logfields"
    41  	"github.com/cilium/cilium/pkg/node"
    42  	"github.com/cilium/cilium/pkg/option"
    43  	"github.com/cilium/cilium/pkg/time"
    44  	"github.com/cilium/cilium/pkg/versioncheck"
    45  )
    46  
    47  const (
    48  	oldCiliumPrefix       = "OLD_"
    49  	ciliumInputChain      = "CILIUM_INPUT"
    50  	ciliumOutputChain     = "CILIUM_OUTPUT"
    51  	ciliumOutputRawChain  = "CILIUM_OUTPUT_raw"
    52  	ciliumPostNatChain    = "CILIUM_POST_nat"
    53  	ciliumOutputNatChain  = "CILIUM_OUTPUT_nat"
    54  	ciliumPreNatChain     = "CILIUM_PRE_nat"
    55  	ciliumPostMangleChain = "CILIUM_POST_mangle"
    56  	ciliumPreMangleChain  = "CILIUM_PRE_mangle"
    57  	ciliumPreRawChain     = "CILIUM_PRE_raw"
    58  	ciliumForwardChain    = "CILIUM_FORWARD"
    59  	feederDescription     = "cilium-feeder:"
    60  	xfrmDescription       = "cilium-xfrm-notrack:"
    61  )
    62  
    63  // Minimum iptables versions supporting the -w and -w<seconds> flags
    64  var (
    65  	isWaitMinVersion        = versioncheck.MustCompile(">=1.4.20")
    66  	isWaitSecondsMinVersion = versioncheck.MustCompile(">=1.4.22")
    67  	noTrackPorts            = func(port uint16) []*lb.L4Addr {
    68  		return []*lb.L4Addr{
    69  			{
    70  				Protocol: lb.TCP,
    71  				Port:     port,
    72  			},
    73  			{
    74  				Protocol: lb.UDP,
    75  				Port:     port,
    76  			},
    77  		}
    78  	}
    79  )
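        // Illustrative note (not part of the original source): noTrackPorts(53) returns
        // []*lb.L4Addr{{Protocol: lb.TCP, Port: 53}, {Protocol: lb.UDP, Port: 53}},
        // i.e. both a TCP and a UDP entry for the same port.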
    80  
    81  const (
    82  	waitString = "-w"
    83  )
    84  
    85  type runnable interface {
    86  	runProgOutput(args []string) (string, error)
    87  	runProg(args []string) error
    88  }
    89  
    90  type iptablesInterface interface {
    91  	runnable
    92  
    93  	getProg() string
    94  	getIpset() string
    95  }
    96  
    97  type ipt struct {
    98  	prog     string
    99  	ipset    string
   100  	waitArgs []string
   101  }
   102  
   103  func (ipt *ipt) initArgs(ctx context.Context, waitSeconds int) {
   104  	v, err := ipt.getVersion(ctx)
   105  	if err == nil {
   106  		switch {
   107  		case isWaitSecondsMinVersion(v):
   108  			ipt.waitArgs = []string{waitString, fmt.Sprintf("%d", waitSeconds)}
   109  		case isWaitMinVersion(v):
   110  			ipt.waitArgs = []string{waitString}
   111  		}
   112  	}
   113  }
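        // Example (sketch, not part of the original source): with iptables >= 1.4.22 and a
        // 5s lock timeout, waitArgs becomes []string{"-w", "5"}, so later invocations render
        // roughly as `iptables -w 5 -t nat -S`. With a 1.4.20/1.4.21 binary only `-w` is
        // appended, and with anything older no wait flag is used at all.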
   114  
   115  // The package name is iptables, so we use ip4tables internally for the "iptables" binary
   116  var (
   117  	ip4tables = &ipt{prog: "iptables", ipset: ipset.CiliumNodeIPSetV4}
   118  	ip6tables = &ipt{prog: "ip6tables", ipset: ipset.CiliumNodeIPSetV6}
   119  )
   120  
   121  func (ipt *ipt) getProg() string {
   122  	return ipt.prog
   123  }
   124  
   125  func (ipt *ipt) getIpset() string {
   126  	return ipt.ipset
   127  }
   128  
   129  func (ipt *ipt) getVersion(ctx context.Context) (semver.Version, error) {
   130  	b, err := exec.CommandContext(ctx, ipt.prog, "--version").CombinedOutput(log, false)
   131  	if err != nil {
   132  		return semver.Version{}, err
   133  	}
   134  	v := regexp.MustCompile(`v([0-9]+(\.[0-9]+)+)`)
   135  	vString := v.FindStringSubmatch(string(b))
   136  	if vString == nil {
   137  		return semver.Version{}, fmt.Errorf("no iptables version found in string: %s", string(b))
   138  	}
   139  	return versioncheck.Version(vString[1])
   140  }
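        // Example (sketch, not part of the original source): `iptables --version` typically
        // prints something like "iptables v1.8.7 (nf_tables)"; the regexp above captures
        // "1.8.7", which is then parsed into a semver.Version.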
   141  
   142  func (ipt *ipt) runProgOutput(args []string) (string, error) {
   143  	fullCommand := fmt.Sprintf("%s %s", ipt.getProg(), strings.Join(args, " "))
   144  
   145  	log.Debugf("Running '%s' command", fullCommand)
   146  
   147  	// Add wait argument to deal with concurrent calls that would fail otherwise
   148  	iptArgs := make([]string, 0, len(ipt.waitArgs)+len(args))
   149  	iptArgs = append(iptArgs, ipt.waitArgs...)
   150  	iptArgs = append(iptArgs, args...)
   151  	out, err := exec.WithTimeout(defaults.ExecTimeout, ipt.prog, iptArgs...).Output(log, false)
   152  
   153  	if err != nil {
   154  		return "", fmt.Errorf("unable to run '%s' iptables command: %w", fullCommand, err)
   155  	}
   156  	return string(out), nil
   157  }
   158  
   159  func (ipt *ipt) runProg(args []string) error {
   160  	_, err := ipt.runProgOutput(args)
   161  	return err
   162  }
   163  
   164  func reverseRule(rule string) ([]string, error) {
   165  	if strings.HasPrefix(rule, "-A") {
   166  		// From: -A POSTROUTING -m comment [...]
   167  		// To:   -D POSTROUTING -m comment [...]
   168  		return shellwords.Parse(strings.Replace(rule, "-A", "-D", 1))
   169  	}
   170  
   171  	if strings.HasPrefix(rule, "-I") {
   172  		// From: -I POSTROUTING -m comment [...]
   173  		// To:   -D POSTROUTING -m comment [...]
   174  		return shellwords.Parse(strings.Replace(rule, "-I", "-D", 1))
   175  	}
   176  
   177  	return []string{}, nil
   178  }
   179  
   180  func ruleReferencesDisabledChain(disableIptablesFeederRules []string, rule string) (bool, string) {
   181  	for _, disabledChain := range disableIptablesFeederRules {
   182  		if strings.Contains(rule, " "+strings.ToUpper(disabledChain)+" ") {
   183  			return true, disabledChain
   184  		}
   185  	}
   186  
   187  	return false, ""
   188  }
   189  
   190  func isDisabledChain(disableIptablesFeederRules []string, chain string) bool {
   191  	for _, disabledChain := range disableIptablesFeederRules {
   192  		if strings.EqualFold(chain, disabledChain) {
   193  			return true
   194  		}
   195  	}
   196  
   197  	return false
   198  }
   199  
   200  func (m *Manager) removeCiliumRules(table string, prog runnable, match string) error {
   201  	rules, err := prog.runProgOutput([]string{"-t", table, "-S"})
   202  	if err != nil {
   203  		return err
   204  	}
   205  
   206  	scanner := bufio.NewScanner(strings.NewReader(rules))
   207  	for scanner.Scan() {
   208  		rule := scanner.Text()
   209  
   210  		// All rules installed by Cilium either belong to a chain whose name starts
   211  		// with CILIUM_ or jump to such a chain, for example:
   212  		// -A CILIUM_FORWARD -o cilium_host -m comment --comment "cilium: any->cluster on cilium_host forward accept" -j ACCEPT
   213  		// -A POSTROUTING -m comment --comment "cilium-feeder: CILIUM_POST" -j CILIUM_POST
   214  		if !strings.Contains(rule, match) {
   215  			continue
   216  		}
   217  
   218  		// Temporary fix while Iptables is upgraded to >= 1.8.5
   219  		// (See GH-20884).
   220  		//
   221  		// The version currently shipped with Cilium (1.8.4) does not
   222  		// support the deletion of NOTRACK rules, so we will just ignore
   223  		// them here and let the agent remove them when it deletes the
   224  		// entire chain.
   225  		if strings.Contains(rule, "-j NOTRACK") {
   226  			continue
   227  		}
   228  
   229  		// Do not remove feeder rules for chains that are set to be disabled,
   230  		// i.e. match the beginning of the rule (e.g. -A POSTROUTING) against the
   231  		// disabled chains.
   232  		if skip, disabledChain := ruleReferencesDisabledChain(m.cfg.DisableIptablesFeederRules, rule); skip {
   233  			log.WithField(logfields.Chain, disabledChain).Info("Skipping the removal of feeder chain")
   234  			continue
   235  		}
   236  
   237  		reversedRule, err := reverseRule(rule)
   238  		if err != nil {
   239  			log.WithError(err).WithField(logfields.Object, rule).Warnf("Unable to parse %s rule into slice. Leaving rule behind.", prog)
   240  			continue
   241  		}
   242  
   243  		if len(reversedRule) > 0 {
   244  			deleteRule := append([]string{"-t", table}, reversedRule...)
   245  			if err := prog.runProg(deleteRule); err != nil {
   246  				return err
   247  			}
   248  		}
   249  	}
   250  
   251  	return nil
   252  }
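        // Example (sketch, not part of the original source): the feeder rule shown above,
        // "-A POSTROUTING -m comment --comment \"cilium-feeder: CILIUM_POST\" -j CILIUM_POST",
        // is reversed to "-D POSTROUTING ..." and removed by running roughly
        //   iptables -t nat -D POSTROUTING -m comment --comment "cilium-feeder: CILIUM_POST" -j CILIUM_POST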
   253  
   254  // Manager manages the iptables-related configuration for Cilium.
   255  type Manager struct {
   256  	// This lock ensures there are no concurrent executions of the doInstallRules() and
   257  	// GetProxyPort() methods.
   258  	lock lock.Mutex
   259  
   260  	logger     logrus.FieldLogger
   261  	modulesMgr *modules.Manager
   262  	sysctl     sysctl.Sysctl
   263  
   264  	cfg       Config
   265  	sharedCfg SharedConfig
   266  
   267  	// anything that can trigger a reconciliation
   268  	reconcilerParams reconcilerParams
   269  
   270  	haveIp6tables        bool
   271  	haveSocketMatch      bool
   272  	haveBPFSocketAssign  bool
   273  	ipEarlyDemuxDisabled bool
   274  	cniConfigManager     cni.CNIConfigManager
   275  }
   276  
   277  type reconcilerParams struct {
   278  	localNodeStore *node.LocalNodeStore
   279  	db             *statedb.DB
   280  	devices        statedb.Table[*tables.Device]
   281  	proxies        chan reconciliationRequest[proxyInfo]
   282  	addNoTrackPod  chan reconciliationRequest[noTrackPodInfo]
   283  	delNoTrackPod  chan reconciliationRequest[noTrackPodInfo]
   284  }
   285  
   286  type params struct {
   287  	cell.In
   288  
   289  	Logger    logrus.FieldLogger
   290  	Lifecycle cell.Lifecycle
   291  
   292  	ModulesMgr       *modules.Manager
   293  	Sysctl           sysctl.Sysctl
   294  	CNIConfigManager cni.CNIConfigManager
   295  	LocalNodeStore   *node.LocalNodeStore
   296  
   297  	Cfg       Config
   298  	SharedCfg SharedConfig
   299  
   300  	JobGroup job.Group
   301  	DB       *statedb.DB
   302  	Devices  statedb.Table[*tables.Device]
   303  }
   304  
   305  func newIptablesManager(p params) *Manager {
   306  	iptMgr := &Manager{
   307  		logger:     p.Logger,
   308  		modulesMgr: p.ModulesMgr,
   309  		sysctl:     p.Sysctl,
   310  		cfg:        p.Cfg,
   311  		sharedCfg:  p.SharedCfg,
   312  		reconcilerParams: reconcilerParams{
   313  			localNodeStore: p.LocalNodeStore,
   314  			db:             p.DB,
   315  			devices:        p.Devices,
   316  			proxies:        make(chan reconciliationRequest[proxyInfo]),
   317  			addNoTrackPod:  make(chan reconciliationRequest[noTrackPodInfo]),
   318  			delNoTrackPod:  make(chan reconciliationRequest[noTrackPodInfo]),
   319  		},
   320  		haveIp6tables:    true,
   321  		cniConfigManager: p.CNIConfigManager,
   322  	}
   323  
   324  	argsInit := make(chan struct{})
   325  
   326  	// Initialize the iptables/ip6tables wait arguments before they are used in the reconciler or in the manager (e.g. GetProxyPorts)
   327  	p.Lifecycle.Append(cell.Hook{
   328  		OnStart: func(ctx cell.HookContext) error {
   329  			defer close(argsInit)
   330  			ip4tables.initArgs(ctx, int(p.Cfg.IPTablesLockTimeout/time.Second))
   331  			if p.SharedCfg.EnableIPv6 {
   332  				ip6tables.initArgs(ctx, int(p.Cfg.IPTablesLockTimeout/time.Second))
   333  			}
   334  			return nil
   335  		},
   336  	})
   337  
   338  	p.Lifecycle.Append(iptMgr)
   339  
   340  	p.JobGroup.Add(
   341  		job.OneShot("iptables-reconciliation-loop", func(ctx context.Context, health cell.Health) error {
   342  			// Each job runs in an independent goroutine, so we need to explicitly wait for
   343  			// the iptables wait arguments to be initialized before starting the reconciler.
   344  			<-argsInit
   345  			return reconciliationLoop(
   346  				ctx, p.Logger, health,
   347  				iptMgr.sharedCfg.InstallIptRules, &iptMgr.reconcilerParams,
   348  				iptMgr.doInstallRules,
   349  				iptMgr.doInstallProxyRules,
   350  				iptMgr.installNoTrackRules,
   351  				iptMgr.removeNoTrackRules,
   352  			)
   353  		}),
   354  	)
   355  
   356  	return iptMgr
   357  }
   358  
   359  // Start initializes the iptables manager and checks for iptables kernel modules availability.
   360  func (m *Manager) Start(ctx cell.HookContext) error {
   361  	if os.Getenv("CILIUM_PREPEND_IPTABLES_CHAIN") != "" {
   362  		m.logger.Warning("CILIUM_PREPEND_IPTABLES_CHAIN env var has been deprecated. Please use 'CILIUM_PREPEND_IPTABLES_CHAINS' " +
   363  			"env var or '--prepend-iptables-chains' command line flag instead")
   364  	}
   365  
   366  	if err := enableIPForwarding(m.sysctl, m.sharedCfg.EnableIPv6); err != nil {
   367  		m.logger.WithError(err).Warning("enabling IP forwarding via sysctl failed")
   368  	}
   369  
   370  	if m.sharedCfg.EnableIPSec && m.sharedCfg.EnableL7Proxy {
   371  		m.disableIPEarlyDemux()
   372  	}
   373  
   374  	if err := m.modulesMgr.FindOrLoadModules(
   375  		"ip_tables", "iptable_nat", "iptable_mangle", "iptable_raw", "iptable_filter",
   376  	); err != nil {
   377  		m.logger.WithError(err).Warning(
   378  			"iptables modules could not be initialized. It probably means that iptables is not available on this system")
   379  	}
   380  
   381  	if err := m.modulesMgr.FindOrLoadModules(
   382  		"ip6_tables", "ip6table_mangle", "ip6table_raw", "ip6table_filter",
   383  	); err != nil {
   384  		if m.sharedCfg.EnableIPv6 {
   385  			return fmt.Errorf(
   386  				"IPv6 is enabled and ip6tables modules initialization failed: %w "+
   387  					"(try disabling IPv6 in Cilium or loading ip6_tables, ip6table_mangle, ip6table_raw and ip6table_filter kernel modules)", err)
   388  		}
   389  		m.logger.WithError(err).Debug(
   390  			"ip6tables kernel modules could not be loaded, so IPv6 cannot be used")
   391  		m.haveIp6tables = false
   392  	} else {
   393  		ipv6Disabled, err := os.ReadFile("/sys/module/ipv6/parameters/disable")
   394  		if err != nil {
   395  			if m.sharedCfg.EnableIPv6 {
   396  				return fmt.Errorf(
   397  					"IPv6 is enabled but IPv6 kernel support probing failed with: %w", err)
   398  			}
   399  			m.logger.WithError(err).Warning(
   400  				"Unable to read /sys/module/ipv6/parameters/disable, disabling IPv6 iptables support")
   401  			m.haveIp6tables = false
   402  		} else if strings.TrimSuffix(string(ipv6Disabled), "\n") == "1" {
   403  			m.logger.Debug(
   404  				"Kernel does not support IPv6, disabling IPv6 iptables support")
   405  			m.haveIp6tables = false
   406  		}
   407  	}
   408  
   409  	if err := m.modulesMgr.FindOrLoadModules("xt_socket"); err != nil {
   410  		if !m.sharedCfg.TunnelingEnabled {
   411  			// xt_socket module is needed to circumvent an explicit drop in ip_forward()
   412  			// logic for packets for which a local socket is found by ip early
   413  			// demux. xt_socket performs a local socket match and sets an skb mark on
   414  			// match, which will divert the packet to the local stack using our policy
   415  			// routing rule, thus avoiding being processed by ip_forward() at all.
   416  			//
   417  			// If the xt_socket module does not exist, we can disable ip early demux to
   418  			// avoid the explicit drop in ip_forward(). This is not needed in tunneling
   419  			// modes, as then we'll set the skb mark in the bpf logic before the policy
   420  			// routing stage so that the packet is routed locally instead of being
   421  			// forwarded by ip_forward().
   422  			//
   423  			// We would not need xt_socket at all if the datapath universally set the
   424  			// "to proxy" skb mark bits before the packet hits the policy routing stage.
   425  			// Currently this is not the case for endpoint routing modes.
   426  			m.logger.WithError(err).Warning("xt_socket kernel module could not be loaded")
   427  
   428  			if m.sharedCfg.EnableXTSocketFallback {
   429  				m.disableIPEarlyDemux()
   430  			}
   431  		}
   432  	} else {
   433  		m.haveSocketMatch = true
   434  	}
   435  	m.haveBPFSocketAssign = m.sharedCfg.EnableBPFTProxy
   436  
   437  	return nil
   438  }
   439  
   440  func (m *Manager) Stop(ctx cell.HookContext) error {
   441  	close(m.reconcilerParams.proxies)
   442  	close(m.reconcilerParams.addNoTrackPod)
   443  	close(m.reconcilerParams.delNoTrackPod)
   444  	return nil
   445  }
   446  
   447  func (m *Manager) disableIPEarlyDemux() {
   448  	if m.ipEarlyDemuxDisabled {
   449  		return
   450  	}
   451  
   452  	disabled := m.sysctl.Disable([]string{"net", "ipv4", "ip_early_demux"}) == nil
   453  	if disabled {
   454  		m.ipEarlyDemuxDisabled = true
   455  		m.logger.Info("Disabled ip_early_demux to allow proxy redirection with original source/destination address without xt_socket support also in non-tunneled datapath modes.")
   456  	} else {
   457  		m.logger.Warning("Could not disable ip_early_demux, traffic redirected due to an HTTP policy or visibility may be dropped unexpectedly")
   458  	}
   459  }
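        // Note (sketch, not part of the original source): disabling ip_early_demux here is
        // roughly equivalent to running `sysctl -w net.ipv4.ip_early_demux=0` on the node.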
   460  
   461  // SupportsOriginalSourceAddr tells whether an L7 proxy can use the pod's original source address and port in
   462  // the upstream connection to allow the destination to properly derive the source security ID from
   463  // the source IP address.
   464  func (m *Manager) SupportsOriginalSourceAddr() bool {
   465  	// Using the original source address works if the xt_socket match is supported, or if
   466  	// ip early demux is disabled.
   467  	return m.haveSocketMatch || m.ipEarlyDemuxDisabled
   468  }
   469  
   470  // removeRules removes iptables rules installed by Cilium.
   471  func (m *Manager) removeRules(prefix string) error {
   472  	// Set of tables that have had iptables rules in any Cilium version
   473  	tables := []string{"nat", "mangle", "raw", "filter"}
   474  	for _, t := range tables {
   475  		if err := m.removeCiliumRules(t, ip4tables, prefix+"CILIUM_"); err != nil {
   476  			return err
   477  		}
   478  
   479  		if m.haveIp6tables {
   480  			if err := m.removeCiliumRules(t, ip6tables, prefix+"CILIUM_"); err != nil {
   481  				return err
   482  			}
   483  		}
   484  	}
   485  
   486  	for _, c := range ciliumChains {
   487  		c.name = prefix + c.name
   488  		if err := c.remove(true, m.haveIp6tables); err != nil {
   489  			return err
   490  		}
   491  	}
   492  
   493  	return nil
   494  }
   495  
   496  // renameChains renames iptables chains installed by Cilium.
   497  func (m *Manager) renameChains(prefix string) error {
   498  	for _, c := range ciliumChains {
   499  		if err := c.rename(true, m.haveIp6tables, prefix+c.name); err != nil {
   500  			return err
   501  		}
   502  	}
   503  
   504  	return nil
   505  }
   506  
   507  func (m *Manager) inboundProxyRedirectRule(cmd string) []string {
   508  	// Mark host proxy transparent connections to be routed to the local stack.
   509  	// This comes before the TPROXY rules in the chain, and setting the mark
   510  	// without the proxy port number will make the TPROXY rule not match,
   511  	// as we do not want to try to tproxy packets that are going to the stack
   512  	// already.
   513  	// This rule is needed for a couple of reasons:
   514  	// 1. to route return traffic to the proxy
   515  	// 2. to route original-direction traffic that would otherwise be intercepted
   516  	//    by ip_early_demux
   517  	// Explicitly support chaining Envoy listeners via the loopback device by
   518  	// excluding traffic for the loopback device.
   519  	toProxyMark := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy)
   520  	matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask)
   521  	matchProxyToWorld := fmt.Sprintf("%#08x/%#08x", linux_defaults.MarkProxyToWorld, linux_defaults.RouteMarkMask)
   522  	return []string{
   523  		"-t", "mangle",
   524  		cmd, ciliumPreMangleChain,
   525  		"-m", "socket", "--transparent",
   526  		"!", "-o", "lo",
   527  		"-m", "mark", "!", "--mark", matchFromIPSecEncrypt,
   528  		"-m", "mark", "!", "--mark", matchProxyToWorld,
   529  		"-m", "comment", "--comment", "cilium: any->pod redirect proxied traffic to host proxy",
   530  		"-j", "MARK",
   531  		"--set-mark", toProxyMark}
   532  }
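        // Example (sketch, not part of the original source; mark values shown as placeholders):
        // with cmd "-A" the returned arguments render roughly as
        //   iptables -t mangle -A CILIUM_PRE_mangle -m socket --transparent ! -o lo \
        //     -m mark ! --mark <encrypt>/<mask> -m mark ! --mark <proxy-to-world>/<mask> \
        //     -m comment --comment "cilium: any->pod redirect proxied traffic to host proxy" \
        //     -j MARK --set-mark <to-proxy-mark>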
   533  
   534  func (m *Manager) iptProxyRule(rules string, prog runnable, l4proto, ip string, proxyPort uint16, name string) error {
   535  	// Match
   536  	port := uint32(byteorder.HostToNetwork16(proxyPort)) << 16
   537  	markMatch := fmt.Sprintf("%#x", linux_defaults.MagicMarkIsToProxy|port)
   538  	// TPROXY params
   539  	tProxyMark := fmt.Sprintf("%#x", linux_defaults.MagicMarkIsToProxy)
   540  	tProxyPort := fmt.Sprintf("%d", proxyPort)
   541  
   542  	existingRuleRegex := regexp.MustCompile(fmt.Sprintf("-A CILIUM_PRE_mangle -p %s -m mark --mark %s.*--on-ip %s", l4proto, markMatch, ip))
   543  	if existingRuleRegex.MatchString(rules) {
   544  		return nil
   545  	}
   546  
   547  	rule := []string{
   548  		"-t", "mangle",
   549  		"-A", ciliumPreMangleChain,
   550  		"-p", l4proto,
   551  		"-m", "mark", "--mark", markMatch,
   552  		"-m", "comment", "--comment", "cilium: TPROXY to host " + name + " proxy",
   553  		"-j", "TPROXY",
   554  		"--tproxy-mark", tProxyMark,
   555  		"--on-ip", ip,
   556  		"--on-port", tProxyPort,
   557  	}
   558  	return prog.runProg(rule)
   559  }
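        // Example (sketch, not part of the original source; the proxy name "cilium-dns-egress"
        // and port 10080 are hypothetical): for an IPv4 proxy on 127.0.0.1:10080 the installed
        // rule looks roughly like
        //   iptables -t mangle -A CILIUM_PRE_mangle -p tcp -m mark --mark <mark> \
        //     -m comment --comment "cilium: TPROXY to host cilium-dns-egress proxy" \
        //     -j TPROXY --tproxy-mark <tproxy-mark> --on-ip 127.0.0.1 --on-port 10080
        // where <mark> carries the proxy port (in network byte order) in its upper 16 bits and
        // the to-proxy magic mark in its lower bits, as computed above.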
   560  
   561  func (m *Manager) installStaticProxyRules() error {
   562  	// match traffic to a proxy (the upper 16 bits hold the proxy port, which is masked out)
   563  	matchToProxy := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsToProxy, linux_defaults.MagicMarkHostMask)
   564  	// proxy return traffic has 0 ID in the mask
   565  	matchProxyReply := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxy, linux_defaults.MagicMarkProxyNoIDMask)
   566  	// proxy forward traffic
   567  	matchProxyForward := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkEgress, linux_defaults.MagicMarkHostMask)
   568  	// L7 proxy upstream return traffic has Endpoint ID in the mask
   569  	matchL7ProxyUpstream := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxyEPID, linux_defaults.MagicMarkProxyMask)
   570  	// match traffic from a proxy (either in forward or in return direction)
   571  	matchFromProxy := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxy, linux_defaults.MagicMarkProxyMask)
   572  
   573  	if m.sharedCfg.EnableIPv4 {
   574  		// No conntrack for traffic to proxy
   575  		if err := ip4tables.runProg([]string{
   576  			"-t", "raw",
   577  			"-A", ciliumPreRawChain,
   578  			"-m", "mark", "--mark", matchToProxy,
   579  			"-m", "comment", "--comment", "cilium: NOTRACK for proxy traffic",
   580  			"-j", "CT", "--notrack"}); err != nil {
   581  			return err
   582  		}
   583  
   584  		// Explicit ACCEPT for the proxy traffic. Needed when the INPUT defaults to DROP.
   585  		// Matching needs to be the same as for the NOTRACK rule above.
   586  		if err := ip4tables.runProg([]string{
   587  			"-t", "filter",
   588  			"-A", ciliumInputChain,
   589  			"-m", "mark", "--mark", matchToProxy,
   590  			"-m", "comment", "--comment", "cilium: ACCEPT for proxy traffic",
   591  			"-j", "ACCEPT"}); err != nil {
   592  			return err
   593  		}
   594  
   595  		// No conntrack for proxy return traffic that is heading to lxc+
   596  		if err := ip4tables.runProg([]string{
   597  			"-t", "raw",
   598  			"-A", ciliumOutputRawChain,
   599  			"-o", "lxc+",
   600  			"-m", "mark", "--mark", matchProxyReply,
   601  			"-m", "comment", "--comment", "cilium: NOTRACK for proxy return traffic",
   602  			"-j", "CT", "--notrack"}); err != nil {
   603  			return err
   604  		}
   605  
   606  		// No conntrack for proxy return traffic that is heading to cilium_host
   607  		if err := ip4tables.runProg([]string{
   608  			"-t", "raw",
   609  			"-A", ciliumOutputRawChain,
   610  			"-o", defaults.HostDevice,
   611  			"-m", "mark", "--mark", matchProxyReply,
   612  			"-m", "comment", "--comment", "cilium: NOTRACK for proxy return traffic",
   613  			"-j", "CT", "--notrack"}); err != nil {
   614  			return err
   615  		}
   616  
   617  		// No conntrack for proxy forward traffic that is heading to cilium_host
   618  		if option.Config.EnableIPSec {
   619  			if err := ip4tables.runProg([]string{
   620  				"-t", "raw",
   621  				"-A", ciliumOutputRawChain,
   622  				"-o", defaults.HostDevice,
   623  				"-m", "mark", "--mark", matchProxyForward,
   624  				"-m", "comment", "--comment", "cilium: NOTRACK for proxy forward traffic",
   625  				"-j", "CT", "--notrack"}); err != nil {
   626  				return err
   627  			}
   628  		}
   629  
   630  		// No conntrack for proxy upstream traffic that is heading to lxc+
   631  		if err := ip4tables.runProg([]string{
   632  			"-t", "raw",
   633  			"-A", ciliumOutputRawChain,
   634  			"-o", "lxc+",
   635  			"-m", "mark", "--mark", matchL7ProxyUpstream,
   636  			"-m", "comment", "--comment", "cilium: NOTRACK for L7 proxy upstream traffic",
   637  			"-j", "CT", "--notrack"}); err != nil {
   638  			return err
   639  		}
   640  
   641  		// No conntrack for proxy upstream traffic that is heading to cilium_host
   642  		if err := ip4tables.runProg([]string{
   643  			"-t", "raw",
   644  			"-A", ciliumOutputRawChain,
   645  			"-o", defaults.HostDevice,
   646  			"-m", "mark", "--mark", matchL7ProxyUpstream,
   647  			"-m", "comment", "--comment", "cilium: NOTRACK for L7 proxy upstream traffic",
   648  			"-j", "CT", "--notrack"}); err != nil {
   649  			return err
   650  		}
   651  
   652  		// Explicit ACCEPT for the proxy return traffic. Needed when the OUTPUT defaults to DROP.
   653  		// Matching needs to be the same as for the NOTRACK rule above.
   654  		if err := ip4tables.runProg([]string{
   655  			"-t", "filter",
   656  			"-A", ciliumOutputChain,
   657  			"-m", "mark", "--mark", matchFromProxy,
   658  			"-m", "comment", "--comment", "cilium: ACCEPT for proxy traffic",
   659  			"-j", "ACCEPT"}); err != nil {
   660  			return err
   661  		}
   662  
   663  		// Explicit ACCEPT for the l7 proxy upstream traffic. Needed when the OUTPUT defaults to DROP.
   664  		// TODO: See if this is really needed. We do not have an ACCEPT for normal proxy upstream traffic.
   665  		if err := ip4tables.runProg([]string{
   666  			"-t", "filter",
   667  			"-A", ciliumOutputChain,
   668  			"-m", "mark", "--mark", matchL7ProxyUpstream,
   669  			"-m", "comment", "--comment", "cilium: ACCEPT for l7 proxy upstream traffic",
   670  			"-j", "ACCEPT"}); err != nil {
   671  			return err
   672  		}
   673  
   674  		if m.haveSocketMatch {
   675  			// Direct inbound TPROXYed traffic towards the socket
   676  			if err := ip4tables.runProg(m.inboundProxyRedirectRule("-A")); err != nil {
   677  				return err
   678  			}
   679  		}
   680  	}
   681  
   682  	if m.sharedCfg.EnableIPv6 {
   683  		// No conntrack for traffic to ingress proxy
   684  		if err := ip6tables.runProg([]string{
   685  			"-t", "raw",
   686  			"-A", ciliumPreRawChain,
   687  			"-m", "mark", "--mark", matchToProxy,
   688  			"-m", "comment", "--comment", "cilium: NOTRACK for proxy traffic",
   689  			"-j", "CT", "--notrack"}); err != nil {
   690  			return err
   691  		}
   692  
   693  		// Explicit ACCEPT for the proxy traffic. Needed when the INPUT defaults to DROP.
   694  		// Matching needs to be the same as for the NOTRACK rule above.
   695  		if err := ip6tables.runProg([]string{
   696  			"-t", "filter",
   697  			"-A", ciliumInputChain,
   698  			"-m", "mark", "--mark", matchToProxy,
   699  			"-m", "comment", "--comment", "cilium: ACCEPT for proxy traffic",
   700  			"-j", "ACCEPT"}); err != nil {
   701  			return err
   702  		}
   703  
   704  		// No conntrack for proxy return traffic that is heading to cilium_host
   705  		if err := ip6tables.runProg([]string{
   706  			"-t", "raw",
   707  			"-A", ciliumOutputRawChain,
   708  			"-o", defaults.HostDevice,
   709  			"-m", "mark", "--mark", matchProxyReply,
   710  			"-m", "comment", "--comment", "cilium: NOTRACK for proxy return traffic",
   711  			"-j", "CT", "--notrack"}); err != nil {
   712  			return err
   713  		}
   714  
   715  		// No conntrack for proxy upstream traffic that is heading to lxc+
   716  		if err := ip6tables.runProg([]string{
   717  			"-t", "raw",
   718  			"-A", ciliumOutputRawChain,
   719  			"-o", "lxc+",
   720  			"-m", "mark", "--mark", matchProxyReply,
   721  			"-m", "comment", "--comment", "cilium: NOTRACK for proxy return traffic",
   722  			"-j", "CT", "--notrack"}); err != nil {
   723  			return err
   724  		}
   725  
   726  		// Explicit ACCEPT for the proxy return traffic. Needed when the OUTPUT defaults to DROP.
   727  		// Matching needs to be the same as for the NOTRACK rule above.
   728  		if err := ip6tables.runProg([]string{
   729  			"-t", "filter",
   730  			"-A", ciliumOutputChain,
   731  			"-m", "mark", "--mark", matchFromProxy,
   732  			"-m", "comment", "--comment", "cilium: ACCEPT for proxy traffic",
   733  			"-j", "ACCEPT"}); err != nil {
   734  			return err
   735  		}
   736  
   737  		if m.haveSocketMatch {
   738  			// Direct inbound TPROXYed traffic towards the socket
   739  			if err := ip6tables.runProg(m.inboundProxyRedirectRule("-A")); err != nil {
   740  				return err
   741  			}
   742  		}
   743  	}
   744  
   745  	return nil
   746  }
   747  
   748  func (m *Manager) doCopyProxyRules(prog iptablesInterface, table string, re *regexp.Regexp, match, oldChain, newChain string) error {
   749  	rules, err := prog.runProgOutput([]string{"-t", table, "-S"})
   750  	if err != nil {
   751  		return err
   752  	}
   753  
   754  	scanner := bufio.NewScanner(strings.NewReader(rules))
   755  	for scanner.Scan() {
   756  		rule := scanner.Text()
   757  		if !re.MatchString(rule) || !strings.Contains(rule, match) {
   758  			continue
   759  		}
   760  
   761  		args, err := shellwords.Parse(strings.Replace(rule, oldChain, newChain, 1))
   762  		if err != nil {
   763  			log.WithFields(logrus.Fields{
   764  				"table":          table,
   765  				"prog":           prog.getProg(),
   766  				logfields.Object: rule,
   767  			}).WithError(err).Warn("Unable to parse TPROXY rule, disruption to traffic selected by L7 policy possible")
   768  			continue
   769  		}
   770  
   771  		copyRule := append([]string{"-t", table}, args...)
   772  		if err := prog.runProg(copyRule); err != nil {
   773  			return err
   774  		}
   775  	}
   776  
   777  	return nil
   778  }
   779  
   780  var tproxyMatch = regexp.MustCompile("CILIUM_PRE_mangle .*cilium: TPROXY")
   781  
   782  // copyProxyRules copies the old TPROXY rules matching the given filter from oldChain into the current CILIUM_PRE_mangle chain.
   783  func (m *Manager) copyProxyRules(oldChain string, match string) error {
   784  	if m.sharedCfg.EnableIPv4 {
   785  		if err := m.doCopyProxyRules(ip4tables, "mangle", tproxyMatch, match, oldChain, ciliumPreMangleChain); err != nil {
   786  			return err
   787  		}
   788  	}
   789  
   790  	if m.sharedCfg.EnableIPv6 {
   791  		if err := m.doCopyProxyRules(ip6tables, "mangle", tproxyMatch, match, oldChain, ciliumPreMangleChain); err != nil {
   792  			return err
   793  		}
   794  	}
   795  
   796  	return nil
   797  }
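        // Example (sketch, not part of the original source): during restore, a TPROXY rule found
        // in the old (renamed) chain, e.g. "-A OLD_CILIUM_PRE_mangle ... cilium: TPROXY to host ...",
        // is re-added with the chain name swapped, i.e. re-installed into CILIUM_PRE_mangle via
        // `iptables -t mangle -A CILIUM_PRE_mangle ...`.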
   798  
   799  // Redirect packets to the host proxy via TPROXY, as directed by the Cilium
   800  // datapath bpf programs via skb marks.
   801  func (m *Manager) addProxyRules(prog runnable, ip string, proxyPort uint16, name string) error {
   802  	rules, err := prog.runProgOutput([]string{"-t", "mangle", "-S"})
   803  	if err != nil {
   804  		return err
   805  	}
   806  
   807  	for _, proto := range []string{"tcp", "udp"} {
   808  		if err := m.iptProxyRule(rules, prog, proto, ip, proxyPort, name); err != nil {
   809  			return err
   810  		}
   811  	}
   812  
   813  	// Delete all other rules for this same proxy name
   814  	// These may accumulate if there is a bind failure on a previously used port
   815  	portAndIPMatch := fmt.Sprintf("TPROXY --on-port %d --on-ip %s ", proxyPort, ip)
   816  	scanner := bufio.NewScanner(strings.NewReader(rules))
   817  	for scanner.Scan() {
   818  		rule := scanner.Text()
   819  		if !strings.Contains(rule, "-A CILIUM_PRE_mangle ") || !strings.Contains(rule, "cilium: TPROXY to host "+name) || strings.Contains(rule, portAndIPMatch) {
   820  			continue
   821  		}
   822  
   823  		args, err := shellwords.Parse(strings.Replace(rule, "-A", "-D", 1))
   824  		if err != nil {
   825  			log.WithError(err).WithField(logfields.Object, rule).Warnf("Unable to parse %s TPROXY rule", prog)
   826  			continue
   827  		}
   828  
   829  		deleteRule := append([]string{"-t", "mangle"}, args...)
   830  		if err := prog.runProg(deleteRule); err != nil {
   831  			return err
   832  		}
   833  	}
   834  
   835  	return nil
   836  }
   837  
   838  func (m *Manager) endpointNoTrackRules(prog runnable, cmd string, IP string, port *lb.L4Addr) error {
   839  	var err error
   840  
   841  	protocol := strings.ToLower(port.Protocol)
   842  	p := strconv.FormatUint(uint64(port.Port), 10)
   843  
   844  	// Currently the only use case for this is node-local-dns.
   845  	// With LRP, node-local-dns should be deployed as a non-host-namespaced
   846  	// pod, and we want to skip kernel conntrack for any traffic between the
   847  	// application pod and the node-local-dns pod.
   848  	// There are 4 types of packets for which we want to skip conntrack (see the example below):
   849  	// 1. From a non-host pod to the node-local-dns pod
   850  	// 2. From the node-local-dns pod to a non-host pod
   851  	// 3. From a hostNetwork pod to the node-local-dns pod
   852  	// 4. From the node-local-dns pod to a hostNetwork pod
   853  
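        	// Illustrative example (not part of the original source; the node-local-dns address
        	// 169.254.20.10 and port 53 are hypothetical): for case 1 above this installs, per
        	// protocol, rules roughly like
        	//   iptables -t raw <cmd> CILIUM_PRE_raw -p udp -d 169.254.20.10 --dport 53 -j CT --notrack
        	//   iptables -t filter <cmd> CILIUM_FORWARD -p udp -d 169.254.20.10 --dport 53 -j ACCEPT
        	// where <cmd> is the append/insert flag passed in as cmd.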
   854  	// 1. The following 2 rules cover packets from non-host pod to node-local-dns
   855  	if err = prog.runProg([]string{
   856  		"-t", "raw",
   857  		cmd, ciliumPreRawChain,
   858  		"-p", protocol,
   859  		"-d", IP,
   860  		"--dport", p,
   861  		"-j", "CT",
   862  		"--notrack"}); err != nil {
   863  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   864  	}
   865  	if err = prog.runProg([]string{
   866  		"-t", "filter",
   867  		cmd, ciliumForwardChain,
   868  		"-p", protocol,
   869  		"-d", IP,
   870  		"--dport",
   871  		p, "-j",
   872  		"ACCEPT"}); err != nil {
   873  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   874  	}
   875  
   876  	// 2. The following 2 rules cover packets from node-local-dns to
   877  	// non-host pod
   878  	if err = prog.runProg([]string{
   879  		"-t", "raw",
   880  		cmd, ciliumPreRawChain,
   881  		"-p", protocol,
   882  		"-s", IP,
   883  		"--sport", p,
   884  		"-j", "CT",
   885  		"--notrack"}); err != nil {
   886  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   887  	}
   888  	if err = prog.runProg([]string{
   889  		"-t", "filter",
   890  		cmd, ciliumForwardChain,
   891  		"-p", protocol,
   892  		"-s", IP,
   893  		"--sport",
   894  		p, "-j",
   895  		"ACCEPT"}); err != nil {
   896  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   897  	}
   898  
   899  	// 3. The following 2 rules cover packets from host namespaced pod to
   900  	// node-local-dns
   901  	if err = prog.runProg([]string{
   902  		"-t", "raw",
   903  		cmd, ciliumOutputRawChain,
   904  		"-p", protocol,
   905  		"-d", IP,
   906  		"--dport", p,
   907  		"-j", "CT",
   908  		"--notrack"}); err != nil {
   909  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   910  	}
   911  	if err = prog.runProg([]string{
   912  		"-t", "filter",
   913  		cmd, ciliumOutputChain,
   914  		"-p", protocol,
   915  		"-d", IP,
   916  		"--dport", p,
   917  		"-j", "ACCEPT"}); err != nil {
   918  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   919  	}
   920  
   921  	// 4. The following rule (and the prerouting rule in case 2)
   922  	// covers packets from node-local-dns to host namespaced pod
   923  	if err = prog.runProg([]string{
   924  		"-t", "filter",
   925  		cmd, ciliumInputChain,
   926  		"-p", protocol,
   927  		"-s", IP,
   928  		"--sport",
   929  		p, "-j",
   930  		"ACCEPT"}); err != nil {
   931  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   932  	}
   933  
   934  	// The following rules are kept for compatibility with host-namespaced
   935  	// node-local-dns if the user already deploys it in the legacy mode without
   936  	// LRP.
   937  	if err = prog.runProg([]string{
   938  		"-t", "raw",
   939  		cmd, ciliumOutputRawChain,
   940  		"-p", protocol,
   941  		"-s", IP,
   942  		"--sport", p,
   943  		"-j", "CT",
   944  		"--notrack"}); err != nil {
   945  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   946  	}
   947  	if err = prog.runProg([]string{
   948  		"-t", "filter",
   949  		cmd, ciliumOutputChain,
   950  		"-p", protocol,
   951  		"-s", IP,
   952  		"--sport", p,
   953  		"-j", "ACCEPT"}); err != nil {
   954  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   955  	}
   956  	if err = prog.runProg([]string{
   957  		"-t", "filter",
   958  		cmd, ciliumInputChain,
   959  		"-p", protocol,
   960  		"-d", IP,
   961  		"--dport",
   962  		p, "-j",
   963  		"ACCEPT"}); err != nil {
   964  		log.WithError(err).Warning("Failed to enforce endpoint notrack")
   965  	}
   966  	return err
   967  }
   968  
   969  // InstallNoTrackRules is explicitly called when a pod has a valid "policy.cilium.io/no-track-port" annotation.
   970  // When the InstallNoConntrackIptRules flag is set, a superset of v4 NOTRACK rules is automatically
   971  // installed upon agent bootstrap (via addNoTrackPodTrafficRules) and this function is skipped.
   972  // When InstallNoConntrackIptRules is not set, this function is executed to install the NOTRACK rules.
   973  // The rules installed by this function are very specific; for now, the only user is node-local-dns pods.
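        // For example (sketch, not part of the original source; the value is hypothetical), a pod
        // annotated with `policy.cilium.io/no-track-port: "53"` would get NOTRACK rules installed
        // for TCP and UDP port 53 to and from its pod IP.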
   974  func (m *Manager) InstallNoTrackRules(ip netip.Addr, port uint16) {
   975  	if m.skipPodTrafficConntrack(ip) {
   976  		return
   977  	}
   978  
   979  	reconciled := make(chan struct{})
   980  	m.reconcilerParams.addNoTrackPod <- reconciliationRequest[noTrackPodInfo]{noTrackPodInfo{ip, port}, reconciled}
   981  	<-reconciled
   982  }
   983  
   984  // See comments for InstallNoTrackRules.
   985  func (m *Manager) RemoveNoTrackRules(ip netip.Addr, port uint16) {
   986  	if m.skipPodTrafficConntrack(ip) {
   987  		return
   988  	}
   989  
   990  	reconciled := make(chan struct{})
   991  	m.reconcilerParams.delNoTrackPod <- reconciliationRequest[noTrackPodInfo]{noTrackPodInfo{ip, port}, reconciled}
   992  	<-reconciled
   993  }
   994  
   995  func (m *Manager) InstallProxyRules(proxyPort uint16, name string) {
   996  	reconciled := make(chan struct{})
   997  	m.reconcilerParams.proxies <- reconciliationRequest[proxyInfo]{proxyInfo{name, proxyPort}, reconciled}
   998  	<-reconciled
   999  }
  1000  
  1001  func (m *Manager) doInstallProxyRules(proxyPort uint16, name string) error {
  1002  	if m.haveBPFSocketAssign {
  1003  		log.WithField("port", proxyPort).
  1004  			Debug("Skipping proxy rule install due to BPF support")
  1005  		return nil
  1006  	}
  1007  
  1008  	if m.sharedCfg.EnableIPv4 {
  1009  		if err := m.addProxyRules(ip4tables, "127.0.0.1", proxyPort, name); err != nil {
  1010  			return err
  1011  		}
  1012  	}
  1013  	if m.sharedCfg.EnableIPv6 {
  1014  		if err := m.addProxyRules(ip6tables, "::1", proxyPort, name); err != nil {
  1015  			return err
  1016  		}
  1017  	}
  1018  
  1019  	return nil
  1020  }
  1021  
  1022  // GetProxyPorts enumerates all existing TPROXY rules in the datapath installed earlier with
  1023  // InstallProxyRules and returns all proxy ports found.
  1024  func (m *Manager) GetProxyPorts() map[string]uint16 {
  1025  	prog := ip4tables
  1026  	if !m.sharedCfg.EnableIPv4 {
  1027  		prog = ip6tables
  1028  	}
  1029  
  1030  	return m.doGetProxyPorts(prog)
  1031  }
  1032  
  1033  func (m *Manager) doGetProxyPorts(prog iptablesInterface) map[string]uint16 {
  1034  	portMap := make(map[string]uint16)
  1035  
  1036  	m.lock.Lock()
  1037  	defer m.lock.Unlock()
  1038  
  1039  	rules, err := prog.runProgOutput([]string{"-t", "mangle", "-n", "-L", ciliumPreMangleChain})
  1040  	if err != nil {
  1041  		return portMap
  1042  	}
  1043  
  1044  	re := regexp.MustCompile(
  1045  		"(cilium-[^ ]*) proxy.*TPROXY redirect " +
  1046  			"(0.0.0.0|" + ipfamily.IPv4().Localhost +
  1047  			"|::|" + ipfamily.IPv6().Localhost + ")" +
  1048  			":([1-9][0-9]*) mark",
  1049  	)
  1050  	strs := re.FindAllString(rules, -1)
  1051  	for _, str := range strs {
  1052  		// Pick the name and port number from each match
  1053  		name := re.ReplaceAllString(str, "$1")
  1054  		portStr := re.ReplaceAllString(str, "$3")
  1055  		portUInt64, err := strconv.ParseUint(portStr, 10, 16)
  1056  		if err == nil {
  1057  			portMap[name] = uint16(portUInt64)
  1058  		}
  1059  	}
  1060  	return portMap
  1061  }
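        // Example (sketch, not part of the original source; name, port and mark are hypothetical):
        // a line of `iptables -t mangle -n -L CILIUM_PRE_mangle` output roughly like
        //   ... /* cilium: TPROXY to host cilium-dns-egress proxy */ TPROXY redirect 127.0.0.1:37821 mark ...
        // yields portMap["cilium-dns-egress"] = 37821.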
  1062  
  1063  func (m *Manager) getDeliveryInterface(ifName string) string {
  1064  	switch {
  1065  	case m.sharedCfg.EnableEndpointRoutes:
  1066  		// aws-cni creates container interfaces with names like eni621c0fc8425.
  1067  		if m.cniConfigManager.GetChainingMode() == "aws-cni" {
  1068  			return "eni+"
  1069  		}
  1070  		return "lxc+"
  1071  
  1072  	case m.sharedCfg.IPAM == ipamOption.IPAMENI ||
  1073  		m.sharedCfg.IPAM == ipamOption.IPAMAlibabaCloud:
  1074  		return "lxc+"
  1075  
  1076  	default:
  1077  		return ifName
  1078  	}
  1079  }
  1080  
  1081  func (m *Manager) installForwardChainRules(ifName, localDeliveryInterface, forwardChain string) error {
  1082  	if m.sharedCfg.EnableIPv4 {
  1083  		if err := m.installForwardChainRulesIpX(ip4tables, ifName, localDeliveryInterface, forwardChain); err != nil {
  1084  			return err
  1085  		}
  1086  	}
  1087  	if m.sharedCfg.EnableIPv6 {
  1088  		return m.installForwardChainRulesIpX(ip6tables, ifName, localDeliveryInterface, forwardChain)
  1089  	}
  1090  
  1091  	return nil
  1092  }
  1093  
  1094  func (m *Manager) installForwardChainRulesIpX(prog runnable, ifName, localDeliveryInterface, forwardChain string) error {
  1095  	// While kube-proxy does change the policy of the iptables FORWARD chain,
  1096  	// it doesn't seem to handle all cases, e.g. host network pods that use
  1097  	// the node IP, which would still end up in the default DENY. Similarly, for
  1098  	// a plain Docker setup, we would otherwise hit the default DENY in the FORWARD chain.
  1099  	// Also, k8s 1.15 introduced "-m conntrack --ctstate INVALID -j DROP", which
  1100  	// in the direct routing case can drop EP replies.
  1101  	//
  1102  	// Therefore, add the rules below to avoid requiring users to manually opt in.
  1103  	// See also: https://github.com/kubernetes/kubernetes/issues/39823
  1104  	// Only basic ACCEPT rules belong in here, nothing more complicated.
  1105  	//
  1106  	// The 2nd and 3rd rules are for the case of nodeport traffic where the backend is
  1107  	// remote. The traffic flow in FORWARD is as follows:
  1108  	//
  1109  	//  - Node serving nodeport request:
  1110  	//      IN=eno1 OUT=cilium_host
  1111  	//      IN=cilium_host OUT=eno1
  1112  	//
  1113  	//  - Node running backend:
  1114  	//       IN=eno1 OUT=cilium_host
  1115  	//       IN=lxc... OUT=eno1
  1116  	if err := prog.runProg([]string{
  1117  		"-A", forwardChain,
  1118  		"-o", ifName,
  1119  		"-m", "comment", "--comment", "cilium: any->cluster on " + ifName + " forward accept",
  1120  		"-j", "ACCEPT"}); err != nil {
  1121  		return err
  1122  	}
  1123  	if err := prog.runProg([]string{
  1124  		"-A", forwardChain,
  1125  		"-i", ifName,
  1126  		"-m", "comment", "--comment", "cilium: cluster->any on " + ifName + " forward accept (nodeport)",
  1127  		"-j", "ACCEPT"}); err != nil {
  1128  		return err
  1129  	}
  1130  	if err := prog.runProg([]string{
  1131  		"-A", forwardChain,
  1132  		"-i", "lxc+",
  1133  		"-m", "comment", "--comment", "cilium: cluster->any on lxc+ forward accept",
  1134  		"-j", "ACCEPT"}); err != nil {
  1135  		return err
  1136  	}
  1137  	// Proxy return traffic to a remote source needs '-i cilium_net'.
  1138  	if ifName == defaults.HostDevice {
  1139  		ifPeerName := defaults.SecondHostDevice
  1140  		if err := prog.runProg([]string{
  1141  			"-A", forwardChain,
  1142  			"-i", ifPeerName,
  1143  			"-m", "comment", "--comment", "cilium: cluster->any on " + ifPeerName + " forward accept (nodeport)",
  1144  			"-j", "ACCEPT"}); err != nil {
  1145  			return err
  1146  		}
  1147  	}
  1148  	// In case the delivery interface and the host interface are not the
  1149  	// same (enable-endpoint-routes), a separate set of rules to allow
  1150  	// traffic from/to the delivery interface is required.
  1151  	if localDeliveryInterface != ifName {
  1152  		if err := prog.runProg([]string{
  1153  			"-A", forwardChain,
  1154  			"-o", localDeliveryInterface,
  1155  			"-m", "comment", "--comment", "cilium: any->cluster on " + localDeliveryInterface + " forward accept",
  1156  			"-j", "ACCEPT"}); err != nil {
  1157  			return err
  1158  		}
  1159  		if err := prog.runProg([]string{
  1160  			"-A", forwardChain,
  1161  			"-i", localDeliveryInterface,
  1162  			"-m", "comment", "--comment", "cilium: cluster->any on " + localDeliveryInterface + " forward accept (nodeport)",
  1163  			"-j", "ACCEPT"}); err != nil {
  1164  			return err
  1165  		}
  1166  	}
  1167  	return nil
  1168  }
  1169  
  1170  func (m *Manager) installMasqueradeRules(
  1171  	prog iptablesInterface, nativeDevices []string,
  1172  	localDeliveryInterface, snatDstExclusionCIDR, allocRange, hostMasqueradeIP string,
  1173  ) error {
  1174  	devices := nativeDevices
  1175  
  1176  	if m.sharedCfg.NodeIpsetNeeded {
  1177  		// Exclude traffic to nodes from masquerade.
  1178  		progArgs := []string{
  1179  			"-t", "nat",
  1180  			"-A", ciliumPostNatChain,
  1181  		}
  1182  
  1183  		// If MasqueradeInterfaces is set, we need to mirror the base condition of the
  1184  		// "cilium masquerade non-cluster" rule below, as the allocRange might not
  1185  		// be valid in such setups (e.g. in ENI mode).
  1186  		if len(m.sharedCfg.MasqueradeInterfaces) > 0 {
  1187  			progArgs = append(progArgs, "-o", strings.Join(m.sharedCfg.MasqueradeInterfaces, ","))
  1188  		} else {
  1189  			progArgs = append(progArgs, "-s", allocRange)
  1190  		}
  1191  
  1192  		progArgs = append(progArgs,
  1193  			"-m", "set", "--match-set", prog.getIpset(), "dst",
  1194  			"-m", "comment", "--comment", "exclude traffic to cluster nodes from masquerade",
  1195  			"-j", "ACCEPT",
  1196  		)
  1197  		if err := prog.runProg(progArgs); err != nil {
  1198  			return err
  1199  		}
  1200  	}
  1201  
  1202  	// Masquerade egress traffic leaving the node based on source routing
  1203  	//
  1204  	// If this option is enabled, then it takes precedence over the catch-all
  1205  	// MASQUERADE further below.
  1206  	if m.sharedCfg.EnableMasqueradeRouteSource {
  1207  		var defaultRoutes []netlink.Route
  1208  
  1209  		if len(m.sharedCfg.MasqueradeInterfaces) > 0 {
  1210  			devices = m.sharedCfg.MasqueradeInterfaces
  1211  		}
  1212  		family := netlink.FAMILY_V4
  1213  		if prog == ip6tables {
  1214  			family = netlink.FAMILY_V6
  1215  		}
  1216  		initialPass := true
  1217  		if routes, err := netlink.RouteList(nil, family); err == nil {
  1218  		nextPass:
  1219  			for _, r := range routes {
  1220  				var link netlink.Link
  1221  				match := false
  1222  				if r.LinkIndex > 0 {
  1223  					link, err = netlink.LinkByIndex(r.LinkIndex)
  1224  					if err != nil {
  1225  						continue
  1226  					}
  1227  					// Routes are dedicated to the specific interface, so we
  1228  					// need to install the SNAT rules also for that interface
  1229  					// via -o. If we cannot correlate to anything because no
  1230  					// devices were specified, we need to bail out.
  1231  					if len(devices) == 0 {
  1232  						return fmt.Errorf("cannot correlate source route device for generating masquerading rules")
  1233  					}
  1234  					for _, device := range devices {
  1235  						if device == link.Attrs().Name {
  1236  							match = true
  1237  							break
  1238  						}
  1239  					}
  1240  				} else {
  1241  					// There might be next hop groups where ifindex is zero
  1242  					// and the underlying next hop devices might not be known
  1243  					// to Cilium. In this case, assume match and don't encode
  1244  					// -o device.
  1245  					match = true
  1246  				}
  1247  				_, exclusionCIDR, err := net.ParseCIDR(snatDstExclusionCIDR)
  1248  				if !match || r.Src == nil || (err == nil && cidr.Equal(r.Dst, exclusionCIDR)) {
  1249  					continue
  1250  				}
  1251  				if initialPass && cidr.Equal(r.Dst, cidr.ZeroNet(r.Family)) {
  1252  					defaultRoutes = append(defaultRoutes, r)
  1253  					continue
  1254  				}
  1255  				progArgs := []string{
  1256  					"-t", "nat",
  1257  					"-A", ciliumPostNatChain,
  1258  					"-s", allocRange,
  1259  				}
  1260  				if cidr.Equal(r.Dst, cidr.ZeroNet(r.Family)) {
  1261  					progArgs = append(
  1262  						progArgs,
  1263  						"!", "-d", snatDstExclusionCIDR)
  1264  				} else {
  1265  					progArgs = append(
  1266  						progArgs,
  1267  						"-d", r.Dst.String())
  1268  				}
  1269  				if link != nil {
  1270  					progArgs = append(
  1271  						progArgs,
  1272  						"-o", link.Attrs().Name)
  1273  				} else {
  1274  					progArgs = append(
  1275  						progArgs,
  1276  						"!", "-o", "cilium_+")
  1277  				}
  1278  				progArgs = append(
  1279  					progArgs,
  1280  					"-m", "comment", "--comment", "cilium snat non-cluster via source route",
  1281  					"-j", "SNAT",
  1282  					"--to-source", r.Src.String())
  1283  				if m.cfg.IPTablesRandomFully {
  1284  					progArgs = append(progArgs, "--random-fully")
  1285  				}
  1286  				if err := prog.runProg(progArgs); err != nil {
  1287  					return err
  1288  				}
  1289  			}
  1290  			if initialPass {
  1291  				initialPass = false
  1292  				routes = defaultRoutes
  1293  				goto nextPass
  1294  			}
  1295  		}
  1296  	} else {
  1297  		// Masquerade all egress traffic leaving the node (catch-all)
  1298  		//
  1299  		// This rule must come first, right after the node ipset rule, as it has different
  1300  		// exclusion criteria than the other rules in this table.
  1301  		//
  1302  		// The following conditions must be met:
  1303  		// * May not leave on a cilium_ interface, this excludes all
  1304  		//   tunnel traffic
  1305  		// * Must originate from an IP in the local allocation range
  1306  		// * Must not be reply if BPF NodePort is enabled
  1307  		// * Tunnel mode:
  1308  		//   * May not be targeted to an IP in the local allocation
  1309  		//     range
  1310  		// * Non-tunnel mode:
  1311  		//   * May not be targeted to an IP in the cluster range
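        		// Illustrative rendering (sketch, not part of the original source; CIDRs are placeholders):
        		// without MasqueradeInterfaces set, the final rule looks roughly like
        		//   iptables -t nat -A CILIUM_POST_nat ! -d <snatDstExclusionCIDR> -s <allocRange> \
        		//     ! -o cilium_+ -m comment --comment "cilium masquerade non-cluster" -j MASQUERADE
        		// plus --random-fully if that option is configured.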
  1312  		progArgs := []string{
  1313  			"-t", "nat",
  1314  			"-A", ciliumPostNatChain,
  1315  			"!", "-d", snatDstExclusionCIDR,
  1316  		}
  1317  		if len(m.sharedCfg.MasqueradeInterfaces) > 0 {
  1318  			progArgs = append(
  1319  				progArgs,
  1320  				"-o", strings.Join(m.sharedCfg.MasqueradeInterfaces, ","))
  1321  		} else {
  1322  			progArgs = append(
  1323  				progArgs,
  1324  				"-s", allocRange,
  1325  				"!", "-o", "cilium_+")
  1326  		}
  1327  		progArgs = append(
  1328  			progArgs,
  1329  			"-m", "comment", "--comment", "cilium masquerade non-cluster",
  1330  			"-j", "MASQUERADE")
  1331  		if m.cfg.IPTablesRandomFully {
  1332  			progArgs = append(progArgs, "--random-fully")
  1333  		}
  1334  		if err := prog.runProg(progArgs); err != nil {
  1335  			return err
  1336  		}
  1337  	}
  1338  
  1339  	// The following rule excludes traffic from the remaining rules in this chain.
  1340  	// If this rule matches, none of the remaining rules in this chain
  1341  	// are considered.
  1342  
  1343  	// Exclude proxy return traffic from the masquerade rules.
  1344  	if err := prog.runProg([]string{
  1345  		"-t", "nat",
  1346  		"-A", ciliumPostNatChain,
  1347  		// Don't match proxy (return) traffic
  1348  		"-m", "mark", "--mark", fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxy, linux_defaults.MagicMarkProxyMask),
  1349  		"-m", "comment", "--comment", "exclude proxy return traffic from masquerade",
  1350  		"-j", "ACCEPT"}); err != nil {
  1351  		return err
  1352  	}
  1353  
  1354  	if m.sharedCfg.TunnelingEnabled {
  1355  		// Masquerade all traffic from the host into the ifName
  1356  		// interface if the source is not in the node's pod CIDR.
  1357  		//
  1358  		// The following conditions must be met:
  1359  		// * Must be targeted for the ifName interface
  1360  		// * Must be targeted to an IP that is not local
  1361  		// * May not already be originating from the node's pod CIDR.
  1362  		if err := prog.runProg([]string{
  1363  			"-t", "nat",
  1364  			"-A", ciliumPostNatChain,
  1365  			"!", "-s", allocRange,
  1366  			"!", "-d", allocRange,
  1367  			"-o", defaults.HostDevice,
  1368  			"-m", "comment", "--comment", "cilium host->cluster masquerade",
  1369  			"-j", "SNAT", "--to-source", hostMasqueradeIP}); err != nil {
  1370  			return err
  1371  		}
  1372  	}
  1373  
  1374  	loopbackAddr := "127.0.0.1"
  1375  	if prog == ip6tables {
  1376  		loopbackAddr = "::1"
  1377  	}
  1378  
  1379  	// Masquerade all traffic from the host into local
  1380  	// endpoints if the source is 127.0.0.1. This is
  1381  	// required to force replies out of the endpoint's
  1382  	// network namespace.
  1383  	//
  1384  	// The following conditions must be met:
  1385  	// * Must be targeted for local endpoint
  1386  	// * Must be from 127.0.0.1
  1387  	if err := prog.runProg([]string{
  1388  		"-t", "nat",
  1389  		"-A", ciliumPostNatChain,
  1390  		"-s", loopbackAddr,
  1391  		"-o", localDeliveryInterface,
  1392  		"-m", "comment", "--comment", "cilium host->cluster from " + loopbackAddr + " masquerade",
  1393  		"-j", "SNAT", "--to-source", hostMasqueradeIP}); err != nil {
  1394  		return err
  1395  	}
  1396  
  1397  	// Masquerade all traffic that originated from a local
  1398  	// pod and thus carries a security identity and that
  1399  	// was also DNAT'ed. It must be masqueraded to ensure
  1400  	// that reverse NAT can be performed. Otherwise the
  1401  	// reply traffic would be sent directly to the pod
  1402  	// without traversing the Linux stack again.
  1403  	//
  1404  	// This is only done if EnableEndpointRoutes is
  1405  	// disabled; if EnableEndpointRoutes is enabled, then
  1406  	// all traffic always passes through the stack anyway.
  1407  	//
  1408  	// This is required for:
  1409  	//  - portmap/host if both source and destination are
  1410  	//    on the same node
  1411  	//  - kiam if source and server are on the same node
  1412  	if !m.sharedCfg.EnableEndpointRoutes {
  1413  		if err := prog.runProg([]string{
  1414  			"-t", "nat",
  1415  			"-A", ciliumPostNatChain,
  1416  			"-m", "mark", "--mark", fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIdentity, linux_defaults.MagicMarkHostMask),
  1417  			"-o", localDeliveryInterface,
  1418  			"-m", "conntrack", "--ctstate", "DNAT",
  1419  			"-m", "comment", "--comment", "hairpin traffic that originated from a local pod",
  1420  			"-j", "SNAT", "--to-source", hostMasqueradeIP}); err != nil {
  1421  			return err
  1422  		}
  1423  	}
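        	// For illustration only (<identity-mark>/<host-mask> stands for the mark
        	// string formatted above; hostMasqueradeIP hypothetically 10.0.1.1), the
        	// hairpin SNAT rule above roughly reads:
        	//   iptables -t nat -A CILIUM_POST_nat \
        	//     -m mark --mark <identity-mark>/<host-mask> -o <localDeliveryInterface> \
        	//     -m conntrack --ctstate DNAT \
        	//     -m comment --comment "hairpin traffic that originated from a local pod" \
        	//     -j SNAT --to-source 10.0.1.1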
  1424  
  1425  	return nil
  1426  }
  1427  
  1428  func (m *Manager) installHostTrafficMarkRule(prog runnable) error {
  1429  	// Mark all packets sourced from processes running on the host with a
  1430  	// special marker so that we can differentiate traffic sourced locally
  1431  	// vs. traffic from the outside world that was masqueraded to appear
  1432  	// like it's from the host.
  1433  	//
  1434  	// Originally we set this mark only for traffic destined to the
  1435  	// ifName device, to ensure that any traffic directly reaching
  1436  	// a Cilium-managed IP could be classified as from the host.
  1437  	//
  1438  	// However, there's another case where a local process attempts to
  1439  	// reach a service IP which is backed by a Cilium-managed pod. The
  1440  	// service implementation is outside of Cilium's control, for example,
  1441  	// handled by kube-proxy. We tag even this traffic with the magic
  1442  	// mark, so that when the service implementation proxies it back into
  1443  	// Cilium, the BPF datapath sees the mark and knows that the packet
  1444  	// originated from the host.
  1445  	matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask)
  1446  	matchFromIPSecDecrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask)
  1447  	matchOverlay := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkOverlay, linux_defaults.MagicMarkHostMask)
  1448  	matchFromProxy := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxy, linux_defaults.MagicMarkProxyMask)
  1449  	matchFromProxyEPID := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxyEPID, linux_defaults.MagicMarkProxyMask)
  1450  	matchFromDNSProxy := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIdentity, linux_defaults.MagicMarkHostMask)
  1451  	markAsFromHost := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkHost, linux_defaults.MagicMarkHostMask)
  1452  
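        	// For illustration only (the <...> placeholders stand for the mark/mask
        	// strings computed above), the rule below roughly renders as:
        	//   iptables -t filter -A CILIUM_OUTPUT \
        	//     -m mark ! --mark <ipsec-decrypt> -m mark ! --mark <ipsec-encrypt> \
        	//     -m mark ! --mark <overlay> -m mark ! --mark <proxy> \
        	//     -m mark ! --mark <proxy-epid> -m mark ! --mark <dns-proxy> \
        	//     -m comment --comment "cilium: host->any mark as from host" \
        	//     -j MARK --set-xmark <from-host>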
  1453  	return prog.runProg([]string{
  1454  		"-t", "filter",
  1455  		"-A", ciliumOutputChain,
  1456  		"-m", "mark", "!", "--mark", matchFromIPSecDecrypt, // Don't match ipsec traffic
  1457  		"-m", "mark", "!", "--mark", matchFromIPSecEncrypt, // Don't match ipsec traffic
  1458  		"-m", "mark", "!", "--mark", matchOverlay, // Don't match Cilium's overlay traffic
  1459  		"-m", "mark", "!", "--mark", matchFromProxy, // Don't match proxy traffic
  1460  		"-m", "mark", "!", "--mark", matchFromProxyEPID, // Don't match proxy traffic
  1461  		"-m", "mark", "!", "--mark", matchFromDNSProxy, // Don't match DNS proxy egress traffic
  1462  		"-m", "comment", "--comment", "cilium: host->any mark as from host",
  1463  		"-j", "MARK", "--set-xmark", markAsFromHost})
  1464  }
  1465  
  1466  func (m *Manager) doInstallRules(state desiredState, firstInit bool) error {
  1467  	m.lock.Lock()
  1468  	defer m.lock.Unlock()
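        	// Overall flow: drop any leftover OLD_ chains from a previous run, rename
        	// the current CILIUM_ chains to OLD_, install a fresh set of chains and
        	// rules, then remove the OLD_ chains. Keeping the old chains around until
        	// the new rules are in place limits disruption while rules are replaced.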
  1469  
  1470  	// Make sure we have no old "backups"
  1471  	if err := m.removeRules(oldCiliumPrefix); err != nil {
  1472  		return fmt.Errorf("failed to remove old backup rules: %w", err)
  1473  	}
  1474  
  1475  	if err := m.renameChains(oldCiliumPrefix); err != nil {
  1476  		return fmt.Errorf("failed to rename chains: %w", err)
  1477  	}
  1478  
  1479  	// install rules if needed
  1480  	if state.installRules {
  1481  		if err := m.installRules(state); err != nil {
  1482  			return fmt.Errorf("failed to install rules: %w", err)
  1483  		}
  1484  
  1485  		// copy old proxy rules over at initialization
  1486  		if firstInit {
  1487  			if err := m.copyProxyRules(oldCiliumPrefix+ciliumPreMangleChain, "cilium-dns-egress"); err != nil {
  1488  				return fmt.Errorf("cannot copy old proxy rules, disruption to traffic selected by L7 policy possible: %w", err)
  1489  			}
  1490  		}
  1491  
  1492  		for _, proxy := range state.proxies {
  1493  			if err := m.doInstallProxyRules(proxy.port, proxy.name); err != nil {
  1494  				return fmt.Errorf("cannot install proxy rules for %s: %w", proxy.name, err)
  1495  			}
  1496  		}
  1497  	}
  1498  
  1499  	if err := m.removeRules(oldCiliumPrefix); err != nil {
  1500  		return fmt.Errorf("failed to remove old rules: %w", err)
  1501  	}
  1502  
  1503  	return nil
  1504  }
  1505  
  1506  // installRules installs the iptables rules Cilium needs for specific
  1507  // use cases (most notably, interaction with kube-proxy).
  1508  func (m *Manager) installRules(state desiredState) error {
  1509  	// Install new rules
  1510  	for _, c := range ciliumChains {
  1511  		if err := c.add(m.sharedCfg.EnableIPv4, m.sharedCfg.EnableIPv6); err != nil {
  1512  			// do not return an error for chains that are linked to disabled feeder rules
  1513  			if isDisabledChain(m.cfg.DisableIptablesFeederRules, c.hook) {
  1514  				log.WithField(logfields.Chain, c.name).Warningf("ignoring creation of chain since feeder rules for %s are disabled", c.hook)
  1515  				continue
  1516  			}
  1517  
  1518  			return fmt.Errorf("cannot add custom chain %s: %w", c.name, err)
  1519  		}
  1520  	}
  1521  
  1522  	if err := m.installStaticProxyRules(); err != nil {
  1523  		return fmt.Errorf("cannot install static proxy rules: %w", err)
  1524  	}
  1525  
  1526  	if err := m.addCiliumAcceptXfrmRules(); err != nil {
  1527  		return fmt.Errorf("cannot install xfrm rules: %w", err)
  1528  	}
  1529  
  1530  	localDeliveryInterface := m.getDeliveryInterface(defaults.HostDevice)
  1531  
  1532  	if err := m.installForwardChainRules(defaults.HostDevice, localDeliveryInterface, ciliumForwardChain); err != nil {
  1533  		return fmt.Errorf("cannot install forward chain rules to %s: %w", ciliumForwardChain, err)
  1534  	}
  1535  
  1536  	if m.sharedCfg.EnableIPv4 {
  1537  		if err := m.installHostTrafficMarkRule(ip4tables); err != nil {
  1538  			return fmt.Errorf("cannot install host traffic mark rule: %w", err)
  1539  		}
  1540  
  1541  		if m.sharedCfg.IptablesMasqueradingIPv4Enabled && state.localNodeInfo.internalIPv4 != nil {
  1542  			if err := m.installMasqueradeRules(ip4tables, state.devices.UnsortedList(), localDeliveryInterface,
  1543  				m.remoteSNATDstAddrExclusionCIDR(state.localNodeInfo.ipv4NativeRoutingCIDR, state.localNodeInfo.ipv4AllocCIDR),
  1544  				state.localNodeInfo.ipv4AllocCIDR,
  1545  				state.localNodeInfo.internalIPv4.String(),
  1546  			); err != nil {
  1547  				return fmt.Errorf("cannot install masquerade rules: %w", err)
  1548  			}
  1549  		}
  1550  	}
  1551  
  1552  	if m.sharedCfg.EnableIPv6 {
  1553  		if err := m.installHostTrafficMarkRule(ip6tables); err != nil {
  1554  			return fmt.Errorf("cannot install host traffic mark rule: %w", err)
  1555  		}
  1556  
  1557  		if m.sharedCfg.IptablesMasqueradingIPv6Enabled && state.localNodeInfo.internalIPv6 != nil {
  1558  			if err := m.installMasqueradeRules(ip6tables, state.devices.UnsortedList(), localDeliveryInterface,
  1559  				m.remoteSNATDstAddrExclusionCIDR(state.localNodeInfo.ipv6NativeRoutingCIDR, state.localNodeInfo.ipv6AllocCIDR),
  1560  				state.localNodeInfo.ipv6AllocCIDR,
  1561  				state.localNodeInfo.internalIPv6.String(),
  1562  			); err != nil {
  1563  				return fmt.Errorf("cannot install masquerade rules: %w", err)
  1564  			}
  1565  		}
  1566  	}
  1567  
  1568  	// ENI-based IPAM (AWS ENI, AlibabaCloud) requires marking packets that
  1569  	// ingress on the primary interface and routing them back the same way,
  1570  	// even if the responding pod is using the IP of a different interface.
  1571  	// Please see the note in Reinitialize() in pkg/datapath/loader for details.
  1572  	if m.sharedCfg.IPAM == ipamOption.IPAMENI || m.sharedCfg.IPAM == ipamOption.IPAMAlibabaCloud {
  1573  		if err := m.addCiliumENIRules(); err != nil {
  1574  			return fmt.Errorf("cannot install rules for ENI multi-node NodePort: %w", err)
  1575  		}
  1576  	}
  1577  
  1578  	if m.sharedCfg.EnableIPSec {
  1579  		if err := m.addCiliumNoTrackXfrmRules(); err != nil {
  1580  			return fmt.Errorf("cannot install xfrm rules: %w", err)
  1581  		}
  1582  	}
  1583  
  1584  	podsCIDR := state.localNodeInfo.ipv4NativeRoutingCIDR
  1585  	if m.sharedCfg.InstallNoConntrackIptRules && podsCIDR != "" {
  1586  		if err := m.addNoTrackPodTrafficRules(ip4tables, podsCIDR); err != nil {
  1587  			return fmt.Errorf("cannot install pod traffic no CT rules: %w", err)
  1588  		}
  1589  	}
  1590  
  1591  	for noTrackPodInfo := range state.noTrackPods {
  1592  		if err := m.installNoTrackRules(noTrackPodInfo.ip, noTrackPodInfo.port); err != nil {
  1593  			return err
  1594  		}
  1595  	}
  1596  
  1597  	for _, c := range ciliumChains {
  1598  		// do not install feeder for chains that are set to be disabled
  1599  		if isDisabledChain(m.cfg.DisableIptablesFeederRules, c.hook) {
  1600  			log.WithField(logfields.Chain, c.hook).Infof("Skipping the install of feeder rule")
  1601  			continue
  1602  		}
  1603  
  1604  		if err := c.installFeeder(m.sharedCfg.EnableIPv4, m.sharedCfg.EnableIPv6, m.cfg.PrependIptablesChains); err != nil {
  1605  			return fmt.Errorf("cannot install feeder rule: %w", err)
  1606  		}
  1607  	}
  1608  
  1609  	return nil
  1610  }
  1611  
  1612  func (m *Manager) remoteSNATDstAddrExclusionCIDR(nativeRoutingCIDR, allocCIDR string) string {
  1613  	if nativeRoutingCIDR != "" {
  1614  		// ip{v4,v6}-native-routing-cidr is set, so use it
  1615  		return nativeRoutingCIDR
  1616  	}
  1617  
  1618  	return allocCIDR
  1619  }
  1620  
  1621  func (m *Manager) ciliumNoTrackXfrmRules(prog iptablesInterface, input string) error {
  1622  	matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask)
  1623  	matchFromIPSecDecrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask)
  1624  
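        	// For illustration only (<xfrm-mark> stands for one of the two mark/mask
        	// strings above; callers pass "-I" as input), each iteration roughly runs:
        	//   iptables -t raw -I CILIUM_PRE_raw -m mark --mark <xfrm-mark> \
        	//     -m comment --comment "cilium-xfrm-notrack:" -j CT --notrack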
  1625  	for _, match := range []string{matchFromIPSecDecrypt, matchFromIPSecEncrypt} {
  1626  		if err := prog.runProg([]string{
  1627  			"-t", "raw", input, ciliumPreRawChain,
  1628  			"-m", "mark", "--mark", match,
  1629  			"-m", "comment", "--comment", xfrmDescription,
  1630  			"-j", "CT", "--notrack"}); err != nil {
  1631  			return err
  1632  		}
  1633  	}
  1634  	return nil
  1635  }
  1636  
  1637  // addCiliumAcceptXfrmRules excludes crypto traffic from the filter and nat
  1638  // table rules. This prevents the encryption bits and keyID (0x*d00 for
  1639  // decryption, 0x*e00 for encryption) from colliding with existing rules,
  1640  // which is needed for kube-proxy, for example.
  1641  func (m *Manager) addCiliumAcceptXfrmRules() error {
  1642  	if !m.sharedCfg.EnableIPSec {
  1643  		return nil
  1644  	}
  1645  
  1646  	insertAcceptXfrm := func(ipt *ipt, table, chain string) error {
  1647  		matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask)
  1648  		matchFromIPSecDecrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask)
  1649  
  1650  		comment := "exclude xfrm marks from " + table + " " + chain + " chain"
  1651  
  1652  		if err := ipt.runProg([]string{
  1653  			"-t", table,
  1654  			"-A", chain,
  1655  			"-m", "mark", "--mark", matchFromIPSecEncrypt,
  1656  			"-m", "comment", "--comment", comment,
  1657  			"-j", "ACCEPT"}); err != nil {
  1658  			return err
  1659  		}
  1660  
  1661  		return ipt.runProg([]string{
  1662  			"-t", table,
  1663  			"-A", chain,
  1664  			"-m", "mark", "--mark", matchFromIPSecDecrypt,
  1665  			"-m", "comment", "--comment", comment,
  1666  			"-j", "ACCEPT"})
  1667  	}
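        	// For illustration only (<xfrm-mark> is a placeholder for the mark/mask
        	// strings above), a single insertAcceptXfrm call for the filter table's
        	// CILIUM_INPUT chain roughly appends, once per mark:
        	//   iptables -t filter -A CILIUM_INPUT -m mark --mark <xfrm-mark> \
        	//     -m comment --comment "exclude xfrm marks from filter CILIUM_INPUT chain" \
        	//     -j ACCEPT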
  1668  
  1669  	for _, chain := range ciliumChains {
  1670  		switch chain.table {
  1671  		case "filter", "nat":
  1672  			if m.sharedCfg.EnableIPv4 {
  1673  				if err := insertAcceptXfrm(ip4tables, chain.table, chain.name); err != nil {
  1674  					return err
  1675  				}
  1676  			}
  1677  			// ip6tables chain exists only if chain.ipv6 is true
  1678  			if m.sharedCfg.EnableIPv6 && chain.ipv6 {
  1679  				if err := insertAcceptXfrm(ip6tables, chain.table, chain.name); err != nil {
  1680  					return err
  1681  				}
  1682  			}
  1683  		}
  1684  	}
  1685  	return nil
  1686  }
  1687  
  1688  func (m *Manager) addCiliumNoTrackXfrmRules() (err error) {
  1689  	if m.sharedCfg.EnableIPv4 {
  1690  		if err = m.ciliumNoTrackXfrmRules(ip4tables, "-I"); err != nil {
  1691  			return
  1692  		}
  1693  	}
  1694  	if m.sharedCfg.EnableIPv6 {
  1695  		return m.ciliumNoTrackXfrmRules(ip6tables, "-I")
  1696  	}
  1697  	return nil
  1698  }
  1699  
  1700  func (m *Manager) installNoTrackRules(addr netip.Addr, port uint16) error {
  1701  	// Do not install per endpoint NOTRACK rules if we are already skipping
  1702  	// conntrack for all pod traffic.
  1703  	if m.skipPodTrafficConntrack(addr) {
  1704  		return nil
  1705  	}
  1706  
  1707  	prog := ip4tables
  1708  	if addr.Is6() {
  1709  		prog = ip6tables
  1710  	}
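        	// noTrackPorts(port) yields one L4Addr per protocol (TCP and UDP), so an
        	// append ("-A") rule is installed for each protocol on this address.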
  1711  	for _, p := range noTrackPorts(port) {
  1712  		if err := m.endpointNoTrackRules(prog, "-A", addr.String(), p); err != nil {
  1713  			return err
  1714  		}
  1715  	}
  1716  	return nil
  1717  }
  1718  
  1719  func (m *Manager) removeNoTrackRules(addr netip.Addr, port uint16) error {
  1720  	// Do not remove per endpoint NOTRACK rules if we are already skipping
  1721  	// conntrack for all pod traffic.
  1722  	if m.skipPodTrafficConntrack(addr) {
  1723  		return nil
  1724  	}
  1725  
  1726  	prog := ip4tables
  1727  	if addr.Is6() {
  1728  		prog = ip6tables
  1729  	}
  1730  	for _, p := range noTrackPorts(port) {
  1731  		if err := m.endpointNoTrackRules(prog, "-D", addr.String(), p); err != nil {
  1732  			return err
  1733  		}
  1734  	}
  1735  	return nil
  1736  }
  1737  
  1738  // skipPodTrafficConntrack returns true if conntrack is already skipped for all
  1739  // pod traffic via the CIDR-wide `-j CT --notrack` rules, so per-endpoint rules are unnecessary.
  1740  func (m *Manager) skipPodTrafficConntrack(addr netip.Addr) bool {
  1741  	if addr.Is4() && m.sharedCfg.InstallNoConntrackIptRules {
  1742  		return true
  1743  	}
  1744  	return false
  1745  }
  1746  
  1747  func (m *Manager) addNoTrackPodTrafficRules(prog runnable, podsCIDR string) error {
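        	// For illustration only (hypothetical podsCIDR 10.0.0.0/16), each chain
        	// gets a source and a destination rule, roughly:
        	//   iptables -t raw -I CILIUM_PRE_raw -s 10.0.0.0/16 \
        	//     -m comment --comment "cilium: NOTRACK for pod traffic" -j CT --notrack
        	//   iptables -t raw -I CILIUM_PRE_raw -d 10.0.0.0/16 \
        	//     -m comment --comment "cilium: NOTRACK for pod traffic" -j CT --notrack
        	// and likewise for CILIUM_OUTPUT_raw.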
  1748  	for _, chain := range []string{ciliumPreRawChain, ciliumOutputRawChain} {
  1749  		if err := prog.runProg([]string{
  1750  			"-t", "raw",
  1751  			"-I", chain,
  1752  			"-s", podsCIDR,
  1753  			"-m", "comment", "--comment", "cilium: NOTRACK for pod traffic",
  1754  			"-j", "CT", "--notrack"}); err != nil {
  1755  			return err
  1756  		}
  1757  
  1758  		if err := prog.runProg([]string{
  1759  			"-t", "raw",
  1760  			"-I", chain,
  1761  			"-d", podsCIDR,
  1762  			"-m", "comment", "--comment", "cilium: NOTRACK for pod traffic",
  1763  			"-j", "CT", "--notrack"}); err != nil {
  1764  			return err
  1765  		}
  1766  	}
  1767  
  1768  	return nil
  1769  }
  1770  
  1771  func (m *Manager) addCiliumENIRules() error {
  1772  	if !m.sharedCfg.EnableIPv4 {
  1773  		return nil
  1774  	}
  1775  
  1776  	iface, err := route.NodeDeviceWithDefaultRoute(m.sharedCfg.EnableIPv4, m.sharedCfg.EnableIPv6)
  1777  	if err != nil {
  1778  		return fmt.Errorf("failed to find interface with default route: %w", err)
  1779  	}
  1780  
  1781  	nfmask := fmt.Sprintf("%#08x", linux_defaults.MarkMultinodeNodeport)
  1782  	ctmask := fmt.Sprintf("%#08x", linux_defaults.MaskMultinodeNodeport)
  1783  
  1784  	// Note: these rules need the xt_connmark module (iptables usually
  1785  	// loads it when required, unless loading modules after boot has been
  1786  	// disabled).
  1787  	if err := ip4tables.runProg([]string{
  1788  		"-t", "mangle",
  1789  		"-A", ciliumPreMangleChain,
  1790  		"-i", iface.Attrs().Name,
  1791  		"-m", "comment", "--comment", "cilium: primary ENI",
  1792  		"-m", "addrtype", "--dst-type", "LOCAL", "--limit-iface-in",
  1793  		"-j", "CONNMARK", "--set-xmark", nfmask + "/" + ctmask}); err != nil {
  1794  		return err
  1795  	}
  1796  
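        	// The rule below restores the saved connmark onto the packet mark for
        	// traffic ingressing from pod (lxc+) interfaces, so replies can be routed
        	// back out the interface they arrived on (see the note referenced above).
        	// For illustration only, it roughly reads:
        	//   iptables -t mangle -A CILIUM_PRE_mangle -i lxc+ \
        	//     -m comment --comment "cilium: primary ENI" \
        	//     -j CONNMARK --restore-mark --nfmask <nfmask> --ctmask <ctmask>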
  1797  	return ip4tables.runProg([]string{
  1798  		"-t", "mangle",
  1799  		"-A", ciliumPreMangleChain,
  1800  		"-i", "lxc+",
  1801  		"-m", "comment", "--comment", "cilium: primary ENI",
  1802  		"-j", "CONNMARK", "--restore-mark", "--nfmask", nfmask, "--ctmask", ctmask})
  1803  }