k8s.io/kubernetes@v1.29.3/pkg/proxy/nftables/proxier.go

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package nftables
    18  
    19  //
    20  // NOTE: this needs to be tested in e2e since it uses nftables for everything.
    21  //
    22  
    23  import (
    24  	"context"
    25  	"crypto/sha256"
    26  	"encoding/base32"
    27  	"fmt"
    28  	"net"
    29  	"reflect"
    30  	"strconv"
    31  	"strings"
    32  	"sync"
    33  	"sync/atomic"
    34  	"time"
    35  
    36  	"github.com/danwinship/knftables"
    37  
    38  	v1 "k8s.io/api/core/v1"
    39  	discovery "k8s.io/api/discovery/v1"
    40  	"k8s.io/apimachinery/pkg/types"
    41  	"k8s.io/apimachinery/pkg/util/sets"
    42  	"k8s.io/apimachinery/pkg/util/wait"
    43  	"k8s.io/client-go/tools/events"
    44  	utilsysctl "k8s.io/component-helpers/node/util/sysctl"
    45  	"k8s.io/klog/v2"
    46  	"k8s.io/kubernetes/pkg/proxy"
    47  	"k8s.io/kubernetes/pkg/proxy/conntrack"
    48  	"k8s.io/kubernetes/pkg/proxy/healthcheck"
    49  	"k8s.io/kubernetes/pkg/proxy/metaproxier"
    50  	"k8s.io/kubernetes/pkg/proxy/metrics"
    51  	proxyutil "k8s.io/kubernetes/pkg/proxy/util"
    52  	proxyutiliptables "k8s.io/kubernetes/pkg/proxy/util/iptables"
    53  	"k8s.io/kubernetes/pkg/util/async"
    54  	utilexec "k8s.io/utils/exec"
    55  	netutils "k8s.io/utils/net"
    56  	"k8s.io/utils/ptr"
    57  )
    58  
    59  const (
    60  	// Our nftables table. All of our chains/sets/maps are created inside this table,
    61  	// so they don't need any "kube-" or "kube-proxy-" prefix of their own.
    62  	kubeProxyTable = "kube-proxy"
    63  
    64  	// service dispatch
    65  	kubeServicesChain       = "services"
    66  	kubeServiceIPsMap       = "service-ips"
    67  	kubeServiceNodePortsMap = "service-nodeports"
    68  
    69  	// set of IPs that accept NodePort traffic
    70  	kubeNodePortIPsSet = "nodeport-ips"
    71  
    72  	// handling for services with no endpoints
    73  	kubeEndpointsCheckChain    = "endpoints-check"
    74  	kubeNoEndpointServicesMap  = "no-endpoint-services"
    75  	kubeNoEndpointNodePortsMap = "no-endpoint-nodeports"
    76  	kubeRejectChain            = "reject-chain"
    77  
    78  	// LoadBalancerSourceRanges handling
    79  	kubeFirewallSet             = "firewall"
    80  	kubeFirewallCheckChain      = "firewall-check"
    81  	kubeFirewallAllowSet        = "firewall-allow"
    82  	kubeFirewallAllowCheckChain = "firewall-allow-check"
    83  
    84  	// masquerading
    85  	kubeMarkMasqChain     = "mark-for-masquerade"
    86  	kubeMasqueradingChain = "masquerading"
    87  
    88  	// chain for special filtering rules
    89  	kubeForwardChain = "forward"
    90  )
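
        // For orientation (illustrative, not generated by this file): all of the
        // objects named above end up inside our own table, so a listing for the IPv4
        // proxier looks roughly like
        //
        //	table ip kube-proxy {
        //	        chain services { ... }
        //	        map service-ips { ... }
        //	        map service-nodeports { ... }
        //	}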
    91  
    92  const sysctlNFConntrackTCPBeLiberal = "net/netfilter/nf_conntrack_tcp_be_liberal"
    93  
    94  // internal struct for storing service information
    95  type servicePortInfo struct {
    96  	*proxy.BaseServicePortInfo
    97  	// The following fields are computed and stored for performance reasons.
    98  	nameString             string
    99  	clusterPolicyChainName string
   100  	localPolicyChainName   string
   101  	externalChainName      string
   102  }
   103  
   104  // returns a new proxy.ServicePort which abstracts a servicePortInfo
   105  func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *proxy.BaseServicePortInfo) proxy.ServicePort {
   106  	svcPort := &servicePortInfo{BaseServicePortInfo: bsvcPortInfo}
   107  
   108  	// Store the following for performance reasons.
   109  	svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
   110  	svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name}
   111  	svcPort.nameString = svcPortName.String()
   112  
   113  	chainNameBase := servicePortChainNameBase(&svcPortName, strings.ToLower(string(svcPort.Protocol())))
   114  	svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
   115  	svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
   116  	svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase
   117  
   118  	return svcPort
   119  }
   120  
   121  // internal struct for endpoints information
   122  type endpointInfo struct {
   123  	*proxy.BaseEndpointInfo
   124  
   125  	chainName       string
   126  	affinitySetName string
   127  }
   128  
   129  // returns a new proxy.Endpoint which abstracts an endpointInfo
   130  func newEndpointInfo(baseInfo *proxy.BaseEndpointInfo, svcPortName *proxy.ServicePortName) proxy.Endpoint {
   131  	chainNameBase := servicePortEndpointChainNameBase(svcPortName, strings.ToLower(string(svcPortName.Protocol)), baseInfo.String())
   132  	return &endpointInfo{
   133  		BaseEndpointInfo: baseInfo,
   134  		chainName:        servicePortEndpointChainNamePrefix + chainNameBase,
   135  		affinitySetName:  servicePortEndpointAffinityNamePrefix + chainNameBase,
   136  	}
   137  }
   138  
   139  // Proxier is an nftables based proxy
   140  type Proxier struct {
   141  	// ipFamily defines the IP family which this proxier is tracking.
   142  	ipFamily v1.IPFamily
   143  
   144  	// endpointsChanges and serviceChanges contain all changes to endpoints and
   145  	// services that have happened since nftables was last synced. For a single object,
   146  	// changes are accumulated, i.e. previous is the state from before all of them,
   147  	// and current is the state after applying all of them.
   148  	endpointsChanges *proxy.EndpointsChangeTracker
   149  	serviceChanges   *proxy.ServiceChangeTracker
   150  
   151  	mu           sync.Mutex // protects the following fields
   152  	svcPortMap   proxy.ServicePortMap
   153  	endpointsMap proxy.EndpointsMap
   154  	nodeLabels   map[string]string
   155  	// endpointSlicesSynced and servicesSynced are set to true
   156  	// when the corresponding objects are synced after startup. This is used to avoid
   157  	// updating nftables with partial data after a kube-proxy restart.
   158  	endpointSlicesSynced bool
   159  	servicesSynced       bool
   160  	initialized          int32
   161  	syncRunner           *async.BoundedFrequencyRunner // governs calls to syncProxyRules
   162  	syncPeriod           time.Duration
   163  
   164  	// These are effectively const and do not need the mutex to be held.
   165  	nftables       knftables.Interface
   166  	masqueradeAll  bool
   167  	masqueradeMark string
   168  	exec           utilexec.Interface
   169  	localDetector  proxyutiliptables.LocalTrafficDetector
   170  	hostname       string
   171  	nodeIP         net.IP
   172  	recorder       events.EventRecorder
   173  
   174  	serviceHealthServer healthcheck.ServiceHealthServer
   175  	healthzServer       *healthcheck.ProxierHealthServer
   176  
   177  	// conntrackTCPLiberal indicates whether the kernel's nf_conntrack_tcp_be_liberal sysctl is set
   178  	conntrackTCPLiberal bool
   179  
   180  	// nodePortAddresses selects the interfaces where nodePort works.
   181  	nodePortAddresses *proxyutil.NodePortAddresses
   182  	// networkInterfacer defines an interface for several net library functions.
   183  	// Injected for test purposes.
   184  	networkInterfacer proxyutil.NetworkInterfacer
   185  
   186  	// staleChains contains information about chains to be deleted later
   187  	staleChains map[string]time.Time
   188  }
   189  
   190  // Proxier implements proxy.Provider
   191  var _ proxy.Provider = &Proxier{}
   192  
   193  // NewProxier returns a new nftables Proxier. Once a proxier is created, it will keep
   194  // nftables up to date in the background and will not terminate if a particular nftables
   195  // call fails.
   196  func NewProxier(ipFamily v1.IPFamily,
   197  	sysctl utilsysctl.Interface,
   198  	syncPeriod time.Duration,
   199  	minSyncPeriod time.Duration,
   200  	masqueradeAll bool,
   201  	masqueradeBit int,
   202  	localDetector proxyutiliptables.LocalTrafficDetector,
   203  	hostname string,
   204  	nodeIP net.IP,
   205  	recorder events.EventRecorder,
   206  	healthzServer *healthcheck.ProxierHealthServer,
   207  	nodePortAddressStrings []string,
   208  	initOnly bool,
   209  ) (*Proxier, error) {
   210  	nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings)
   211  
   212  	// Be conservative in what you do, be liberal in what you accept from others.
   213  	// If it is non-zero, only out-of-window RST segments are marked as INVALID.
   214  	// Ref: https://docs.kernel.org/networking/nf_conntrack-sysctl.html
   215  	conntrackTCPLiberal := false
   216  	if val, err := sysctl.GetSysctl(sysctlNFConntrackTCPBeLiberal); err == nil && val != 0 {
   217  		conntrackTCPLiberal = true
   218  		klog.InfoS("nf_conntrack_tcp_be_liberal set, not installing DROP rules for INVALID packets")
   219  	}
   220  
   221  	if initOnly {
   222  		klog.InfoS("System initialized and --init-only specified")
   223  		return nil, nil
   224  	}
   225  
   226  	// Generate the masquerade mark to use for SNAT rules.
   227  	masqueradeValue := 1 << uint(masqueradeBit)
   228  	masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue)
   229  	klog.V(2).InfoS("Using nftables mark for masquerade", "ipFamily", ipFamily, "mark", masqueradeMark)
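	// (Illustrative arithmetic: kube-proxy's default masquerade bit is 14, which
	// makes masqueradeValue 1<<14 = 0x4000; masqueradeMark is that value printed
	// as a zero-padded hex string.)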
   230  
   231  	serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer)
   232  
   233  	var nftablesFamily knftables.Family
   234  	if ipFamily == v1.IPv4Protocol {
   235  		nftablesFamily = knftables.IPv4Family
   236  	} else {
   237  		nftablesFamily = knftables.IPv6Family
   238  	}
   239  	nft, err := knftables.New(nftablesFamily, kubeProxyTable)
   240  	if err != nil {
   241  		return nil, err
   242  	}
   243  
   244  	proxier := &Proxier{
   245  		ipFamily:            ipFamily,
   246  		svcPortMap:          make(proxy.ServicePortMap),
   247  		serviceChanges:      proxy.NewServiceChangeTracker(newServiceInfo, ipFamily, recorder, nil),
   248  		endpointsMap:        make(proxy.EndpointsMap),
   249  		endpointsChanges:    proxy.NewEndpointsChangeTracker(hostname, newEndpointInfo, ipFamily, recorder, nil),
   250  		syncPeriod:          syncPeriod,
   251  		nftables:            nft,
   252  		masqueradeAll:       masqueradeAll,
   253  		masqueradeMark:      masqueradeMark,
   254  		exec:                utilexec.New(),
   255  		localDetector:       localDetector,
   256  		hostname:            hostname,
   257  		nodeIP:              nodeIP,
   258  		recorder:            recorder,
   259  		serviceHealthServer: serviceHealthServer,
   260  		healthzServer:       healthzServer,
   261  		nodePortAddresses:   nodePortAddresses,
   262  		networkInterfacer:   proxyutil.RealNetwork{},
   263  		conntrackTCPLiberal: conntrackTCPLiberal,
   264  		staleChains:         make(map[string]time.Time),
   265  	}
   266  
   267  	burstSyncs := 2
   268  	klog.V(2).InfoS("NFTables sync params", "ipFamily", ipFamily, "minSyncPeriod", minSyncPeriod, "syncPeriod", syncPeriod, "burstSyncs", burstSyncs)
   269  	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)
   270  
   271  	return proxier, nil
   272  }
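
        // A minimal construction sketch (illustrative only; the concrete values below
        // are assumptions, not what kube-proxy's server setup actually passes):
        //
        //	proxier, err := NewProxier(v1.IPv4Protocol, utilsysctl.New(),
        //		30*time.Second, time.Second, false, 14,
        //		proxyutiliptables.NewNoOpLocalDetector(), "node-1",
        //		netutils.ParseIPSloppy("192.168.1.10"), nil, nil, nil, false)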
   273  
   274  // NewDualStackProxier creates a MetaProxier instance, with IPv4 and IPv6 proxies.
   275  func NewDualStackProxier(
   276  	sysctl utilsysctl.Interface,
   277  	syncPeriod time.Duration,
   278  	minSyncPeriod time.Duration,
   279  	masqueradeAll bool,
   280  	masqueradeBit int,
   281  	localDetectors [2]proxyutiliptables.LocalTrafficDetector,
   282  	hostname string,
   283  	nodeIPs map[v1.IPFamily]net.IP,
   284  	recorder events.EventRecorder,
   285  	healthzServer *healthcheck.ProxierHealthServer,
   286  	nodePortAddresses []string,
   287  	initOnly bool,
   288  ) (proxy.Provider, error) {
   289  	// Create an ipv4 instance of the single-stack proxier
   290  	ipv4Proxier, err := NewProxier(v1.IPv4Protocol, sysctl,
   291  		syncPeriod, minSyncPeriod, masqueradeAll, masqueradeBit, localDetectors[0], hostname,
   292  		nodeIPs[v1.IPv4Protocol], recorder, healthzServer, nodePortAddresses, initOnly)
   293  	if err != nil {
   294  		return nil, fmt.Errorf("unable to create ipv4 proxier: %v", err)
   295  	}
   296  
   297  	ipv6Proxier, err := NewProxier(v1.IPv6Protocol, sysctl,
   298  		syncPeriod, minSyncPeriod, masqueradeAll, masqueradeBit, localDetectors[1], hostname,
   299  		nodeIPs[v1.IPv6Protocol], recorder, healthzServer, nodePortAddresses, initOnly)
   300  	if err != nil {
   301  		return nil, fmt.Errorf("unable to create ipv6 proxier: %v", err)
   302  	}
   303  	if initOnly {
   304  		return nil, nil
   305  	}
   306  	return metaproxier.NewMetaProxier(ipv4Proxier, ipv6Proxier), nil
   307  }
   308  
   309  // nftablesBaseChains lists our "base chains"; those that are directly connected to the
   310  // netfilter hooks (e.g., "postrouting", "input", etc.), as opposed to "regular" chains,
   311  // which are only run when a rule jumps to them. See
   312  // https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains.
   313  //
   314  // These are set up from setupNFTables() and then not directly referenced by
   315  // syncProxyRules().
   316  //
   317  // All of our base chains have names that are just "${type}-${hook}". e.g., "nat-prerouting".
   318  type nftablesBaseChain struct {
   319  	name      string
   320  	chainType knftables.BaseChainType
   321  	hook      knftables.BaseChainHook
   322  	priority  knftables.BaseChainPriority
   323  }
   324  
   325  var nftablesBaseChains = []nftablesBaseChain{
   326  	// We want our filtering rules to operate on pre-DNAT dest IPs, so our filter
   327  	// chains have to run before DNAT.
   328  	{"filter-input", knftables.FilterType, knftables.InputHook, knftables.DNATPriority + "-1"},
   329  	{"filter-forward", knftables.FilterType, knftables.ForwardHook, knftables.DNATPriority + "-1"},
   330  	{"filter-output", knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "-1"},
   331  	{"nat-prerouting", knftables.NATType, knftables.PreroutingHook, knftables.DNATPriority},
   332  	{"nat-output", knftables.NATType, knftables.OutputHook, knftables.DNATPriority},
   333  	{"nat-postrouting", knftables.NATType, knftables.PostroutingHook, knftables.SNATPriority},
   334  }
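
        // For example, the first entry above becomes a filter-type base chain on the
        // input hook whose priority is "DNAT minus one" (numerically -101 for the ip
        // family, where DNAT is -100), so it runs just before any DNAT happens.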
   335  
   336  // nftablesJumpChains lists our top-level "regular chains" that are jumped to directly
   337  // from one of the base chains. These are set up from setupNFTables(), and some of them
   338  // are also referenced in syncProxyRules().
   339  type nftablesJumpChain struct {
   340  	dstChain  string
   341  	srcChain  string
   342  	extraArgs string
   343  }
   344  
   345  var nftablesJumpChains = []nftablesJumpChain{
   346  	{kubeEndpointsCheckChain, "filter-input", "ct state new"},
   347  	{kubeEndpointsCheckChain, "filter-forward", "ct state new"},
   348  	{kubeEndpointsCheckChain, "filter-output", "ct state new"},
   349  
   350  	{kubeForwardChain, "filter-forward", ""},
   351  
   352  	{kubeFirewallCheckChain, "filter-input", "ct state new"},
   353  	{kubeFirewallCheckChain, "filter-output", "ct state new"},
   354  	{kubeFirewallCheckChain, "filter-forward", "ct state new"},
   355  
   356  	{kubeServicesChain, "nat-output", ""},
   357  	{kubeServicesChain, "nat-prerouting", ""},
   358  	{kubeMasqueradingChain, "nat-postrouting", ""},
   359  }
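
        // For example (following how setupNFTables consumes these entries), the first
        // item above adds a rule to the filter-input base chain that reads roughly
        //
        //	ct state new jump endpoints-check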
   360  
   361  // ensureChain adds commands to tx to ensure that chain exists and doesn't contain
   362  // anything from before this transaction, using createdChains to ensure that we don't
   363  // Flush a chain more than once (which would also discard any *new* rules added to it).
   364  func ensureChain(chain string, tx *knftables.Transaction, createdChains sets.Set[string]) {
   365  	if createdChains.Has(chain) {
   366  		return
   367  	}
   368  	tx.Add(&knftables.Chain{
   369  		Name: chain,
   370  	})
   371  	tx.Flush(&knftables.Chain{
   372  		Name: chain,
   373  	})
   374  	createdChains.Insert(chain)
   375  }
   376  
   377  func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
   378  	ipX := "ip"
   379  	ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value
   380  	noLocalhost := "ip daddr != 127.0.0.0/8"
   381  	if proxier.ipFamily == v1.IPv6Protocol {
   382  		ipX = "ip6"
   383  		ipvX_addr = "ipv6_addr"
   384  		noLocalhost = "ip6 daddr != ::1"
   385  	}
   386  
   387  	tx.Add(&knftables.Table{
   388  		Comment: ptr.To("rules for kube-proxy"),
   389  	})
   390  
   391  	// Create and flush base chains
   392  	for _, bc := range nftablesBaseChains {
   393  		chain := &knftables.Chain{
   394  			Name:     bc.name,
   395  			Type:     ptr.To(bc.chainType),
   396  			Hook:     ptr.To(bc.hook),
   397  			Priority: ptr.To(bc.priority),
   398  		}
   399  		tx.Add(chain)
   400  		tx.Flush(chain)
   401  	}
   402  
   403  	// Create and flush ordinary chains and add rules jumping to them
   404  	createdChains := sets.New[string]()
   405  	for _, c := range nftablesJumpChains {
   406  		ensureChain(c.dstChain, tx, createdChains)
   407  		tx.Add(&knftables.Rule{
   408  			Chain: c.srcChain,
   409  			Rule: knftables.Concat(
   410  				c.extraArgs,
   411  				"jump", c.dstChain,
   412  			),
   413  		})
   414  	}
   415  
   416  	// Ensure all of our other "top-level" chains exist
   417  	for _, chain := range []string{kubeServicesChain, kubeForwardChain, kubeMasqueradingChain, kubeMarkMasqChain} {
   418  		ensureChain(chain, tx, createdChains)
   419  	}
   420  
   421  	// Add the rules in the mark-for-masquerade and masquerading chains
   422  	tx.Add(&knftables.Rule{
   423  		Chain: kubeMarkMasqChain,
   424  		Rule: knftables.Concat(
   425  			"mark", "set", "mark", "or", proxier.masqueradeMark,
   426  		),
   427  	})
   428  
   429  	tx.Add(&knftables.Rule{
   430  		Chain: kubeMasqueradingChain,
   431  		Rule: knftables.Concat(
   432  			"mark", "and", proxier.masqueradeMark, "==", "0",
   433  			"return",
   434  		),
   435  	})
   436  	tx.Add(&knftables.Rule{
   437  		Chain: kubeMasqueradingChain,
   438  		Rule: knftables.Concat(
   439  			"mark", "set", "mark", "xor", proxier.masqueradeMark,
   440  		),
   441  	})
   442  	tx.Add(&knftables.Rule{
   443  		Chain: kubeMasqueradingChain,
   444  		Rule:  "masquerade fully-random",
   445  	})
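	// Taken together, the masquerading chain reads roughly as follows
	// (illustrative listing, with 0x4000 standing in for proxier.masqueradeMark):
	//
	//	mark and 0x4000 == 0 return
	//	mark set mark xor 0x4000
	//	masquerade fully-random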
   446  
   447  	// Drop packets in the INVALID state, which could otherwise cause
   448  	// unexpected connection resets if nf_conntrack_tcp_be_liberal is not set.
   449  	// Ref: https://github.com/kubernetes/kubernetes/issues/74839
   450  	// Ref: https://github.com/kubernetes/kubernetes/issues/117924
   451  	if !proxier.conntrackTCPLiberal {
   452  		tx.Add(&knftables.Rule{
   453  			Chain: kubeForwardChain,
   454  			Rule:  "ct state invalid drop",
   455  		})
   456  	}
   457  
   458  	// Fill in nodeport-ips set if needed (or delete it if not). (We do "add+delete"
   459  	// rather than just "delete" when we want to ensure the set doesn't exist, because
   460  	// doing just "delete" would return an error if the set didn't exist.)
   461  	tx.Add(&knftables.Set{
   462  		Name:    kubeNodePortIPsSet,
   463  		Type:    ipvX_addr,
   464  		Comment: ptr.To("IPs that accept NodePort traffic"),
   465  	})
   466  	if proxier.nodePortAddresses.MatchAll() {
   467  		tx.Delete(&knftables.Set{
   468  			Name: kubeNodePortIPsSet,
   469  		})
   470  	} else {
   471  		tx.Flush(&knftables.Set{
   472  			Name: kubeNodePortIPsSet,
   473  		})
   474  		nodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer)
   475  		if err != nil {
   476  			klog.ErrorS(err, "Failed to get node ip address matching nodeport cidrs, services with nodeport may not work as intended", "CIDRs", proxier.nodePortAddresses)
   477  		}
   478  		for _, ip := range nodeIPs {
   479  			if ip.IsLoopback() {
   480  				klog.ErrorS(nil, "--nodeport-addresses includes localhost but localhost NodePorts are not supported", "address", ip.String())
   481  				continue
   482  			}
   483  			tx.Add(&knftables.Element{
   484  				Set: kubeNodePortIPsSet,
   485  				Key: []string{
   486  					ip.String(),
   487  				},
   488  			})
   489  		}
   490  	}
   491  
   492  	// Set up "no endpoints" drop/reject handling
   493  	tx.Add(&knftables.Map{
   494  		Name:    kubeNoEndpointServicesMap,
   495  		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
   496  		Comment: ptr.To("vmap to drop or reject packets to services with no endpoints"),
   497  	})
   498  	tx.Add(&knftables.Map{
   499  		Name:    kubeNoEndpointNodePortsMap,
   500  		Type:    "inet_proto . inet_service : verdict",
   501  		Comment: ptr.To("vmap to drop or reject packets to service nodeports with no endpoints"),
   502  	})
   503  
   504  	tx.Add(&knftables.Chain{
   505  		Name:    kubeRejectChain,
   506  		Comment: ptr.To("helper for @no-endpoint-services / @no-endpoint-nodeports"),
   507  	})
   508  	tx.Flush(&knftables.Chain{
   509  		Name: kubeRejectChain,
   510  	})
   511  	tx.Add(&knftables.Rule{
   512  		Chain: kubeRejectChain,
   513  		Rule:  "reject",
   514  	})
   515  
   516  	tx.Add(&knftables.Rule{
   517  		Chain: kubeEndpointsCheckChain,
   518  		Rule: knftables.Concat(
   519  			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
   520  			"vmap", "@", kubeNoEndpointServicesMap,
   521  		),
   522  	})
   523  
   524  	if proxier.nodePortAddresses.MatchAll() {
   525  		tx.Add(&knftables.Rule{
   526  			Chain: kubeEndpointsCheckChain,
   527  			Rule: knftables.Concat(
   528  				"fib daddr type local",
   529  				noLocalhost,
   530  				"meta l4proto . th dport",
   531  				"vmap", "@", kubeNoEndpointNodePortsMap,
   532  			),
   533  		})
   534  	} else {
   535  		tx.Add(&knftables.Rule{
   536  			Chain: kubeEndpointsCheckChain,
   537  			Rule: knftables.Concat(
   538  				ipX, "daddr", "@", kubeNodePortIPsSet,
   539  				"meta l4proto . th dport",
   540  				"vmap", "@", kubeNoEndpointNodePortsMap,
   541  			),
   542  		})
   543  	}
   544  
   545  	// Set up LoadBalancerSourceRanges firewalling
   546  	tx.Add(&knftables.Set{
   547  		Name:    kubeFirewallSet,
   548  		Type:    ipvX_addr + " . inet_proto . inet_service",
   549  		Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
   550  	})
   551  	tx.Add(&knftables.Set{
   552  		Name:    kubeFirewallAllowSet,
   553  		Type:    ipvX_addr + " . inet_proto . inet_service . " + ipvX_addr,
   554  		Flags:   []knftables.SetFlag{knftables.IntervalFlag},
   555  		Comment: ptr.To("destinations+sources that are allowed by LoadBalancerSourceRanges"),
   556  	})
   557  
   558  	ensureChain(kubeFirewallCheckChain, tx, createdChains)
   559  	ensureChain(kubeFirewallAllowCheckChain, tx, createdChains)
   560  	tx.Add(&knftables.Rule{
   561  		Chain: kubeFirewallCheckChain,
   562  		Rule: knftables.Concat(
   563  			ipX, "daddr", ".", "meta l4proto", ".", "th dport", "@", kubeFirewallSet,
   564  			"jump", kubeFirewallAllowCheckChain,
   565  		),
   566  	})
   567  	tx.Add(&knftables.Rule{
   568  		Chain: kubeFirewallAllowCheckChain,
   569  		Rule: knftables.Concat(
   570  			ipX, "daddr", ".", "meta l4proto", ".", "th dport", ".", ipX, "saddr", "@", kubeFirewallAllowSet,
   571  			"return",
   572  		),
   573  	})
   574  	tx.Add(&knftables.Rule{
   575  		Chain: kubeFirewallAllowCheckChain,
   576  		Rule:  "drop",
   577  	})
   578  
   579  	// Set up service dispatch
   580  	tx.Add(&knftables.Map{
   581  		Name:    kubeServiceIPsMap,
   582  		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
   583  		Comment: ptr.To("ClusterIP, ExternalIP and LoadBalancer IP traffic"),
   584  	})
   585  	tx.Add(&knftables.Map{
   586  		Name:    kubeServiceNodePortsMap,
   587  		Type:    "inet_proto . inet_service : verdict",
   588  		Comment: ptr.To("NodePort traffic"),
   589  	})
   590  	tx.Add(&knftables.Rule{
   591  		Chain: kubeServicesChain,
   592  		Rule: knftables.Concat(
   593  			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
   594  			"vmap", "@", kubeServiceIPsMap,
   595  		),
   596  	})
   597  	if proxier.nodePortAddresses.MatchAll() {
   598  		tx.Add(&knftables.Rule{
   599  			Chain: kubeServicesChain,
   600  			Rule: knftables.Concat(
   601  				"fib daddr type local",
   602  				noLocalhost,
   603  				"meta l4proto . th dport",
   604  				"vmap", "@", kubeServiceNodePortsMap,
   605  			),
   606  		})
   607  	} else {
   608  		tx.Add(&knftables.Rule{
   609  			Chain: kubeServicesChain,
   610  			Rule: knftables.Concat(
   611  				ipX, "daddr @nodeport-ips",
   612  				"meta l4proto . th dport",
   613  				"vmap", "@", kubeServiceNodePortsMap,
   614  			),
   615  		})
   616  	}
   617  }
   618  
   619  // CleanupLeftovers removes all nftables rules and chains created by the Proxier
   620  // It returns true if an error was encountered. Errors are logged.
   621  func CleanupLeftovers() bool {
   622  	var encounteredError bool
   623  
   624  	for _, family := range []knftables.Family{knftables.IPv4Family, knftables.IPv6Family} {
   625  		nft, err := knftables.New(family, kubeProxyTable)
   626  		if err == nil {
   627  			tx := nft.NewTransaction()
   628  			tx.Delete(&knftables.Table{})
   629  			err = nft.Run(context.TODO(), tx)
   630  		}
   631  		if err != nil && !knftables.IsNotFound(err) {
   632  			klog.ErrorS(err, "Error cleaning up nftables rules")
   633  			encounteredError = true
   634  		}
   635  	}
   636  
   637  	return encounteredError
   638  }
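
        // The transaction above is roughly equivalent to running
        // "nft delete table ip kube-proxy" and "nft delete table ip6 kube-proxy",
        // treating "not found" as success.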
   639  
   640  // Sync is called to synchronize the proxier state to nftables as soon as possible.
   641  func (proxier *Proxier) Sync() {
   642  	if proxier.healthzServer != nil {
   643  		proxier.healthzServer.QueuedUpdate(proxier.ipFamily)
   644  	}
   645  	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
   646  	proxier.syncRunner.Run()
   647  }
   648  
   649  // SyncLoop runs periodic work.  This is expected to run as a goroutine or as the main loop of the app.  It does not return.
   650  func (proxier *Proxier) SyncLoop() {
   651  	// Update healthz timestamp at beginning in case Sync() never succeeds.
   652  	if proxier.healthzServer != nil {
   653  		proxier.healthzServer.Updated(proxier.ipFamily)
   654  	}
   655  
   656  	// synthesize "last change queued" time as the informers are syncing.
   657  	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
   658  	proxier.syncRunner.Loop(wait.NeverStop)
   659  }
   660  
   661  func (proxier *Proxier) setInitialized(value bool) {
   662  	var initialized int32
   663  	if value {
   664  		initialized = 1
   665  	}
   666  	atomic.StoreInt32(&proxier.initialized, initialized)
   667  }
   668  
   669  func (proxier *Proxier) isInitialized() bool {
   670  	return atomic.LoadInt32(&proxier.initialized) > 0
   671  }
   672  
   673  // OnServiceAdd is called whenever creation of a new service object
   674  // is observed.
   675  func (proxier *Proxier) OnServiceAdd(service *v1.Service) {
   676  	proxier.OnServiceUpdate(nil, service)
   677  }
   678  
   679  // OnServiceUpdate is called whenever modification of an existing
   680  // service object is observed.
   681  func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) {
   682  	if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() {
   683  		proxier.Sync()
   684  	}
   685  }
   686  
   687  // OnServiceDelete is called whenever deletion of an existing service
   688  // object is observed.
   689  func (proxier *Proxier) OnServiceDelete(service *v1.Service) {
   690  	proxier.OnServiceUpdate(service, nil)
   691  
   692  }
   693  
   694  // OnServiceSynced is called once all the initial event handlers were
   695  // called and the state is fully propagated to local cache.
   696  func (proxier *Proxier) OnServiceSynced() {
   697  	proxier.mu.Lock()
   698  	proxier.servicesSynced = true
   699  	proxier.setInitialized(proxier.endpointSlicesSynced)
   700  	proxier.mu.Unlock()
   701  
   702  	// Sync unconditionally - this is called once per lifetime.
   703  	proxier.syncProxyRules()
   704  }
   705  
   706  // OnEndpointSliceAdd is called whenever creation of a new endpoint slice object
   707  // is observed.
   708  func (proxier *Proxier) OnEndpointSliceAdd(endpointSlice *discovery.EndpointSlice) {
   709  	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
   710  		proxier.Sync()
   711  	}
   712  }
   713  
   714  // OnEndpointSliceUpdate is called whenever modification of an existing endpoint
   715  // slice object is observed.
   716  func (proxier *Proxier) OnEndpointSliceUpdate(_, endpointSlice *discovery.EndpointSlice) {
   717  	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
   718  		proxier.Sync()
   719  	}
   720  }
   721  
   722  // OnEndpointSliceDelete is called whenever deletion of an existing endpoint slice
   723  // object is observed.
   724  func (proxier *Proxier) OnEndpointSliceDelete(endpointSlice *discovery.EndpointSlice) {
   725  	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, true) && proxier.isInitialized() {
   726  		proxier.Sync()
   727  	}
   728  }
   729  
   730  // OnEndpointSlicesSynced is called once all the initial event handlers were
   731  // called and the state is fully propagated to local cache.
   732  func (proxier *Proxier) OnEndpointSlicesSynced() {
   733  	proxier.mu.Lock()
   734  	proxier.endpointSlicesSynced = true
   735  	proxier.setInitialized(proxier.servicesSynced)
   736  	proxier.mu.Unlock()
   737  
   738  	// Sync unconditionally - this is called once per lifetime.
   739  	proxier.syncProxyRules()
   740  }
   741  
   742  // OnNodeAdd is called whenever creation of a new node object
   743  // is observed.
   744  func (proxier *Proxier) OnNodeAdd(node *v1.Node) {
   745  	if node.Name != proxier.hostname {
   746  		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
   747  			"eventNode", node.Name, "currentNode", proxier.hostname)
   748  		return
   749  	}
   750  
   751  	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
   752  		return
   753  	}
   754  
   755  	proxier.mu.Lock()
   756  	proxier.nodeLabels = map[string]string{}
   757  	for k, v := range node.Labels {
   758  		proxier.nodeLabels[k] = v
   759  	}
   760  	proxier.mu.Unlock()
   761  	klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels)
   762  
   763  	proxier.Sync()
   764  }
   765  
   766  // OnNodeUpdate is called whenever modification of an existing
   767  // node object is observed.
   768  func (proxier *Proxier) OnNodeUpdate(oldNode, node *v1.Node) {
   769  	if node.Name != proxier.hostname {
   770  		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
   771  			"eventNode", node.Name, "currentNode", proxier.hostname)
   772  		return
   773  	}
   774  
   775  	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
   776  		return
   777  	}
   778  
   779  	proxier.mu.Lock()
   780  	proxier.nodeLabels = map[string]string{}
   781  	for k, v := range node.Labels {
   782  		proxier.nodeLabels[k] = v
   783  	}
   784  	proxier.mu.Unlock()
   785  	klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels)
   786  
   787  	proxier.Sync()
   788  }
   789  
   790  // OnNodeDelete is called whenever deletion of an existing node
   791  // object is observed.
   792  func (proxier *Proxier) OnNodeDelete(node *v1.Node) {
   793  	if node.Name != proxier.hostname {
   794  		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
   795  			"eventNode", node.Name, "currentNode", proxier.hostname)
   796  		return
   797  	}
   798  
   799  	proxier.mu.Lock()
   800  	proxier.nodeLabels = nil
   801  	proxier.mu.Unlock()
   802  
   803  	proxier.Sync()
   804  }
   805  
   806  // OnNodeSynced is called once all the initial event handlers were
   807  // called and the state is fully propagated to local cache.
   808  func (proxier *Proxier) OnNodeSynced() {
   809  }
   810  
   811  const (
   812  	// Maximum length for one of our chain name prefixes, including the trailing
   813  	// hyphen.
   814  	chainNamePrefixLengthMax = 16
   815  
   816  	// Maximum length of the string returned from servicePortChainNameBase or
   817  	// servicePortEndpointChainNameBase.
   818  	chainNameBaseLengthMax = knftables.NameLengthMax - chainNamePrefixLengthMax
   819  )
   820  
   821  const (
   822  	servicePortPolicyClusterChainNamePrefix = "service-"
   823  	servicePortPolicyLocalChainNamePrefix   = "local-"
   824  	serviceExternalChainNamePrefix          = "external-"
   825  	servicePortEndpointChainNamePrefix      = "endpoint-"
   826  	servicePortEndpointAffinityNamePrefix   = "affinity-"
   827  )
   828  
   829  // hashAndTruncate prefixes name with a hash of itself and then truncates to
   830  // chainNameBaseLengthMax. The hash ensures that (a) the name is still unique if we have
   831  // to truncate the end, and (b) it's visually distinguishable from other chains that would
   832  // otherwise have nearly identical names (e.g., different endpoint chains for a given
   833  // service that differ in only a single digit).
   834  func hashAndTruncate(name string) string {
   835  	hash := sha256.Sum256([]byte(name))
   836  	encoded := base32.StdEncoding.EncodeToString(hash[:])
   837  	name = encoded[:8] + "-" + name
   838  	if len(name) > chainNameBaseLengthMax {
   839  		name = name[:chainNameBaseLengthMax-3] + "..."
   840  	}
   841  	return name
   842  }
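
        // For example, per the servicePortChainNameBase doc below,
        // hashAndTruncate("ns1/svc1/tcp/p80") yields "ULMVA6XW-ns1/svc1/tcp/p80":
        // an 8-character base32 prefix of the SHA-256 hash, a hyphen, and the
        // (possibly truncated) original name.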
   843  
   844  // servicePortChainNameBase returns the base name for a chain for the given ServicePort.
   845  // This is something like "HASH-namespace/serviceName/protocol/portName", e.g,
   846  // "ULMVA6XW-ns1/svc1/tcp/p80".
   847  func servicePortChainNameBase(servicePortName *proxy.ServicePortName, protocol string) string {
   848  	// nftables chains can contain the characters [A-Za-z0-9_./-] (but must start with
   849  	// a letter, underscore, or dot).
   850  	//
   851  	// Namespace, Service, and Port names can contain [a-z0-9-] (with some additional
   852  	// restrictions that aren't relevant here).
   853  	//
   854  	// Protocol is /(tcp|udp|sctp)/.
   855  	//
   856  	// Thus, we can safely use all Namespace names, Service names, protocol values,
   857  	// and Port names directly in nftables chain names (though note that this assumes
   858  	// that the chain name won't *start* with any of those strings, since that might
   859  	// be illegal). We use "/" to separate the parts of the name, which is one of the
   860  	// two characters allowed in a chain name that isn't allowed in our input strings.
   861  
   862  	name := fmt.Sprintf("%s/%s/%s/%s",
   863  		servicePortName.NamespacedName.Namespace,
   864  		servicePortName.NamespacedName.Name,
   865  		protocol,
   866  		servicePortName.Port,
   867  	)
   868  
   869  	// The namespace, service, and port name can each be up to 63 characters, protocol
   870  	// can be up to 4, plus 8 for the hash and 4 additional punctuation characters.
   871  	// That's a total of 205, which is less than chainNameBaseLengthMax (240). So this
   872  	// will never actually return a truncated name.
   873  	return hashAndTruncate(name)
   874  }
   875  
   876  // servicePortEndpointChainNameBase returns the base name for chains for the given
   877  // endpoint. This is something like
   878  // "HASH-namespace/serviceName/protocol/portName__endpointIP/endpointport", e.g.,
   879  // "5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80".
   880  func servicePortEndpointChainNameBase(servicePortName *proxy.ServicePortName, protocol, endpoint string) string {
   881  	// As above in servicePortChainNameBase: Namespace, Service, Port, Protocol, and
   882  	// EndpointPort are all safe to copy into the chain name directly. But if
   883  	// EndpointIP is IPv6 then it will contain colons, which aren't allowed in a chain
   884  	// name. IPv6 IPs are also quite long, but we can't safely truncate them (e.g. to
   885  	// only the final segment) because (especially for manually-created external
   886  	// endpoints), we can't know for sure that any part of them is redundant.
   887  
   888  	endpointIP, endpointPort, _ := net.SplitHostPort(endpoint)
   889  	if strings.Contains(endpointIP, ":") {
   890  		endpointIP = strings.ReplaceAll(endpointIP, ":", ".")
   891  	}
   892  
   893  	// As above, we use "/" to separate parts of the name, and "__" to separate the
   894  	// "service" part from the "endpoint" part.
   895  	name := fmt.Sprintf("%s/%s/%s/%s__%s/%s",
   896  		servicePortName.NamespacedName.Namespace,
   897  		servicePortName.NamespacedName.Name,
   898  		protocol,
   899  		servicePortName.Port,
   900  		endpointIP,
   901  		endpointPort,
   902  	)
   903  
   904  	// The part of name before the "__" can be up to 205 characters (as with
   905  	// servicePortChainNameBase above). An IPv6 address can be up to 39 characters, and
   906  	// a port can be up to 5 digits, plus 3 punctuation characters gives a max total
   907  	// length of 252, well over chainNameBaseLengthMax (240), so truncation is
   908  	// theoretically possible (though incredibly unlikely).
   909  	return hashAndTruncate(name)
   910  }
   911  
   912  func isServiceChainName(chainString string) bool {
   913  	// The chains returned from servicePortChainNameBase and
   914  	// servicePortEndpointChainNameBase will always have at least one "/" in them.
   915  	// Since none of our "stock" chain names use slashes, we can distinguish them this
   916  	// way.
   917  	return strings.Contains(chainString, "/")
   918  }
   919  
   920  func isAffinitySetName(set string) bool {
   921  	return strings.HasPrefix(set, servicePortEndpointAffinityNamePrefix)
   922  }
   923  
   924  // This is where all of the nftables calls happen.
   925  // This assumes proxier.mu is NOT held
   926  func (proxier *Proxier) syncProxyRules() {
   927  	proxier.mu.Lock()
   928  	defer proxier.mu.Unlock()
   929  
   930  	// don't sync rules till we've received services and endpoints
   931  	if !proxier.isInitialized() {
   932  		klog.V(2).InfoS("Not syncing nftables until Services and Endpoints have been received from master")
   933  		return
   934  	}
   935  
   936  	//
   937  	// Below this point we will not return until we try to write the nftables rules.
   938  	//
   939  
   940  	// Keep track of how long syncs take.
   941  	start := time.Now()
   942  	defer func() {
   943  		metrics.SyncProxyRulesLatency.Observe(metrics.SinceInSeconds(start))
   944  		klog.V(2).InfoS("SyncProxyRules complete", "elapsed", time.Since(start))
   945  	}()
   946  
   947  	serviceUpdateResult := proxier.svcPortMap.Update(proxier.serviceChanges)
   948  	endpointUpdateResult := proxier.endpointsMap.Update(proxier.endpointsChanges)
   949  
   950  	klog.V(2).InfoS("Syncing nftables rules")
   951  
   952  	success := false
   953  	defer func() {
   954  		if !success {
   955  			klog.InfoS("Sync failed", "retryingTime", proxier.syncPeriod)
   956  			proxier.syncRunner.RetryAfter(proxier.syncPeriod)
   957  		}
   958  	}()
   959  
   960  	// If there are sufficiently-stale chains left over from previous transactions,
   961  	// try to delete them now.
   962  	if len(proxier.staleChains) > 0 {
   963  		oneSecondAgo := start.Add(-time.Second)
   964  		tx := proxier.nftables.NewTransaction()
   965  		deleted := 0
   966  		for chain, modtime := range proxier.staleChains {
   967  			if modtime.Before(oneSecondAgo) {
   968  				tx.Delete(&knftables.Chain{
   969  					Name: chain,
   970  				})
   971  				delete(proxier.staleChains, chain)
   972  				deleted++
   973  			}
   974  		}
   975  		if deleted > 0 {
   976  			klog.InfoS("Deleting stale nftables chains", "numChains", deleted)
   977  			err := proxier.nftables.Run(context.TODO(), tx)
   978  			if err != nil {
   979  				// We already deleted the entries from staleChains, but if
   980  				// the chains still exist, they'll just get added back
   981  				// (with a later timestamp) at the end of the sync.
   982  				klog.ErrorS(err, "Unable to delete stale chains; will retry later")
   983  				// FIXME: metric
   984  			}
   985  		}
   986  	}
   987  
   988  	// Now start the actual syncing transaction
   989  	tx := proxier.nftables.NewTransaction()
   990  	proxier.setupNFTables(tx)
   991  
   992  	// We need to use, e.g., "ip daddr" for IPv4 but "ip6 daddr" for IPv6
   993  	ipX := "ip"
   994  	ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value
   995  	if proxier.ipFamily == v1.IPv6Protocol {
   996  		ipX = "ip6"
   997  		ipvX_addr = "ipv6_addr"
   998  	}
   999  
  1000  	// We currently fully rebuild our sets and maps on each resync
  1001  	tx.Flush(&knftables.Set{
  1002  		Name: kubeFirewallSet,
  1003  	})
  1004  	tx.Flush(&knftables.Set{
  1005  		Name: kubeFirewallAllowSet,
  1006  	})
  1007  	tx.Flush(&knftables.Map{
  1008  		Name: kubeNoEndpointServicesMap,
  1009  	})
  1010  	tx.Flush(&knftables.Map{
  1011  		Name: kubeNoEndpointNodePortsMap,
  1012  	})
  1013  	tx.Flush(&knftables.Map{
  1014  		Name: kubeServiceIPsMap,
  1015  	})
  1016  	tx.Flush(&knftables.Map{
  1017  		Name: kubeServiceNodePortsMap,
  1018  	})
  1019  
  1020  	// Accumulate service/endpoint chains and affinity sets to keep.
  1021  	activeChains := sets.New[string]()
  1022  	activeAffinitySets := sets.New[string]()
  1023  
  1024  	// Compute total number of endpoint chains across all services
  1025  	// to get a sense of how big the cluster is.
  1026  	totalEndpoints := 0
  1027  	for svcName := range proxier.svcPortMap {
  1028  		totalEndpoints += len(proxier.endpointsMap[svcName])
  1029  	}
  1030  
  1031  	// These two variables are used to publish the sync_proxy_rules_no_endpoints_total
  1032  	// metric.
  1033  	serviceNoLocalEndpointsTotalInternal := 0
  1034  	serviceNoLocalEndpointsTotalExternal := 0
  1035  
  1036  	// Build rules for each service-port.
  1037  	for svcName, svc := range proxier.svcPortMap {
  1038  		svcInfo, ok := svc.(*servicePortInfo)
  1039  		if !ok {
  1040  			klog.ErrorS(nil, "Failed to cast serviceInfo", "serviceName", svcName)
  1041  			continue
  1042  		}
  1043  		protocol := strings.ToLower(string(svcInfo.Protocol()))
  1044  		svcPortNameString := svcInfo.nameString
  1045  
  1046  		// Figure out the endpoints for Cluster and Local traffic policy.
  1047  		// allLocallyReachableEndpoints is the set of all endpoints that can be routed to
  1048  		// from this node, given the service's traffic policies. hasEndpoints is true
  1049  		// if the service has any usable endpoints on any node, not just this one.
  1050  		allEndpoints := proxier.endpointsMap[svcName]
  1051  		clusterEndpoints, localEndpoints, allLocallyReachableEndpoints, hasEndpoints := proxy.CategorizeEndpoints(allEndpoints, svcInfo, proxier.nodeLabels)
  1052  
  1053  		// Note the endpoint chains that will be used
  1054  		for _, ep := range allLocallyReachableEndpoints {
  1055  			if epInfo, ok := ep.(*endpointInfo); ok {
  1056  				ensureChain(epInfo.chainName, tx, activeChains)
  1057  			}
  1058  		}
  1059  
  1060  		// clusterPolicyChain contains the endpoints used with "Cluster" traffic policy
  1061  		clusterPolicyChain := svcInfo.clusterPolicyChainName
  1062  		usesClusterPolicyChain := len(clusterEndpoints) > 0 && svcInfo.UsesClusterEndpoints()
  1063  		if usesClusterPolicyChain {
  1064  			ensureChain(clusterPolicyChain, tx, activeChains)
  1065  		}
  1066  
  1067  		// localPolicyChain contains the endpoints used with "Local" traffic policy
  1068  		localPolicyChain := svcInfo.localPolicyChainName
  1069  		usesLocalPolicyChain := len(localEndpoints) > 0 && svcInfo.UsesLocalEndpoints()
  1070  		if usesLocalPolicyChain {
  1071  			ensureChain(localPolicyChain, tx, activeChains)
  1072  		}
  1073  
  1074  		// internalPolicyChain is the chain containing the endpoints for
  1075  		// "internal" (ClusterIP) traffic. internalTrafficChain is the chain that
  1076  		// internal traffic is routed to (which is always the same as
  1077  		// internalPolicyChain). hasInternalEndpoints is true if we should
  1078  		// generate rules pointing to internalTrafficChain, or false if there are
  1079  		// no available internal endpoints.
  1080  		internalPolicyChain := clusterPolicyChain
  1081  		hasInternalEndpoints := hasEndpoints
  1082  		if svcInfo.InternalPolicyLocal() {
  1083  			internalPolicyChain = localPolicyChain
  1084  			if len(localEndpoints) == 0 {
  1085  				hasInternalEndpoints = false
  1086  			}
  1087  		}
  1088  		internalTrafficChain := internalPolicyChain
  1089  
  1090  		// Similarly, externalPolicyChain is the chain containing the endpoints
  1091  		// for "external" (NodePort, LoadBalancer, and ExternalIP) traffic.
  1092  		// externalTrafficChain is the chain that external traffic is routed to
  1093  		// (which is always the service's "EXT" chain). hasExternalEndpoints is
  1094  		// true if there are endpoints that will be reached by external traffic.
  1095  		// (But we may still have to generate externalTrafficChain even if there
  1096  		// are no external endpoints, to ensure that the short-circuit rules for
  1097  		// local traffic are set up.)
  1098  		externalPolicyChain := clusterPolicyChain
  1099  		hasExternalEndpoints := hasEndpoints
  1100  		if svcInfo.ExternalPolicyLocal() {
  1101  			externalPolicyChain = localPolicyChain
  1102  			if len(localEndpoints) == 0 {
  1103  				hasExternalEndpoints = false
  1104  			}
  1105  		}
  1106  		externalTrafficChain := svcInfo.externalChainName // eventually jumps to externalPolicyChain
  1107  
  1108  		// usesExternalTrafficChain is based on hasEndpoints, not hasExternalEndpoints,
  1109  		// because we need the local-traffic-short-circuiting rules even when there
  1110  		// are no externally-usable endpoints.
  1111  		usesExternalTrafficChain := hasEndpoints && svcInfo.ExternallyAccessible()
  1112  		if usesExternalTrafficChain {
  1113  			ensureChain(externalTrafficChain, tx, activeChains)
  1114  		}
  1115  
  1116  		var internalTrafficFilterVerdict, externalTrafficFilterVerdict string
  1117  		if !hasEndpoints {
  1118  			// The service has no endpoints at all; hasInternalEndpoints and
  1119  			// hasExternalEndpoints will also be false, and we will not
  1120  			// generate any chains in the "nat" table for the service; only
  1121  			// rules in the "filter" table rejecting incoming packets for
  1122  			// the service's IPs.
  1123  			internalTrafficFilterVerdict = fmt.Sprintf("goto %s", kubeRejectChain)
  1124  			externalTrafficFilterVerdict = fmt.Sprintf("goto %s", kubeRejectChain)
  1125  		} else {
  1126  			if !hasInternalEndpoints {
  1127  				// The internalTrafficPolicy is "Local" but there are no local
  1128  				// endpoints. Traffic to the clusterIP will be dropped, but
  1129  				// external traffic may still be accepted.
  1130  				internalTrafficFilterVerdict = "drop"
  1131  				serviceNoLocalEndpointsTotalInternal++
  1132  			}
  1133  			if !hasExternalEndpoints {
  1134  				// The externalTrafficPolicy is "Local" but there are no
  1135  				// local endpoints. Traffic to "external" IPs from outside
  1136  				// the cluster will be dropped, but traffic from inside
  1137  				// the cluster may still be accepted.
  1138  				externalTrafficFilterVerdict = "drop"
  1139  				serviceNoLocalEndpointsTotalExternal++
  1140  			}
  1141  		}
  1142  
  1143  		// Capture the clusterIP.
  1144  		if hasInternalEndpoints {
  1145  			tx.Add(&knftables.Element{
  1146  				Map: kubeServiceIPsMap,
  1147  				Key: []string{
  1148  					svcInfo.ClusterIP().String(),
  1149  					protocol,
  1150  					strconv.Itoa(svcInfo.Port()),
  1151  				},
  1152  				Value: []string{
  1153  					fmt.Sprintf("goto %s", internalTrafficChain),
  1154  				},
  1155  			})
  1156  		} else {
  1157  			// No endpoints.
  1158  			tx.Add(&knftables.Element{
  1159  				Map: kubeNoEndpointServicesMap,
  1160  				Key: []string{
  1161  					svcInfo.ClusterIP().String(),
  1162  					protocol,
  1163  					strconv.Itoa(svcInfo.Port()),
  1164  				},
  1165  				Value: []string{
  1166  					internalTrafficFilterVerdict,
  1167  				},
  1168  				Comment: &svcPortNameString,
  1169  			})
  1170  		}
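		// Illustrative elements for a hypothetical service "ns1/svc1:p80" with
		// ClusterIP 172.30.0.41 on TCP port 80: with usable internal endpoints
		// the @service-ips vmap gains
		//	172.30.0.41 . tcp . 80 : goto service-ULMVA6XW-ns1/svc1/tcp/p80
		// and with none it instead gains an @no-endpoint-services entry whose
		// verdict is "goto reject-chain" or "drop".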
  1171  
  1172  		// Capture externalIPs.
  1173  		for _, externalIP := range svcInfo.ExternalIPStrings() {
  1174  			if hasEndpoints {
  1175  				// Send traffic bound for external IPs to the "external
  1176  				// destinations" chain.
  1177  				tx.Add(&knftables.Element{
  1178  					Map: kubeServiceIPsMap,
  1179  					Key: []string{
  1180  						externalIP,
  1181  						protocol,
  1182  						strconv.Itoa(svcInfo.Port()),
  1183  					},
  1184  					Value: []string{
  1185  						fmt.Sprintf("goto %s", externalTrafficChain),
  1186  					},
  1187  				})
  1188  			}
  1189  			if !hasExternalEndpoints {
  1190  				// Either no endpoints at all (REJECT) or no endpoints for
  1191  				// external traffic (DROP anything that didn't get
  1192  				// short-circuited by the EXT chain.)
  1193  				tx.Add(&knftables.Element{
  1194  					Map: kubeNoEndpointServicesMap,
  1195  					Key: []string{
  1196  						externalIP,
  1197  						protocol,
  1198  						strconv.Itoa(svcInfo.Port()),
  1199  					},
  1200  					Value: []string{
  1201  						externalTrafficFilterVerdict,
  1202  					},
  1203  					Comment: &svcPortNameString,
  1204  				})
  1205  			}
  1206  		}
  1207  
  1208  		// Capture load-balancer ingress.
  1209  		for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
  1210  			if hasEndpoints {
  1211  				tx.Add(&knftables.Element{
  1212  					Map: kubeServiceIPsMap,
  1213  					Key: []string{
  1214  						lbip,
  1215  						protocol,
  1216  						strconv.Itoa(svcInfo.Port()),
  1217  					},
  1218  					Value: []string{
  1219  						fmt.Sprintf("goto %s", externalTrafficChain),
  1220  					},
  1221  				})
  1222  			}
  1223  
  1224  			if len(svcInfo.LoadBalancerSourceRanges()) > 0 {
  1225  				tx.Add(&knftables.Element{
  1226  					Set: kubeFirewallSet,
  1227  					Key: []string{
  1228  						lbip,
  1229  						protocol,
  1230  						strconv.Itoa(svcInfo.Port()),
  1231  					},
  1232  					Comment: &svcPortNameString,
  1233  				})
  1234  
  1235  				allowFromNode := false
  1236  				for _, src := range svcInfo.LoadBalancerSourceRanges() {
  1237  					_, cidr, _ := netutils.ParseCIDRSloppy(src)
  1238  					if cidr == nil {
  1239  						continue
  1240  					}
  1241  					tx.Add(&knftables.Element{
  1242  						Set: kubeFirewallAllowSet,
  1243  						Key: []string{
  1244  							lbip,
  1245  							protocol,
  1246  							strconv.Itoa(svcInfo.Port()),
  1247  							src,
  1248  						},
  1249  						Comment: &svcPortNameString,
  1250  					})
  1251  					if cidr.Contains(proxier.nodeIP) {
  1252  						allowFromNode = true
  1253  					}
  1254  				}
  1255  				// For VIP-like LBs, the VIP is often added as a local
  1256  				// address (via an IP route rule).  In that case, a request
  1257  				// from a node to the VIP will not hit the loadbalancer but
  1258  				// will loop back with the source IP set to the VIP.  We
  1259  				// need the following rules to allow requests from this node.
  1260  				if allowFromNode {
  1261  					tx.Add(&knftables.Element{
  1262  						Set: kubeFirewallAllowSet,
  1263  						Key: []string{
  1264  							lbip,
  1265  							protocol,
  1266  							strconv.Itoa(svcInfo.Port()),
  1267  							lbip,
  1268  						},
  1269  					})
  1270  				}
  1271  			}
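			// Illustrative firewall elements for a hypothetical LoadBalancer VIP
			// 192.0.2.10 on TCP port 80 with source range 203.0.113.0/24:
			// @firewall gains
			//	192.0.2.10 . tcp . 80
			// and @firewall-allow gains
			//	192.0.2.10 . tcp . 80 . 203.0.113.0/24
			// so the firewall-check chain only admits sources listed in the allow set.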
  1272  		}
  1273  		if !hasExternalEndpoints {
  1274  			// Either no endpoints at all (REJECT) or no endpoints for
  1275  			// external traffic (DROP anything that didn't get short-circuited
  1276  			// by the EXT chain.)
  1277  			for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
  1278  				tx.Add(&knftables.Element{
  1279  					Map: kubeNoEndpointServicesMap,
  1280  					Key: []string{
  1281  						lbip,
  1282  						protocol,
  1283  						strconv.Itoa(svcInfo.Port()),
  1284  					},
  1285  					Value: []string{
  1286  						externalTrafficFilterVerdict,
  1287  					},
  1288  					Comment: &svcPortNameString,
  1289  				})
  1290  			}
  1291  		}
  1292  
  1293  		// Capture nodeports.
  1294  		if svcInfo.NodePort() != 0 {
  1295  			if hasEndpoints {
  1296  				// Jump to the external destination chain.  For better or for
  1297  				// worse, nodeports are not subject to loadBalancerSourceRanges,
  1298  				// and we can't change that.
  1299  				tx.Add(&knftables.Element{
  1300  					Map: kubeServiceNodePortsMap,
  1301  					Key: []string{
  1302  						protocol,
  1303  						strconv.Itoa(svcInfo.NodePort()),
  1304  					},
  1305  					Value: []string{
  1306  						fmt.Sprintf("goto %s", externalTrafficChain),
  1307  					},
  1308  				})
  1309  			}
  1310  			if !hasExternalEndpoints {
  1311  				// Either no endpoints at all (REJECT) or no endpoints for
  1312  				// external traffic (DROP anything that didn't get
  1313  				// short-circuited by the EXT chain.)
  1314  				tx.Add(&knftables.Element{
  1315  					Map: kubeNoEndpointNodePortsMap,
  1316  					Key: []string{
  1317  						protocol,
  1318  						strconv.Itoa(svcInfo.NodePort()),
  1319  					},
  1320  					Value: []string{
  1321  						externalTrafficFilterVerdict,
  1322  					},
  1323  					Comment: &svcPortNameString,
  1324  				})
  1325  			}
  1326  		}
  1327  
  1328  		// Set up internal traffic handling.
  1329  		if hasInternalEndpoints {
  1330  			if proxier.masqueradeAll {
  1331  				tx.Add(&knftables.Rule{
  1332  					Chain: internalTrafficChain,
  1333  					Rule: knftables.Concat(
  1334  						ipX, "daddr", svcInfo.ClusterIP(),
  1335  						protocol, "dport", svcInfo.Port(),
  1336  						"jump", kubeMarkMasqChain,
  1337  					),
  1338  				})
  1339  			} else if proxier.localDetector.IsImplemented() {
  1340  				// This masquerades off-cluster traffic to a service VIP. The
  1341  				// idea is that you can establish a static route for your
  1342  				// Service range, routing to any node, and that node will
  1343  				// bridge into the Service for you. Since that might bounce
  1344  				// off-node, we masquerade here.
  1345  				tx.Add(&knftables.Rule{
  1346  					Chain: internalTrafficChain,
  1347  					Rule: knftables.Concat(
  1348  						ipX, "daddr", svcInfo.ClusterIP(),
  1349  						protocol, "dport", svcInfo.Port(),
  1350  						proxier.localDetector.IfNotLocalNFT(),
  1351  						"jump", kubeMarkMasqChain,
  1352  					),
  1353  				})
  1354  			}
  1355  		}
  1356  
  1357  		// Set up external traffic handling (if any "external" destinations are
  1358  		// enabled). All captured traffic for all external destinations should
  1359  		// jump to externalTrafficChain, which will handle some special cases and
  1360  		// then jump to externalPolicyChain.
  1361  		if usesExternalTrafficChain {
  1362  			if !svcInfo.ExternalPolicyLocal() {
  1363  				// If we are using non-local endpoints we need to masquerade,
  1364  				// in case we cross nodes.
  1365  				tx.Add(&knftables.Rule{
  1366  					Chain: externalTrafficChain,
  1367  					Rule: knftables.Concat(
  1368  						"jump", kubeMarkMasqChain,
  1369  					),
  1370  				})
  1371  			} else {
  1372  				// If we are only using same-node endpoints, we can retain the
  1373  				// source IP in most cases.
  1374  
  1375  				if proxier.localDetector.IsImplemented() {
  1376  					// Treat all locally-originated pod -> external destination
  1377  					// traffic as a special-case.  It is subject to neither
  1378  					// form of traffic policy, which simulates going up-and-out
  1379  					// to an external load-balancer and coming back in.
  1380  					tx.Add(&knftables.Rule{
  1381  						Chain: externalTrafficChain,
  1382  						Rule: knftables.Concat(
  1383  							proxier.localDetector.IfLocalNFT(),
  1384  							"goto", clusterPolicyChain,
  1385  						),
  1386  						Comment: ptr.To("short-circuit pod traffic"),
  1387  					})
  1388  				}
  1389  
  1390  				// Locally originated traffic (not a pod, but the host node)
  1391  				// still needs masquerade because the LBIP itself is a local
  1392  				// address, so that will be the chosen source IP.
  1393  				tx.Add(&knftables.Rule{
  1394  					Chain: externalTrafficChain,
  1395  					Rule: knftables.Concat(
  1396  						"fib", "saddr", "type", "local",
  1397  						"jump", kubeMarkMasqChain,
  1398  					),
  1399  					Comment: ptr.To("masquerade local traffic"),
  1400  				})
  1401  
  1402  				// Redirect all src-type=LOCAL -> external destination to the
  1403  				// policy=cluster chain. This allows traffic originating
  1404  				// from the host to be redirected to the service correctly.
  1405  				tx.Add(&knftables.Rule{
  1406  					Chain: externalTrafficChain,
  1407  					Rule: knftables.Concat(
  1408  						"fib", "saddr", "type", "local",
  1409  						"goto", clusterPolicyChain,
  1410  					),
  1411  					Comment: ptr.To("short-circuit local traffic"),
  1412  				})
  1413  			}
  1414  
  1415  			// Anything else falls thru to the appropriate policy chain.
  1416  			if hasExternalEndpoints {
  1417  				tx.Add(&knftables.Rule{
  1418  					Chain: externalTrafficChain,
  1419  					Rule: knftables.Concat(
  1420  						"goto", externalPolicyChain,
  1421  					),
  1422  				})
  1423  			}
  1424  		}
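        		// Sketch of the resulting external traffic chain for a service with
        		// externalTrafficPolicy: Local, assuming the local detector is
        		// implemented and external endpoints exist (hypothetical chain names;
        		// rules shown roughly as nft would render them):
        		//   <local-source match> goto <cluster policy chain>  comment "short-circuit pod traffic"
        		//   fib saddr type local jump mark-for-masquerade     comment "masquerade local traffic"
        		//   fib saddr type local goto <cluster policy chain>  comment "short-circuit local traffic"
        		//   goto <external policy chain>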
  1425  
  1426  		if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
  1427  			// Generate the per-endpoint affinity sets
  1428  			for _, ep := range allLocallyReachableEndpoints {
  1429  				epInfo, ok := ep.(*endpointInfo)
  1430  				if !ok {
  1431  					klog.ErrorS(nil, "Failed to cast endpointInfo", "endpointInfo", ep)
  1432  					continue
  1433  				}
  1434  
  1435  				// Create a set to store current affinity mappings. As
  1436  				// with the iptables backend, endpoint affinity is
  1437  				// recorded for connections from a particular source IP
  1438  				// (without regard to source port) to a particular
  1439  				// ServicePort (without regard to which service IP was
  1440  				// used to reach the service). This may be changed in the
  1441  				// future.
  1442  				tx.Add(&knftables.Set{
  1443  					Name: epInfo.affinitySetName,
  1444  					Type: ipvX_addr,
  1445  					Flags: []knftables.SetFlag{
  1446  						// The nft docs say "dynamic" is only
  1447  						// needed for sets containing stateful
  1448  						// objects (e.g. counters), but (at least on
  1449  						// RHEL8) if we create the set without
  1450  						// "dynamic", it later gets mutated to
  1451  						// have it, and then the next attempt to
  1452  						// tx.Add() it here fails because it looks
  1453  						// like we're trying to change the flags.
  1454  						knftables.DynamicFlag,
  1455  						knftables.TimeoutFlag,
  1456  					},
  1457  					Timeout: ptr.To(time.Duration(svcInfo.StickyMaxAgeSeconds()) * time.Second),
  1458  				})
  1459  				activeAffinitySets.Insert(epInfo.affinitySetName)
  1460  			}
  1461  		}
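        		// A generated affinity set renders roughly as follows (hypothetical
        		// set name; the default 10800s sticky timeout is assumed):
        		//   add set ip kube-proxy affinity-<hash> { type ipv4_addr ; flags dynamic,timeout ; timeout 10800s ; }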
  1462  
  1463  		// If Cluster policy is in use, create the chain and create rules jumping
  1464  		// from clusterPolicyChain to the clusterEndpoints
  1465  		if usesClusterPolicyChain {
  1466  			proxier.writeServiceToEndpointRules(tx, svcPortNameString, svcInfo, clusterPolicyChain, clusterEndpoints)
  1467  		}
  1468  
  1469  		// If Local policy is in use, create rules jumping from localPolicyChain
  1470  		// to the localEndpoints
  1471  		if usesLocalPolicyChain {
  1472  			proxier.writeServiceToEndpointRules(tx, svcPortNameString, svcInfo, localPolicyChain, localEndpoints)
  1473  		}
  1474  
  1475  		// Generate the per-endpoint chains
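        		// Each chain ends up with rules roughly like the following
        		// (hypothetical endpoint 10.180.0.1:8080, TCP, ClientIP affinity
        		// enabled; not part of the original source):
        		//   ip saddr 10.180.0.1 jump mark-for-masquerade
        		//   update @<affinity set> { ip saddr }
        		//   meta l4proto tcp dnat to 10.180.0.1:8080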
  1476  		for _, ep := range allLocallyReachableEndpoints {
  1477  			epInfo, ok := ep.(*endpointInfo)
  1478  			if !ok {
  1479  				klog.ErrorS(nil, "Failed to cast endpointInfo", "endpointInfo", ep)
  1480  				continue
  1481  			}
  1482  
  1483  			endpointChain := epInfo.chainName
  1484  
  1485  			// Handle traffic that loops back to the originator with SNAT.
  1486  			tx.Add(&knftables.Rule{
  1487  				Chain: endpointChain,
  1488  				Rule: knftables.Concat(
  1489  					ipX, "saddr", epInfo.IP(),
  1490  					"jump", kubeMarkMasqChain,
  1491  				),
  1492  			})
  1493  
  1494  			// Handle session affinity
  1495  			if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
  1496  				tx.Add(&knftables.Rule{
  1497  					Chain: endpointChain,
  1498  					Rule: knftables.Concat(
  1499  						"update", "@", epInfo.affinitySetName,
  1500  						"{", ipX, "saddr", "}",
  1501  					),
  1502  				})
  1503  			}
  1504  
  1505  			// DNAT to final destination.
  1506  			tx.Add(&knftables.Rule{
  1507  				Chain: endpointChain,
  1508  				Rule: knftables.Concat(
  1509  					"meta l4proto", protocol,
  1510  					"dnat to", epInfo.String(),
  1511  				),
  1512  			})
  1513  		}
  1514  	}
  1515  
  1516  	// Figure out which chains are now stale. Unfortunately, we can't delete them
  1517  	// right away, because with kernels before 6.2, if there is a map element pointing
  1518  	// to a chain, and you delete that map element, the kernel doesn't notice until a
  1519  	// short amount of time later that the chain is now unreferenced. So we flush them
  1520  	// now, and record the time that they become stale in staleChains so they can be
  1521  	// deleted later.
  1522  	existingChains, err := proxier.nftables.List(context.TODO(), "chains")
  1523  	if err == nil {
  1524  		for _, chain := range existingChains {
  1525  			if isServiceChainName(chain) && !activeChains.Has(chain) {
  1526  				tx.Flush(&knftables.Chain{
  1527  					Name: chain,
  1528  				})
  1529  				proxier.staleChains[chain] = start
  1530  			}
  1531  		}
  1532  	} else if !knftables.IsNotFound(err) {
  1533  		klog.ErrorS(err, "Failed to list nftables chains: stale chains will not be deleted")
  1534  	}
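        	// E.g. a chain left over from a deleted service (a hypothetical
        	// "service-<hash>-ns/svc/tcp/p80") is only flushed here; a later sync
        	// deletes it once enough time has passed for the kernel to see it as
        	// unreferenced.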
  1535  
  1536  	// On the other hand, we can immediately delete any stale affinity sets.
  1537  	existingSets, err := proxier.nftables.List(context.TODO(), "sets")
  1538  	if err == nil {
  1539  		for _, set := range existingSets {
  1540  			if isAffinitySetName(set) && !activeAffinitySets.Has(set) {
  1541  				tx.Delete(&knftables.Set{
  1542  					Name: set,
  1543  				})
  1544  			}
  1545  		}
  1546  	} else if !knftables.IsNotFound(err) {
  1547  		klog.ErrorS(err, "Failed to list nftables sets: stale affinity sets will not be deleted")
  1548  	}
  1549  
  1550  	// Sync rules.
  1551  	klog.V(2).InfoS("Reloading service nftables data",
  1552  		"numServices", len(proxier.svcPortMap),
  1553  		"numEndpoints", totalEndpoints,
  1554  	)
  1555  
  1556  	// FIXME
  1557  	// klog.V(9).InfoS("Running nftables transaction", "transaction", tx.Bytes())
  1558  
  1559  	err = proxier.nftables.Run(context.TODO(), tx)
  1560  	if err != nil {
  1561  		klog.ErrorS(err, "nftables sync failed")
  1562  		metrics.IptablesRestoreFailuresTotal.Inc()
  1563  		return
  1564  	}
  1565  	success = true
  1566  
  1567  	for name, lastChangeTriggerTimes := range endpointUpdateResult.LastChangeTriggerTimes {
  1568  		for _, lastChangeTriggerTime := range lastChangeTriggerTimes {
  1569  			latency := metrics.SinceInSeconds(lastChangeTriggerTime)
  1570  			metrics.NetworkProgrammingLatency.Observe(latency)
  1571  			klog.V(4).InfoS("Network programming", "endpoint", klog.KRef(name.Namespace, name.Name), "elapsed", latency)
  1572  		}
  1573  	}
  1574  
  1575  	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("internal").Set(float64(serviceNoLocalEndpointsTotalInternal))
  1576  	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("external").Set(float64(serviceNoLocalEndpointsTotalExternal))
  1577  	if proxier.healthzServer != nil {
  1578  		proxier.healthzServer.Updated(proxier.ipFamily)
  1579  	}
  1580  	metrics.SyncProxyRulesLastTimestamp.SetToCurrentTime()
  1581  
  1582  	// Update service healthchecks.  The endpoints list might include services that are
  1583  	// not "OnlyLocal", but the services list will not, and the serviceHealthServer
  1584  	// will just drop those endpoints.
  1585  	if err := proxier.serviceHealthServer.SyncServices(proxier.svcPortMap.HealthCheckNodePorts()); err != nil {
  1586  		klog.ErrorS(err, "Error syncing healthcheck services")
  1587  	}
  1588  	if err := proxier.serviceHealthServer.SyncEndpoints(proxier.endpointsMap.LocalReadyEndpoints()); err != nil {
  1589  		klog.ErrorS(err, "Error syncing healthcheck endpoints")
  1590  	}
  1591  
  1592  	// Finish housekeeping: clear stale conntrack entries for UDP Services.
  1593  	conntrack.CleanStaleEntries(proxier.ipFamily == v1.IPv6Protocol, proxier.exec, proxier.svcPortMap, serviceUpdateResult, endpointUpdateResult)
  1594  }
  1595  
  1596  func (proxier *Proxier) writeServiceToEndpointRules(tx *knftables.Transaction, svcPortNameString string, svcInfo *servicePortInfo, svcChain string, endpoints []proxy.Endpoint) {
  1597  	// First write session affinity rules, if applicable.
  1598  	if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
  1599  		ipX := "ip"
  1600  		if proxier.ipFamily == v1.IPv6Protocol {
  1601  			ipX = "ip6"
  1602  		}
  1603  
  1604  		for _, ep := range endpoints {
  1605  			epInfo, ok := ep.(*endpointInfo)
  1606  			if !ok {
  1607  				continue
  1608  			}
  1609  
  1610  			tx.Add(&knftables.Rule{
  1611  				Chain: svcChain,
  1612  				Rule: knftables.Concat(
  1613  					ipX, "saddr", "@", epInfo.affinitySetName,
  1614  					"goto", epInfo.chainName,
  1615  				),
  1616  			})
  1617  		}
  1618  	}
  1619  
  1620  	// Now write the load-balancing rule.
  1621  	var elements []string
  1622  	for i, ep := range endpoints {
  1623  		epInfo, ok := ep.(*endpointInfo)
  1624  		if !ok {
  1625  			continue
  1626  		}
  1627  
  1628  		elements = append(elements,
  1629  			strconv.Itoa(i), ":", "goto", epInfo.chainName,
  1630  		)
  1631  		if i != len(endpoints)-1 {
  1632  			elements = append(elements, ",")
  1633  		}
  1634  	}
  1635  	tx.Add(&knftables.Rule{
  1636  		Chain: svcChain,
  1637  		Rule: knftables.Concat(
  1638  			"numgen random mod", len(endpoints), "vmap",
  1639  			"{", elements, "}",
  1640  		),
  1641  	})
  1642  }
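        // For a chain with two endpoints and ClientIP affinity, the rules written
        // above render roughly as (hypothetical set/chain names; not part of the
        // original source):
        //   ip saddr @affinity-<ep1> goto endpoint-<ep1>
        //   ip saddr @affinity-<ep2> goto endpoint-<ep2>
        //   numgen random mod 2 vmap { 0 : goto endpoint-<ep1> , 1 : goto endpoint-<ep2> }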