k8s.io/kubernetes@v1.29.3/pkg/proxy/iptables/proxier.go

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package iptables
    18  
    19  //
    20  // NOTE: this needs to be tested in e2e since it uses iptables for everything.
    21  //
    22  
    23  import (
    24  	"bytes"
    25  	"crypto/sha256"
    26  	"encoding/base32"
    27  	"fmt"
    28  	"net"
    29  	"reflect"
    30  	"strconv"
    31  	"strings"
    32  	"sync"
    33  	"sync/atomic"
    34  	"time"
    35  
    36  	v1 "k8s.io/api/core/v1"
    37  	discovery "k8s.io/api/discovery/v1"
    38  	"k8s.io/apimachinery/pkg/types"
    39  	"k8s.io/apimachinery/pkg/util/wait"
    40  	"k8s.io/client-go/tools/events"
    41  	utilsysctl "k8s.io/component-helpers/node/util/sysctl"
    42  	"k8s.io/klog/v2"
    43  	"k8s.io/kubernetes/pkg/proxy"
    44  	"k8s.io/kubernetes/pkg/proxy/conntrack"
    45  	"k8s.io/kubernetes/pkg/proxy/healthcheck"
    46  	"k8s.io/kubernetes/pkg/proxy/metaproxier"
    47  	"k8s.io/kubernetes/pkg/proxy/metrics"
    48  	proxyutil "k8s.io/kubernetes/pkg/proxy/util"
    49  	proxyutiliptables "k8s.io/kubernetes/pkg/proxy/util/iptables"
    50  	"k8s.io/kubernetes/pkg/util/async"
    51  	utiliptables "k8s.io/kubernetes/pkg/util/iptables"
    52  	utilexec "k8s.io/utils/exec"
    53  	netutils "k8s.io/utils/net"
    54  )
    55  
    56  const (
    57  	// the services chain
    58  	kubeServicesChain utiliptables.Chain = "KUBE-SERVICES"
    59  
    60  	// the external services chain
    61  	kubeExternalServicesChain utiliptables.Chain = "KUBE-EXTERNAL-SERVICES"
    62  
    63  	// the nodeports chain
    64  	kubeNodePortsChain utiliptables.Chain = "KUBE-NODEPORTS"
    65  
    66  	// the kubernetes postrouting chain
    67  	kubePostroutingChain utiliptables.Chain = "KUBE-POSTROUTING"
    68  
    69  	// kubeMarkMasqChain is the mark-for-masquerade chain
    70  	kubeMarkMasqChain utiliptables.Chain = "KUBE-MARK-MASQ"
    71  
    72  	// the kubernetes forward chain
    73  	kubeForwardChain utiliptables.Chain = "KUBE-FORWARD"
    74  
    75  	// kubeProxyFirewallChain is the kube-proxy firewall chain
    76  	kubeProxyFirewallChain utiliptables.Chain = "KUBE-PROXY-FIREWALL"
    77  
    78  	// kubeProxyCanaryChain is used for monitoring rule reloads
    79  	kubeProxyCanaryChain utiliptables.Chain = "KUBE-PROXY-CANARY"
    80  
    81  	// kubeletFirewallChain is a duplicate of kubelet's firewall containing
    82  	// the anti-martian-packet rule. It should not be used for any other
    83  	// rules.
    84  	kubeletFirewallChain utiliptables.Chain = "KUBE-FIREWALL"
    85  
    86  	// largeClusterEndpointsThreshold is the number of endpoints at which
    87  	// we switch into "large cluster mode" and optimize for iptables
    88  	// performance over iptables debuggability
    89  	largeClusterEndpointsThreshold = 1000
    90  )
    91  
    92  const sysctlRouteLocalnet = "net/ipv4/conf/all/route_localnet"
    93  const sysctlNFConntrackTCPBeLiberal = "net/netfilter/nf_conntrack_tcp_be_liberal"
    94  
    95  // internal struct for storing service information
    96  type servicePortInfo struct {
    97  	*proxy.BaseServicePortInfo
    98  	// The following fields are computed and stored for performance reasons.
    99  	nameString             string
   100  	clusterPolicyChainName utiliptables.Chain
   101  	localPolicyChainName   utiliptables.Chain
   102  	firewallChainName      utiliptables.Chain
   103  	externalChainName      utiliptables.Chain
   104  }
   105  
   106  // returns a new proxy.ServicePort which abstracts a serviceInfo
   107  func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *proxy.BaseServicePortInfo) proxy.ServicePort {
   108  	svcPort := &servicePortInfo{BaseServicePortInfo: bsvcPortInfo}
   109  
   110  	// Store the following for performance reasons.
   111  	svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
   112  	svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name}
   113  	protocol := strings.ToLower(string(svcPort.Protocol()))
   114  	svcPort.nameString = svcPortName.String()
   115  	svcPort.clusterPolicyChainName = servicePortPolicyClusterChain(svcPort.nameString, protocol)
   116  	svcPort.localPolicyChainName = servicePortPolicyLocalChainName(svcPort.nameString, protocol)
   117  	svcPort.firewallChainName = serviceFirewallChainName(svcPort.nameString, protocol)
   118  	svcPort.externalChainName = serviceExternalChainName(svcPort.nameString, protocol)
   119  
   120  	return svcPort
   121  }
   122  
   123  // internal struct for endpoints information
   124  type endpointInfo struct {
   125  	*proxy.BaseEndpointInfo
   126  
   127  	ChainName utiliptables.Chain
   128  }
   129  
   130  // returns a new proxy.Endpoint which abstracts an endpointInfo
   131  func newEndpointInfo(baseInfo *proxy.BaseEndpointInfo, svcPortName *proxy.ServicePortName) proxy.Endpoint {
   132  	return &endpointInfo{
   133  		BaseEndpointInfo: baseInfo,
   134  		ChainName:        servicePortEndpointChainName(svcPortName.String(), strings.ToLower(string(svcPortName.Protocol)), baseInfo.String()),
   135  	}
   136  }
   137  
   138  // Proxier is an iptables based proxy for connections between a localhost:lport
   139  // and services that provide the actual backends.
   140  type Proxier struct {
   141  	// ipFamily defines the IP family which this proxier is tracking.
   142  	ipFamily v1.IPFamily
   143  
   144  	// endpointsChanges and serviceChanges contain all changes to endpoints and
   145  	// services that happened since iptables was last synced. For a single object,
   146  	// changes are accumulated, i.e. previous is the state before all of them,
   147  	// and current is the state after applying all of them.
   148  	endpointsChanges *proxy.EndpointsChangeTracker
   149  	serviceChanges   *proxy.ServiceChangeTracker
   150  
   151  	mu           sync.Mutex // protects the following fields
   152  	svcPortMap   proxy.ServicePortMap
   153  	endpointsMap proxy.EndpointsMap
   154  	nodeLabels   map[string]string
   155  	// endpointSlicesSynced and servicesSynced are set to true
   156  	// when the corresponding objects are synced after startup. This is used to avoid
   157  	// updating iptables with partial data after a kube-proxy restart.
   158  	endpointSlicesSynced bool
   159  	servicesSynced       bool
   160  	needFullSync         bool
   161  	initialized          int32
   162  	syncRunner           *async.BoundedFrequencyRunner // governs calls to syncProxyRules
   163  	syncPeriod           time.Duration
   164  	lastIPTablesCleanup  time.Time
   165  
   166  	// These are effectively const and do not need the mutex to be held.
   167  	iptables       utiliptables.Interface
   168  	masqueradeAll  bool
   169  	masqueradeMark string
   170  	exec           utilexec.Interface
   171  	localDetector  proxyutiliptables.LocalTrafficDetector
   172  	hostname       string
   173  	nodeIP         net.IP
   174  	recorder       events.EventRecorder
   175  
   176  	serviceHealthServer healthcheck.ServiceHealthServer
   177  	healthzServer       *healthcheck.ProxierHealthServer
   178  
   179  	// Since converting probabilities (floats) to strings is expensive
   180  	// and we only use probabilities of the form 1/n, we precompute some
   181  	// number of those strings and cache them for future reuse.
   182  	precomputedProbabilities []string
   183  
   184  	// The following buffers are used to reuse memory and avoid allocations
   185  	// that would otherwise significantly impact performance.
   186  	iptablesData             *bytes.Buffer
   187  	existingFilterChainsData *bytes.Buffer
   188  	filterChains             proxyutil.LineBuffer
   189  	filterRules              proxyutil.LineBuffer
   190  	natChains                proxyutil.LineBuffer
   191  	natRules                 proxyutil.LineBuffer
   192  
   193  	// largeClusterMode is set at the beginning of syncProxyRules if we are
   194  	// going to end up outputting "lots" of iptables rules and so we need to
   195  	// optimize for performance over debuggability.
   196  	largeClusterMode bool
   197  
   198  	// localhostNodePorts indicates whether we allow NodePort services to be accessed
   199  	// via localhost.
   200  	localhostNodePorts bool
   201  
   202  	// conntrackTCPLiberal indicates whether the kernel nf_conntrack_tcp_be_liberal sysctl is set
   203  	conntrackTCPLiberal bool
   204  
   205  	// nodePortAddresses selects the interfaces where nodePort works.
   206  	nodePortAddresses *proxyutil.NodePortAddresses
   207  	// networkInterfacer defines an interface for several net library functions.
   208  	// Inject for test purpose.
   209  	networkInterfacer proxyutil.NetworkInterfacer
   210  }
   211  
   212  // Proxier implements proxy.Provider
   213  var _ proxy.Provider = &Proxier{}
   214  
   215  // NewProxier returns a new Proxier given an iptables Interface instance.
   216  // Because of the iptables logic, it is assumed that there is only a single Proxier active on a machine.
   217  // An error will be returned if iptables fails to update or acquire the initial lock.
   218  // Once a proxier is created, it will keep iptables up to date in the background and
   219  // will not terminate if a particular iptables call fails.
   220  func NewProxier(ipFamily v1.IPFamily,
   221  	ipt utiliptables.Interface,
   222  	sysctl utilsysctl.Interface,
   223  	exec utilexec.Interface,
   224  	syncPeriod time.Duration,
   225  	minSyncPeriod time.Duration,
   226  	masqueradeAll bool,
   227  	localhostNodePorts bool,
   228  	masqueradeBit int,
   229  	localDetector proxyutiliptables.LocalTrafficDetector,
   230  	hostname string,
   231  	nodeIP net.IP,
   232  	recorder events.EventRecorder,
   233  	healthzServer *healthcheck.ProxierHealthServer,
   234  	nodePortAddressStrings []string,
   235  	initOnly bool,
   236  ) (*Proxier, error) {
   237  	nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings)
   238  
   239  	if !nodePortAddresses.ContainsIPv4Loopback() {
   240  		localhostNodePorts = false
   241  	}
   242  	if localhostNodePorts {
   243  		// Set the route_localnet sysctl we need for exposing NodePorts on loopback addresses
   244  		// Refer to https://issues.k8s.io/90259
   245  		klog.InfoS("Setting route_localnet=1 to allow node-ports on localhost; to change this either disable iptables.localhostNodePorts (--iptables-localhost-nodeports) or set nodePortAddresses (--nodeport-addresses) to filter loopback addresses")
   246  		if err := proxyutil.EnsureSysctl(sysctl, sysctlRouteLocalnet, 1); err != nil {
   247  			return nil, err
   248  		}
   249  	}
   250  
   251  	// Be conservative in what you do, be liberal in what you accept from others.
   252  	// If it's non-zero, conntrack marks only out-of-window RST segments as INVALID.
   253  	// Ref: https://docs.kernel.org/networking/nf_conntrack-sysctl.html
   254  	conntrackTCPLiberal := false
   255  	if val, err := sysctl.GetSysctl(sysctlNFConntrackTCPBeLiberal); err == nil && val != 0 {
   256  		conntrackTCPLiberal = true
   257  		klog.InfoS("nf_conntrack_tcp_be_liberal set, not installing DROP rules for INVALID packets")
   258  	}
   259  
   260  	if initOnly {
   261  		klog.InfoS("System initialized and --init-only specified")
   262  		return nil, nil
   263  	}
   264  
   265  	// Generate the masquerade mark to use for SNAT rules.
   266  	masqueradeValue := 1 << uint(masqueradeBit)
   267  	masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue)
   268  	klog.V(2).InfoS("Using iptables mark for masquerade", "ipFamily", ipt.Protocol(), "mark", masqueradeMark)
   269  
   270  	serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer)
   271  
   272  	proxier := &Proxier{
   273  		ipFamily:                 ipFamily,
   274  		svcPortMap:               make(proxy.ServicePortMap),
   275  		serviceChanges:           proxy.NewServiceChangeTracker(newServiceInfo, ipFamily, recorder, nil),
   276  		endpointsMap:             make(proxy.EndpointsMap),
   277  		endpointsChanges:         proxy.NewEndpointsChangeTracker(hostname, newEndpointInfo, ipFamily, recorder, nil),
   278  		needFullSync:             true,
   279  		syncPeriod:               syncPeriod,
   280  		iptables:                 ipt,
   281  		masqueradeAll:            masqueradeAll,
   282  		masqueradeMark:           masqueradeMark,
   283  		exec:                     exec,
   284  		localDetector:            localDetector,
   285  		hostname:                 hostname,
   286  		nodeIP:                   nodeIP,
   287  		recorder:                 recorder,
   288  		serviceHealthServer:      serviceHealthServer,
   289  		healthzServer:            healthzServer,
   290  		precomputedProbabilities: make([]string, 0, 1001),
   291  		iptablesData:             bytes.NewBuffer(nil),
   292  		existingFilterChainsData: bytes.NewBuffer(nil),
   293  		filterChains:             proxyutil.NewLineBuffer(),
   294  		filterRules:              proxyutil.NewLineBuffer(),
   295  		natChains:                proxyutil.NewLineBuffer(),
   296  		natRules:                 proxyutil.NewLineBuffer(),
   297  		localhostNodePorts:       localhostNodePorts,
   298  		nodePortAddresses:        nodePortAddresses,
   299  		networkInterfacer:        proxyutil.RealNetwork{},
   300  		conntrackTCPLiberal:      conntrackTCPLiberal,
   301  	}
   302  
   303  	burstSyncs := 2
   304  	klog.V(2).InfoS("Iptables sync params", "ipFamily", ipt.Protocol(), "minSyncPeriod", minSyncPeriod, "syncPeriod", syncPeriod, "burstSyncs", burstSyncs)
   305  	// We pass syncPeriod to ipt.Monitor, which will call us only if it needs to.
   306  	// We need to pass *some* maxInterval to NewBoundedFrequencyRunner anyway though.
   307  	// time.Hour is arbitrary.
   308  	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, time.Hour, burstSyncs)
   309  
   310  	go ipt.Monitor(kubeProxyCanaryChain, []utiliptables.Table{utiliptables.TableMangle, utiliptables.TableNAT, utiliptables.TableFilter},
   311  		proxier.forceSyncProxyRules, syncPeriod, wait.NeverStop)
   312  
   313  	if ipt.HasRandomFully() {
   314  		klog.V(2).InfoS("Iptables supports --random-fully", "ipFamily", ipt.Protocol())
   315  	} else {
   316  		klog.V(2).InfoS("Iptables does not support --random-fully", "ipFamily", ipt.Protocol())
   317  	}
   318  
   319  	return proxier, nil
   320  }
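
// exampleMasqueradeMark is an illustrative sketch, not part of the upstream
// file: it shows how NewProxier derives the masquerade mark string from the
// masquerade bit. The value 14 below is kube-proxy's default
// --iptables-masquerade-bit.
func exampleMasqueradeMark() {
	masqueradeBit := 14
	masqueradeValue := 1 << uint(masqueradeBit)
	// "%#08x" zero-pads to width 8 including the "0x" prefix, so bit 14
	// yields "0x004000"; the rules built in syncProxyRules then match it
	// as mark/mask "0x004000/0x004000".
	fmt.Printf("%#08x\n", masqueradeValue)
}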
   321  
   322  // NewDualStackProxier creates a MetaProxier instance, with IPv4 and IPv6 proxies.
   323  func NewDualStackProxier(
   324  	ipt [2]utiliptables.Interface,
   325  	sysctl utilsysctl.Interface,
   326  	exec utilexec.Interface,
   327  	syncPeriod time.Duration,
   328  	minSyncPeriod time.Duration,
   329  	masqueradeAll bool,
   330  	localhostNodePorts bool,
   331  	masqueradeBit int,
   332  	localDetectors [2]proxyutiliptables.LocalTrafficDetector,
   333  	hostname string,
   334  	nodeIPs map[v1.IPFamily]net.IP,
   335  	recorder events.EventRecorder,
   336  	healthzServer *healthcheck.ProxierHealthServer,
   337  	nodePortAddresses []string,
   338  	initOnly bool,
   339  ) (proxy.Provider, error) {
   340  	// Create an ipv4 instance of the single-stack proxier
   341  	ipv4Proxier, err := NewProxier(v1.IPv4Protocol, ipt[0], sysctl,
   342  		exec, syncPeriod, minSyncPeriod, masqueradeAll, localhostNodePorts, masqueradeBit, localDetectors[0], hostname,
   343  		nodeIPs[v1.IPv4Protocol], recorder, healthzServer, nodePortAddresses, initOnly)
   344  	if err != nil {
   345  		return nil, fmt.Errorf("unable to create ipv4 proxier: %v", err)
   346  	}
   347  
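	// Create an ipv6 instance of the single-stack proxier. Note that
	// localhostNodePorts is forced to false here: the route_localnet sysctl
	// that localhost NodePorts depend on is IPv4-only.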
   348  	ipv6Proxier, err := NewProxier(v1.IPv6Protocol, ipt[1], sysctl,
   349  		exec, syncPeriod, minSyncPeriod, masqueradeAll, false, masqueradeBit, localDetectors[1], hostname,
   350  		nodeIPs[v1.IPv6Protocol], recorder, healthzServer, nodePortAddresses, initOnly)
   351  	if err != nil {
   352  		return nil, fmt.Errorf("unable to create ipv6 proxier: %v", err)
   353  	}
   354  	if initOnly {
   355  		return nil, nil
   356  	}
   357  	return metaproxier.NewMetaProxier(ipv4Proxier, ipv6Proxier), nil
   358  }
   359  
   360  type iptablesJumpChain struct {
   361  	table     utiliptables.Table
   362  	dstChain  utiliptables.Chain
   363  	srcChain  utiliptables.Chain
   364  	comment   string
   365  	extraArgs []string
   366  }
   367  
   368  var iptablesJumpChains = []iptablesJumpChain{
   369  	{utiliptables.TableFilter, kubeExternalServicesChain, utiliptables.ChainInput, "kubernetes externally-visible service portals", []string{"-m", "conntrack", "--ctstate", "NEW"}},
   370  	{utiliptables.TableFilter, kubeExternalServicesChain, utiliptables.ChainForward, "kubernetes externally-visible service portals", []string{"-m", "conntrack", "--ctstate", "NEW"}},
   371  	{utiliptables.TableFilter, kubeNodePortsChain, utiliptables.ChainInput, "kubernetes health check service ports", nil},
   372  	{utiliptables.TableFilter, kubeServicesChain, utiliptables.ChainForward, "kubernetes service portals", []string{"-m", "conntrack", "--ctstate", "NEW"}},
   373  	{utiliptables.TableFilter, kubeServicesChain, utiliptables.ChainOutput, "kubernetes service portals", []string{"-m", "conntrack", "--ctstate", "NEW"}},
   374  	{utiliptables.TableFilter, kubeForwardChain, utiliptables.ChainForward, "kubernetes forwarding rules", nil},
   375  	{utiliptables.TableFilter, kubeProxyFirewallChain, utiliptables.ChainInput, "kubernetes load balancer firewall", []string{"-m", "conntrack", "--ctstate", "NEW"}},
   376  	{utiliptables.TableFilter, kubeProxyFirewallChain, utiliptables.ChainOutput, "kubernetes load balancer firewall", []string{"-m", "conntrack", "--ctstate", "NEW"}},
   377  	{utiliptables.TableFilter, kubeProxyFirewallChain, utiliptables.ChainForward, "kubernetes load balancer firewall", []string{"-m", "conntrack", "--ctstate", "NEW"}},
   378  	{utiliptables.TableNAT, kubeServicesChain, utiliptables.ChainOutput, "kubernetes service portals", nil},
   379  	{utiliptables.TableNAT, kubeServicesChain, utiliptables.ChainPrerouting, "kubernetes service portals", nil},
   380  	{utiliptables.TableNAT, kubePostroutingChain, utiliptables.ChainPostrouting, "kubernetes postrouting rules", nil},
   381  }
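
// exampleJumpRuleArgs is an illustrative sketch, not part of the upstream
// file: it mirrors how syncProxyRules turns one iptablesJumpChain entry into
// iptables arguments. The NAT PREROUTING entry above, for example, becomes
// roughly `-m comment --comment "kubernetes service portals" -j KUBE-SERVICES`,
// prepended to PREROUTING.
func exampleJumpRuleArgs(jump iptablesJumpChain) []string {
	args := jump.extraArgs
	if jump.comment != "" {
		args = append(args, "-m", "comment", "--comment", jump.comment)
	}
	return append(args, "-j", string(jump.dstChain))
}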
   382  
   383  // Duplicates of chains created in pkg/kubelet/kubelet_network_linux.go; we create these
   384  // on startup but do not delete them in CleanupLeftovers.
   385  var iptablesKubeletJumpChains = []iptablesJumpChain{
   386  	{utiliptables.TableFilter, kubeletFirewallChain, utiliptables.ChainInput, "", nil},
   387  	{utiliptables.TableFilter, kubeletFirewallChain, utiliptables.ChainOutput, "", nil},
   388  }
   389  
   390  // When chains get removed from iptablesJumpChains, add them here so they get cleaned up
   391  // on upgrade.
   392  var iptablesCleanupOnlyChains = []iptablesJumpChain{}
   393  
   394  // CleanupLeftovers removes all iptables rules and chains created by the Proxier.
   395  // It returns true if an error was encountered. Errors are logged.
   396  func CleanupLeftovers(ipt utiliptables.Interface) (encounteredError bool) {
   397  	// Unlink our chains
   398  	for _, jump := range append(iptablesJumpChains, iptablesCleanupOnlyChains...) {
   399  		args := append(jump.extraArgs,
   400  			"-m", "comment", "--comment", jump.comment,
   401  			"-j", string(jump.dstChain),
   402  		)
   403  		if err := ipt.DeleteRule(jump.table, jump.srcChain, args...); err != nil {
   404  			if !utiliptables.IsNotFoundError(err) {
   405  				klog.ErrorS(err, "Error removing pure-iptables proxy rule")
   406  				encounteredError = true
   407  			}
   408  		}
   409  	}
   410  
   411  	// Flush and remove all of our "-t nat" chains.
   412  	iptablesData := bytes.NewBuffer(nil)
   413  	if err := ipt.SaveInto(utiliptables.TableNAT, iptablesData); err != nil {
   414  		klog.ErrorS(err, "Failed to execute iptables-save", "table", utiliptables.TableNAT)
   415  		encounteredError = true
   416  	} else {
   417  		existingNATChains := utiliptables.GetChainsFromTable(iptablesData.Bytes())
   418  		natChains := proxyutil.NewLineBuffer()
   419  		natRules := proxyutil.NewLineBuffer()
   420  		natChains.Write("*nat")
   421  		// Start with chains we know we need to remove.
   422  		for _, chain := range []utiliptables.Chain{kubeServicesChain, kubeNodePortsChain, kubePostroutingChain} {
   423  			if _, found := existingNATChains[chain]; found {
   424  				chainString := string(chain)
   425  				natChains.Write(utiliptables.MakeChainLine(chain)) // flush
   426  				natRules.Write("-X", chainString)                  // delete
   427  			}
   428  		}
   429  		// Hunt for service and endpoint chains.
   430  		for chain := range existingNATChains {
   431  			chainString := string(chain)
   432  			if isServiceChainName(chainString) {
   433  				natChains.Write(utiliptables.MakeChainLine(chain)) // flush
   434  				natRules.Write("-X", chainString)                  // delete
   435  			}
   436  		}
   437  		natRules.Write("COMMIT")
   438  		natLines := append(natChains.Bytes(), natRules.Bytes()...)
   439  		// Write it.
   440  		err = ipt.Restore(utiliptables.TableNAT, natLines, utiliptables.NoFlushTables, utiliptables.RestoreCounters)
   441  		if err != nil {
   442  			klog.ErrorS(err, "Failed to execute iptables-restore", "table", utiliptables.TableNAT)
   443  			metrics.IptablesRestoreFailuresTotal.Inc()
   444  			encounteredError = true
   445  		}
   446  	}
   447  
   448  	// Flush and remove all of our "-t filter" chains.
   449  	iptablesData.Reset()
   450  	if err := ipt.SaveInto(utiliptables.TableFilter, iptablesData); err != nil {
   451  		klog.ErrorS(err, "Failed to execute iptables-save", "table", utiliptables.TableFilter)
   452  		encounteredError = true
   453  	} else {
   454  		existingFilterChains := utiliptables.GetChainsFromTable(iptablesData.Bytes())
   455  		filterChains := proxyutil.NewLineBuffer()
   456  		filterRules := proxyutil.NewLineBuffer()
   457  		filterChains.Write("*filter")
   458  		for _, chain := range []utiliptables.Chain{kubeServicesChain, kubeExternalServicesChain, kubeForwardChain, kubeNodePortsChain} {
   459  			if _, found := existingFilterChains[chain]; found {
   460  				chainString := string(chain)
   461  				filterChains.Write(utiliptables.MakeChainLine(chain))
   462  				filterRules.Write("-X", chainString)
   463  			}
   464  		}
   465  		filterRules.Write("COMMIT")
   466  		filterLines := append(filterChains.Bytes(), filterRules.Bytes()...)
   467  		// Write it.
   468  		if err := ipt.Restore(utiliptables.TableFilter, filterLines, utiliptables.NoFlushTables, utiliptables.RestoreCounters); err != nil {
   469  			klog.ErrorS(err, "Failed to execute iptables-restore", "table", utiliptables.TableFilter)
   470  			metrics.IptablesRestoreFailuresTotal.Inc()
   471  			encounteredError = true
   472  		}
   473  	}
   474  	return encounteredError
   475  }
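
// For illustration (not part of the upstream file), the NAT cleanup above
// builds iptables-restore input of roughly the following shape, where each
// ":" line flushes a chain and each "-X" line deletes it; NoFlushTables
// ensures that chains not mentioned here are left untouched (the KUBE-SVC
// hash suffix is illustrative):
//
//	*nat
//	:KUBE-SERVICES - [0:0]
//	:KUBE-POSTROUTING - [0:0]
//	:KUBE-SVC-XGJFVMXWLVT4ZQDS - [0:0]
//	-X KUBE-SERVICES
//	-X KUBE-POSTROUTING
//	-X KUBE-SVC-XGJFVMXWLVT4ZQDS
//	COMMIT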
   476  
   477  func computeProbability(n int) string {
   478  	return fmt.Sprintf("%0.10f", 1.0/float64(n))
   479  }
   480  
   481  // This assumes proxier.mu is held
   482  func (proxier *Proxier) precomputeProbabilities(numberOfPrecomputed int) {
   483  	if len(proxier.precomputedProbabilities) == 0 {
   484  		proxier.precomputedProbabilities = append(proxier.precomputedProbabilities, "<bad value>")
   485  	}
   486  	for i := len(proxier.precomputedProbabilities); i <= numberOfPrecomputed; i++ {
   487  		proxier.precomputedProbabilities = append(proxier.precomputedProbabilities, computeProbability(i))
   488  	}
   489  }
   490  
   491  // This assumes proxier.mu is held
   492  func (proxier *Proxier) probability(n int) string {
   493  	if n >= len(proxier.precomputedProbabilities) {
   494  		proxier.precomputeProbabilities(n)
   495  	}
   496  	return proxier.precomputedProbabilities[n]
   497  }
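
// exampleProbabilities is an illustrative sketch, not part of the upstream
// file: when load-balancing across n endpoints, each successive endpoint
// rule can use "-m statistic --mode random --probability 1/k", where k is
// the number of endpoints remaining, so that every endpoint is selected with
// equal overall likelihood; the final rule needs no statistic match. Like
// probability(), this assumes proxier.mu is held.
func exampleProbabilities(proxier *Proxier, n int) {
	for remaining := n; remaining > 1; remaining-- {
		fmt.Printf("-m statistic --mode random --probability %s\n", proxier.probability(remaining))
	}
	fmt.Println("(the last endpoint rule matches unconditionally)")
}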
   498  
   499  // Sync is called to synchronize the proxier state to iptables as soon as possible.
   500  func (proxier *Proxier) Sync() {
   501  	if proxier.healthzServer != nil {
   502  		proxier.healthzServer.QueuedUpdate(proxier.ipFamily)
   503  	}
   504  	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
   505  	proxier.syncRunner.Run()
   506  }
   507  
   508  // SyncLoop runs periodic work.  This is expected to run as a goroutine or as the main loop of the app.  It does not return.
   509  func (proxier *Proxier) SyncLoop() {
   510  	// Update healthz timestamp at beginning in case Sync() never succeeds.
   511  	if proxier.healthzServer != nil {
   512  		proxier.healthzServer.Updated(proxier.ipFamily)
   513  	}
   514  
   515  	// synthesize "last change queued" time as the informers are syncing.
   516  	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
   517  	proxier.syncRunner.Loop(wait.NeverStop)
   518  }
   519  
   520  func (proxier *Proxier) setInitialized(value bool) {
   521  	var initialized int32
   522  	if value {
   523  		initialized = 1
   524  	}
   525  	atomic.StoreInt32(&proxier.initialized, initialized)
   526  }
   527  
   528  func (proxier *Proxier) isInitialized() bool {
   529  	return atomic.LoadInt32(&proxier.initialized) > 0
   530  }
   531  
   532  // OnServiceAdd is called whenever creation of a new service object
   533  // is observed.
   534  func (proxier *Proxier) OnServiceAdd(service *v1.Service) {
   535  	proxier.OnServiceUpdate(nil, service)
   536  }
   537  
   538  // OnServiceUpdate is called whenever modification of an existing
   539  // service object is observed.
   540  func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) {
   541  	if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() {
   542  		proxier.Sync()
   543  	}
   544  }
   545  
   546  // OnServiceDelete is called whenever deletion of an existing service
   547  // object is observed.
   548  func (proxier *Proxier) OnServiceDelete(service *v1.Service) {
   549  	proxier.OnServiceUpdate(service, nil)
   550  
   551  }
   552  
   553  // OnServiceSynced is called once all the initial event handlers were
   554  // called and the state is fully propagated to local cache.
   555  func (proxier *Proxier) OnServiceSynced() {
   556  	proxier.mu.Lock()
   557  	proxier.servicesSynced = true
   558  	proxier.setInitialized(proxier.endpointSlicesSynced)
   559  	proxier.mu.Unlock()
   560  
   561  	// Sync unconditionally - this is called once per lifetime.
   562  	proxier.syncProxyRules()
   563  }
   564  
   565  // OnEndpointSliceAdd is called whenever creation of a new endpoint slice object
   566  // is observed.
   567  func (proxier *Proxier) OnEndpointSliceAdd(endpointSlice *discovery.EndpointSlice) {
   568  	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
   569  		proxier.Sync()
   570  	}
   571  }
   572  
   573  // OnEndpointSliceUpdate is called whenever modification of an existing endpoint
   574  // slice object is observed.
   575  func (proxier *Proxier) OnEndpointSliceUpdate(_, endpointSlice *discovery.EndpointSlice) {
   576  	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
   577  		proxier.Sync()
   578  	}
   579  }
   580  
   581  // OnEndpointSliceDelete is called whenever deletion of an existing endpoint slice
   582  // object is observed.
   583  func (proxier *Proxier) OnEndpointSliceDelete(endpointSlice *discovery.EndpointSlice) {
   584  	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, true) && proxier.isInitialized() {
   585  		proxier.Sync()
   586  	}
   587  }
   588  
   589  // OnEndpointSlicesSynced is called once all the initial event handlers were
   590  // called and the state is fully propagated to local cache.
   591  func (proxier *Proxier) OnEndpointSlicesSynced() {
   592  	proxier.mu.Lock()
   593  	proxier.endpointSlicesSynced = true
   594  	proxier.setInitialized(proxier.servicesSynced)
   595  	proxier.mu.Unlock()
   596  
   597  	// Sync unconditionally - this is called once per lifetime.
   598  	proxier.syncProxyRules()
   599  }
   600  
   601  // OnNodeAdd is called whenever creation of a new node object
   602  // is observed.
   603  func (proxier *Proxier) OnNodeAdd(node *v1.Node) {
   604  	if node.Name != proxier.hostname {
   605  		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
   606  			"eventNode", node.Name, "currentNode", proxier.hostname)
   607  		return
   608  	}
   609  
   610  	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
   611  		return
   612  	}
   613  
   614  	proxier.mu.Lock()
   615  	proxier.nodeLabels = map[string]string{}
   616  	for k, v := range node.Labels {
   617  		proxier.nodeLabels[k] = v
   618  	}
   619  	proxier.needFullSync = true
   620  	proxier.mu.Unlock()
   621  	klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels)
   622  
   623  	proxier.Sync()
   624  }
   625  
   626  // OnNodeUpdate is called whenever modification of an existing
   627  // node object is observed.
   628  func (proxier *Proxier) OnNodeUpdate(oldNode, node *v1.Node) {
   629  	if node.Name != proxier.hostname {
   630  		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
   631  			"eventNode", node.Name, "currentNode", proxier.hostname)
   632  		return
   633  	}
   634  
   635  	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
   636  		return
   637  	}
   638  
   639  	proxier.mu.Lock()
   640  	proxier.nodeLabels = map[string]string{}
   641  	for k, v := range node.Labels {
   642  		proxier.nodeLabels[k] = v
   643  	}
   644  	proxier.needFullSync = true
   645  	proxier.mu.Unlock()
   646  	klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels)
   647  
   648  	proxier.Sync()
   649  }
   650  
   651  // OnNodeDelete is called whenever deletion of an existing node
   652  // object is observed.
   653  func (proxier *Proxier) OnNodeDelete(node *v1.Node) {
   654  	if node.Name != proxier.hostname {
   655  		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
   656  			"eventNode", node.Name, "currentNode", proxier.hostname)
   657  		return
   658  	}
   659  
   660  	proxier.mu.Lock()
   661  	proxier.nodeLabels = nil
   662  	proxier.needFullSync = true
   663  	proxier.mu.Unlock()
   664  
   665  	proxier.Sync()
   666  }
   667  
   668  // OnNodeSynced is called once all the initial event handlers were
   669  // called and the state is fully propagated to local cache.
   670  func (proxier *Proxier) OnNodeSynced() {
   671  }
   672  
   673  // portProtoHash takes the ServicePortName and protocol for a service and
   674  // returns the associated 16-character hash. This is computed by hashing (sha256)
   675  // then encoding to base32 and truncating to 16 chars. We do this because iptables
   676  // chain names must be <= 28 chars long, and the longer they are the harder they are to read.
   677  func portProtoHash(servicePortName string, protocol string) string {
   678  	hash := sha256.Sum256([]byte(servicePortName + protocol))
   679  	encoded := base32.StdEncoding.EncodeToString(hash[:])
   680  	return encoded[:16]
   681  }
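
// exampleChainSuffix is an illustrative sketch, not part of the upstream
// file: portProtoHash always yields a 16-character suffix, so a prefixed
// chain name such as "KUBE-SVC-" plus the suffix is 25 characters, safely
// under iptables' 28-character limit. The service-port name and protocol
// here are arbitrary examples.
func exampleChainSuffix() {
	suffix := portProtoHash("default/kubernetes:https", "tcp")
	fmt.Printf("%s (len=%d)\n", suffix, len(suffix))
}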
   682  
   683  const (
   684  	servicePortPolicyClusterChainNamePrefix = "KUBE-SVC-"
   685  	servicePortPolicyLocalChainNamePrefix   = "KUBE-SVL-"
   686  	serviceFirewallChainNamePrefix          = "KUBE-FW-"
   687  	serviceExternalChainNamePrefix          = "KUBE-EXT-"
   688  	servicePortEndpointChainNamePrefix      = "KUBE-SEP-"
   689  )
   690  
   691  // servicePortPolicyClusterChain returns the name of the KUBE-SVC-XXXX chain for a service, which is the
   692  // main iptables chain for that service, used for dispatching to endpoints when using `Cluster`
   693  // traffic policy.
   694  func servicePortPolicyClusterChain(servicePortName string, protocol string) utiliptables.Chain {
   695  	return utiliptables.Chain(servicePortPolicyClusterChainNamePrefix + portProtoHash(servicePortName, protocol))
   696  }
   697  
   698  // servicePortPolicyLocalChainName returns the name of the KUBE-SVL-XXXX chain for a service, which
   699  // handles dispatching to local endpoints when using `Local` traffic policy. This chain only
   700  // exists if the service has `Local` internal or external traffic policy.
   701  func servicePortPolicyLocalChainName(servicePortName string, protocol string) utiliptables.Chain {
   702  	return utiliptables.Chain(servicePortPolicyLocalChainNamePrefix + portProtoHash(servicePortName, protocol))
   703  }
   704  
   705  // serviceFirewallChainName returns the name of the KUBE-FW-XXXX chain for a service, which
   706  // is used to implement the filtering for the LoadBalancerSourceRanges feature.
   707  func serviceFirewallChainName(servicePortName string, protocol string) utiliptables.Chain {
   708  	return utiliptables.Chain(serviceFirewallChainNamePrefix + portProtoHash(servicePortName, protocol))
   709  }
   710  
   711  // serviceExternalChainName returns the name of the KUBE-EXT-XXXX chain for a service, which
   712  // implements "short-circuiting" for internally-originated external-destination traffic when using
   713  // `Local` external traffic policy.  It forwards traffic from local sources to the KUBE-SVC-XXXX
   714  // chain and traffic from external sources to the KUBE-SVL-XXXX chain.
   715  func serviceExternalChainName(servicePortName string, protocol string) utiliptables.Chain {
   716  	return utiliptables.Chain(serviceExternalChainNamePrefix + portProtoHash(servicePortName, protocol))
   717  }
   718  
   719  // servicePortEndpointChainName returns the name of the KUBE-SEP-XXXX chain for a particular
   720  // service endpoint.
   721  func servicePortEndpointChainName(servicePortName string, protocol string, endpoint string) utiliptables.Chain {
   722  	hash := sha256.Sum256([]byte(servicePortName + protocol + endpoint))
   723  	encoded := base32.StdEncoding.EncodeToString(hash[:])
   724  	return utiliptables.Chain(servicePortEndpointChainNamePrefix + encoded[:16])
   725  }
   726  
   727  func isServiceChainName(chainString string) bool {
   728  	prefixes := []string{
   729  		servicePortPolicyClusterChainNamePrefix,
   730  		servicePortPolicyLocalChainNamePrefix,
   731  		servicePortEndpointChainNamePrefix,
   732  		serviceFirewallChainNamePrefix,
   733  		serviceExternalChainNamePrefix,
   734  	}
   735  
   736  	for _, p := range prefixes {
   737  		if strings.HasPrefix(chainString, p) {
   738  			return true
   739  		}
   740  	}
   741  	return false
   742  }
   743  
   744  // Assumes proxier.mu is held.
   745  func (proxier *Proxier) appendServiceCommentLocked(args []string, svcName string) []string {
   746  	// Not printing these comments can reduce the size of the iptables output
   747  	// (in the case of a large number of endpoints) by 40% or more. So if the
   748  	// total number of endpoint chains is large enough, we simply drop the comments.
   749  	if proxier.largeClusterMode {
   750  		return args
   751  	}
   752  	return append(args, "-m", "comment", "--comment", svcName)
   753  }
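
// exampleServiceComment is an illustrative sketch, not part of the upstream
// file: in normal mode the comment match is appended to the rule arguments,
// while in large-cluster mode the arguments come back unchanged. Like
// appendServiceCommentLocked, this assumes proxier.mu is held.
func exampleServiceComment(proxier *Proxier) {
	args := []string{"-A", string(kubeServicesChain)}
	args = proxier.appendServiceCommentLocked(args, `"ns/svc:http"`)
	fmt.Println(strings.Join(args, " "))
}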
   754  
   755  // Called by the iptables.Monitor, and in response to topology changes; this calls
   756  // syncProxyRules() and tells it to resync all services, regardless of whether the
   757  // Service or Endpoints/EndpointSlice objects themselves have changed.
   758  func (proxier *Proxier) forceSyncProxyRules() {
   759  	proxier.mu.Lock()
   760  	proxier.needFullSync = true
   761  	proxier.mu.Unlock()
   762  
   763  	proxier.syncProxyRules()
   764  }
   765  
   766  // This is where all of the iptables-save/restore calls happen.
   767  // The only other iptables rules are those that are setup in iptablesInit()
   768  // This assumes proxier.mu is NOT held
   769  func (proxier *Proxier) syncProxyRules() {
   770  	proxier.mu.Lock()
   771  	defer proxier.mu.Unlock()
   772  
   773  	// don't sync rules till we've received services and endpoints
   774  	if !proxier.isInitialized() {
   775  		klog.V(2).InfoS("Not syncing iptables until Services and Endpoints have been received from master")
   776  		return
   777  	}
   778  
   779  	// The value of proxier.needFullSync may change before the defer funcs run, so
   780  	// we need to keep track of whether it was set at the *start* of the sync.
   781  	tryPartialSync := !proxier.needFullSync
   782  
   783  	// Keep track of how long syncs take.
   784  	start := time.Now()
   785  	defer func() {
   786  		metrics.SyncProxyRulesLatency.Observe(metrics.SinceInSeconds(start))
   787  		if tryPartialSync {
   788  			metrics.SyncPartialProxyRulesLatency.Observe(metrics.SinceInSeconds(start))
   789  		} else {
   790  			metrics.SyncFullProxyRulesLatency.Observe(metrics.SinceInSeconds(start))
   791  		}
   792  		klog.V(2).InfoS("SyncProxyRules complete", "elapsed", time.Since(start))
   793  	}()
   794  
   795  	serviceUpdateResult := proxier.svcPortMap.Update(proxier.serviceChanges)
   796  	endpointUpdateResult := proxier.endpointsMap.Update(proxier.endpointsChanges)
   797  
   798  	klog.V(2).InfoS("Syncing iptables rules")
   799  
   800  	success := false
   801  	defer func() {
   802  		if !success {
   803  			klog.InfoS("Sync failed", "retryingTime", proxier.syncPeriod)
   804  			proxier.syncRunner.RetryAfter(proxier.syncPeriod)
   805  			if tryPartialSync {
   806  				metrics.IptablesPartialRestoreFailuresTotal.Inc()
   807  			}
   808  			// proxier.serviceChanges and proxier.endpointChanges have already
   809  			// been flushed, so we've lost the state needed to be able to do
   810  			// a partial sync.
   811  			proxier.needFullSync = true
   812  		}
   813  	}()
   814  
   815  	if !tryPartialSync {
   816  		// Ensure that our jump rules (e.g. from PREROUTING to KUBE-SERVICES) exist.
   817  		// We can't do this as part of the iptables-restore because we don't want
   818  		// to specify/replace *all* of the rules in PREROUTING, etc.
   819  		//
   820  		// We need to create these rules when kube-proxy first starts, and we need
   821  		// to recreate them if the utiliptables Monitor detects that iptables has
   822  		// been flushed. In both of those cases, the code will force a full sync.
   823  		// In all other cases, it ought to be safe to assume that the rules
   824  		// already exist, so we'll skip this step when doing a partial sync, to
   825  		// save us from having to invoke /sbin/iptables 20 times on each sync
   826  		// (which will be very slow on hosts with lots of iptables rules).
   827  		for _, jump := range append(iptablesJumpChains, iptablesKubeletJumpChains...) {
   828  			if _, err := proxier.iptables.EnsureChain(jump.table, jump.dstChain); err != nil {
   829  				klog.ErrorS(err, "Failed to ensure chain exists", "table", jump.table, "chain", jump.dstChain)
   830  				return
   831  			}
   832  			args := jump.extraArgs
   833  			if jump.comment != "" {
   834  				args = append(args, "-m", "comment", "--comment", jump.comment)
   835  			}
   836  			args = append(args, "-j", string(jump.dstChain))
   837  			if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, jump.table, jump.srcChain, args...); err != nil {
   838  				klog.ErrorS(err, "Failed to ensure chain jumps", "table", jump.table, "srcChain", jump.srcChain, "dstChain", jump.dstChain)
   839  				return
   840  			}
   841  		}
   842  	}
   843  
   844  	//
   845  	// Below this point we will not return until we try to write the iptables rules.
   846  	//
   847  
   848  	// Reset all buffers used later.
   849  	// This is to avoid memory reallocations and thus improve performance.
   850  	proxier.filterChains.Reset()
   851  	proxier.filterRules.Reset()
   852  	proxier.natChains.Reset()
   853  	proxier.natRules.Reset()
   854  
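	// skippedNatChains and skippedNatRules swallow the chains and rules for
	// services whose chains are unchanged during a partial sync; the
	// partial-sync logic further below still needs the counts they record so
	// that the metrics come out right.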
   855  	skippedNatChains := proxyutil.NewDiscardLineBuffer()
   856  	skippedNatRules := proxyutil.NewDiscardLineBuffer()
   857  
   858  	// Write chain lines for all the "top-level" chains we'll be filling in
   859  	for _, chainName := range []utiliptables.Chain{kubeServicesChain, kubeExternalServicesChain, kubeForwardChain, kubeNodePortsChain, kubeProxyFirewallChain} {
   860  		proxier.filterChains.Write(utiliptables.MakeChainLine(chainName))
   861  	}
   862  	for _, chainName := range []utiliptables.Chain{kubeServicesChain, kubeNodePortsChain, kubePostroutingChain, kubeMarkMasqChain} {
   863  		proxier.natChains.Write(utiliptables.MakeChainLine(chainName))
   864  	}
   865  
   866  	// Install the kubernetes-specific postrouting rules. We use a whole chain for
   867  	// this so that it is easier to flush and change, for example if the mark
   868  	// value should ever change.
   869  
   870  	proxier.natRules.Write(
   871  		"-A", string(kubePostroutingChain),
   872  		"-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
   873  		"-j", "RETURN",
   874  	)
   875  	// Clear the mark to avoid re-masquerading if the packet re-traverses the network stack.
   876  	proxier.natRules.Write(
   877  		"-A", string(kubePostroutingChain),
   878  		"-j", "MARK", "--xor-mark", proxier.masqueradeMark,
   879  	)
   880  	masqRule := []string{
   881  		"-A", string(kubePostroutingChain),
   882  		"-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`,
   883  		"-j", "MASQUERADE",
   884  	}
   885  	if proxier.iptables.HasRandomFully() {
   886  		masqRule = append(masqRule, "--random-fully")
   887  	}
   888  	proxier.natRules.Write(masqRule)
   889  
   890  	// Install the kubernetes-specific masquerade mark rule. We use a whole chain for
   891  	// this so that it is easier to flush and change, for example if the mark
   892  	// value should ever change.
   893  	proxier.natRules.Write(
   894  		"-A", string(kubeMarkMasqChain),
   895  		"-j", "MARK", "--or-mark", proxier.masqueradeMark,
   896  	)
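
	// For illustration (not part of the upstream file), with the default
	// masquerade bit the rules above render in iptables-save form roughly as:
	//
	//	-A KUBE-POSTROUTING -m mark ! --mark 0x004000/0x004000 -j RETURN
	//	-A KUBE-POSTROUTING -j MARK --xor-mark 0x004000
	//	-A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -j MASQUERADE
	//	-A KUBE-MARK-MASQ -j MARK --or-mark 0x004000
	//
	// with "--random-fully" appended to the MASQUERADE rule when supported.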
   897  
   898  	isIPv6 := proxier.iptables.IsIPv6()
   899  	if !isIPv6 && proxier.localhostNodePorts {
   900  		// Kube-proxy's use of `route_localnet` to enable NodePorts on localhost
   901  		// creates a security hole (https://issue.k8s.io/90259) which this
   902  		// iptables rule mitigates.
   903  
   904  		// NOTE: kubelet creates an identical copy of this rule. If you want to
   905  		// change this rule in the future, you MUST do so in a way that will
   906  		// interoperate correctly with skewed versions of the rule created by
   907  		// kubelet. (Actually, kubelet uses "--dst"/"--src" rather than "-d"/"-s"
   908  		// but that's just a command-line thing and results in the same rule being
   909  		// created in the kernel.)
   910  		proxier.filterChains.Write(utiliptables.MakeChainLine(kubeletFirewallChain))
   911  		proxier.filterRules.Write(
   912  			"-A", string(kubeletFirewallChain),
   913  			"-m", "comment", "--comment", `"block incoming localnet connections"`,
   914  			"-d", "127.0.0.0/8",
   915  			"!", "-s", "127.0.0.0/8",
   916  			"-m", "conntrack",
   917  			"!", "--ctstate", "RELATED,ESTABLISHED,DNAT",
   918  			"-j", "DROP",
   919  		)
   920  	}
   921  
   922  	// Accumulate NAT chains to keep.
   923  	activeNATChains := map[utiliptables.Chain]bool{} // use a map as a set
   924  
   925  	// To avoid growing this slice, we arbitrarily set its size to 64;
   926  	// there are never more than that many arguments for a single line.
   927  	// Note that even if we go over 64, it will still be correct - it
   928  	// is just for efficiency, not correctness.
   929  	args := make([]string, 64)
   930  
   931  	// Compute total number of endpoint chains across all services
   932  	// to get a sense of how big the cluster is.
   933  	totalEndpoints := 0
   934  	for svcName := range proxier.svcPortMap {
   935  		totalEndpoints += len(proxier.endpointsMap[svcName])
   936  	}
   937  	proxier.largeClusterMode = (totalEndpoints > largeClusterEndpointsThreshold)
   938  
   939  	// These two variables are used to publish the sync_proxy_rules_no_endpoints_total
   940  	// metric.
   941  	serviceNoLocalEndpointsTotalInternal := 0
   942  	serviceNoLocalEndpointsTotalExternal := 0
   943  
   944  	// Build rules for each service-port.
   945  	for svcName, svc := range proxier.svcPortMap {
   946  		svcInfo, ok := svc.(*servicePortInfo)
   947  		if !ok {
   948  			klog.ErrorS(nil, "Failed to cast serviceInfo", "serviceName", svcName)
   949  			continue
   950  		}
   951  		protocol := strings.ToLower(string(svcInfo.Protocol()))
   952  		svcPortNameString := svcInfo.nameString
   953  
   954  		// Figure out the endpoints for Cluster and Local traffic policy.
   955  		// allLocallyReachableEndpoints is the set of all endpoints that can be routed to
   956  		// from this node, given the service's traffic policies. hasEndpoints is true
   957  		// if the service has any usable endpoints on any node, not just this one.
   958  		allEndpoints := proxier.endpointsMap[svcName]
   959  		clusterEndpoints, localEndpoints, allLocallyReachableEndpoints, hasEndpoints := proxy.CategorizeEndpoints(allEndpoints, svcInfo, proxier.nodeLabels)
   960  
   961  		// Note the endpoint chains that will be used
   962  		for _, ep := range allLocallyReachableEndpoints {
   963  			if epInfo, ok := ep.(*endpointInfo); ok {
   964  				activeNATChains[epInfo.ChainName] = true
   965  			}
   966  		}
   967  
   968  		// clusterPolicyChain contains the endpoints used with "Cluster" traffic policy
   969  		clusterPolicyChain := svcInfo.clusterPolicyChainName
   970  		usesClusterPolicyChain := len(clusterEndpoints) > 0 && svcInfo.UsesClusterEndpoints()
   971  		if usesClusterPolicyChain {
   972  			activeNATChains[clusterPolicyChain] = true
   973  		}
   974  
   975  		// localPolicyChain contains the endpoints used with "Local" traffic policy
   976  		localPolicyChain := svcInfo.localPolicyChainName
   977  		usesLocalPolicyChain := len(localEndpoints) > 0 && svcInfo.UsesLocalEndpoints()
   978  		if usesLocalPolicyChain {
   979  			activeNATChains[localPolicyChain] = true
   980  		}
   981  
   982  		// internalPolicyChain is the chain containing the endpoints for
   983  		// "internal" (ClusterIP) traffic. internalTrafficChain is the chain that
   984  		// internal traffic is routed to (which is always the same as
   985  		// internalPolicyChain). hasInternalEndpoints is true if we should
   986  		// generate rules pointing to internalTrafficChain, or false if there are
   987  		// no available internal endpoints.
   988  		internalPolicyChain := clusterPolicyChain
   989  		hasInternalEndpoints := hasEndpoints
   990  		if svcInfo.InternalPolicyLocal() {
   991  			internalPolicyChain = localPolicyChain
   992  			if len(localEndpoints) == 0 {
   993  				hasInternalEndpoints = false
   994  			}
   995  		}
   996  		internalTrafficChain := internalPolicyChain
   997  
   998  		// Similarly, externalPolicyChain is the chain containing the endpoints
   999  		// for "external" (NodePort, LoadBalancer, and ExternalIP) traffic.
  1000  		// externalTrafficChain is the chain that external traffic is routed to
  1001  		// (which is always the service's "EXT" chain). hasExternalEndpoints is
  1002  		// true if there are endpoints that will be reached by external traffic.
  1003  		// (But we may still have to generate externalTrafficChain even if there
  1004  		// are no external endpoints, to ensure that the short-circuit rules for
  1005  		// local traffic are set up.)
  1006  		externalPolicyChain := clusterPolicyChain
  1007  		hasExternalEndpoints := hasEndpoints
  1008  		if svcInfo.ExternalPolicyLocal() {
  1009  			externalPolicyChain = localPolicyChain
  1010  			if len(localEndpoints) == 0 {
  1011  				hasExternalEndpoints = false
  1012  			}
  1013  		}
  1014  		externalTrafficChain := svcInfo.externalChainName // eventually jumps to externalPolicyChain
  1015  
  1016  		// usesExternalTrafficChain is based on hasEndpoints, not hasExternalEndpoints,
  1017  		// because we need the local-traffic-short-circuiting rules even when there
  1018  		// are no externally-usable endpoints.
  1019  		usesExternalTrafficChain := hasEndpoints && svcInfo.ExternallyAccessible()
  1020  		if usesExternalTrafficChain {
  1021  			activeNATChains[externalTrafficChain] = true
  1022  		}
  1023  
  1024  		// Traffic to LoadBalancer IPs can go directly to externalTrafficChain
  1025  		// unless LoadBalancerSourceRanges is in use in which case we will
  1026  		// create a firewall chain.
  1027  		loadBalancerTrafficChain := externalTrafficChain
  1028  		fwChain := svcInfo.firewallChainName
  1029  		usesFWChain := hasEndpoints && len(svcInfo.LoadBalancerVIPStrings()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0
  1030  		if usesFWChain {
  1031  			activeNATChains[fwChain] = true
  1032  			loadBalancerTrafficChain = fwChain
  1033  		}
  1034  
  1035  		var internalTrafficFilterTarget, internalTrafficFilterComment string
  1036  		var externalTrafficFilterTarget, externalTrafficFilterComment string
  1037  		if !hasEndpoints {
  1038  			// The service has no endpoints at all; hasInternalEndpoints and
  1039  			// hasExternalEndpoints will also be false, and we will not
  1040  			// generate any chains in the "nat" table for the service; only
  1041  			// rules in the "filter" table rejecting incoming packets for
  1042  			// the service's IPs.
  1043  			internalTrafficFilterTarget = "REJECT"
  1044  			internalTrafficFilterComment = fmt.Sprintf(`"%s has no endpoints"`, svcPortNameString)
  1045  			externalTrafficFilterTarget = "REJECT"
  1046  			externalTrafficFilterComment = internalTrafficFilterComment
  1047  		} else {
  1048  			if !hasInternalEndpoints {
  1049  				// The internalTrafficPolicy is "Local" but there are no local
  1050  				// endpoints. Traffic to the clusterIP will be dropped, but
  1051  				// external traffic may still be accepted.
  1052  				internalTrafficFilterTarget = "DROP"
  1053  				internalTrafficFilterComment = fmt.Sprintf(`"%s has no local endpoints"`, svcPortNameString)
  1054  				serviceNoLocalEndpointsTotalInternal++
  1055  			}
  1056  			if !hasExternalEndpoints {
  1057  				// The externalTrafficPolicy is "Local" but there are no
  1058  				// local endpoints. Traffic to "external" IPs from outside
  1059  				// the cluster will be dropped, but traffic from inside
  1060  				// the cluster may still be accepted.
  1061  				externalTrafficFilterTarget = "DROP"
  1062  				externalTrafficFilterComment = fmt.Sprintf(`"%s has no local endpoints"`, svcPortNameString)
  1063  				serviceNoLocalEndpointsTotalExternal++
  1064  			}
  1065  		}
  1066  
  1067  		filterRules := proxier.filterRules
  1068  		natChains := proxier.natChains
  1069  		natRules := proxier.natRules
  1070  
  1071  		// Capture the clusterIP.
  1072  		if hasInternalEndpoints {
  1073  			natRules.Write(
  1074  				"-A", string(kubeServicesChain),
  1075  				"-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcPortNameString),
  1076  				"-m", protocol, "-p", protocol,
  1077  				"-d", svcInfo.ClusterIP().String(),
  1078  				"--dport", strconv.Itoa(svcInfo.Port()),
  1079  				"-j", string(internalTrafficChain))
  1080  		} else {
  1081  			// No endpoints.
  1082  			filterRules.Write(
  1083  				"-A", string(kubeServicesChain),
  1084  				"-m", "comment", "--comment", internalTrafficFilterComment,
  1085  				"-m", protocol, "-p", protocol,
  1086  				"-d", svcInfo.ClusterIP().String(),
  1087  				"--dport", strconv.Itoa(svcInfo.Port()),
  1088  				"-j", internalTrafficFilterTarget,
  1089  			)
  1090  		}
  1091  
  1092  		// Capture externalIPs.
  1093  		for _, externalIP := range svcInfo.ExternalIPStrings() {
  1094  			if hasEndpoints {
  1095  				// Send traffic bound for external IPs to the "external
  1096  				// destinations" chain.
  1097  				natRules.Write(
  1098  					"-A", string(kubeServicesChain),
  1099  					"-m", "comment", "--comment", fmt.Sprintf(`"%s external IP"`, svcPortNameString),
  1100  					"-m", protocol, "-p", protocol,
  1101  					"-d", externalIP,
  1102  					"--dport", strconv.Itoa(svcInfo.Port()),
  1103  					"-j", string(externalTrafficChain))
  1104  			}
  1105  			if !hasExternalEndpoints {
  1106  				// Either no endpoints at all (REJECT) or no endpoints for
  1107  				// external traffic (DROP anything that didn't get
  1108  				// short-circuited by the EXT chain.)
  1109  				filterRules.Write(
  1110  					"-A", string(kubeExternalServicesChain),
  1111  					"-m", "comment", "--comment", externalTrafficFilterComment,
  1112  					"-m", protocol, "-p", protocol,
  1113  					"-d", externalIP,
  1114  					"--dport", strconv.Itoa(svcInfo.Port()),
  1115  					"-j", externalTrafficFilterTarget,
  1116  				)
  1117  			}
  1118  		}
  1119  
  1120  		// Capture load-balancer ingress.
  1121  		for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
  1122  			if hasEndpoints {
  1123  				natRules.Write(
  1124  					"-A", string(kubeServicesChain),
  1125  					"-m", "comment", "--comment", fmt.Sprintf(`"%s loadbalancer IP"`, svcPortNameString),
  1126  					"-m", protocol, "-p", protocol,
  1127  					"-d", lbip,
  1128  					"--dport", strconv.Itoa(svcInfo.Port()),
  1129  					"-j", string(loadBalancerTrafficChain))
  1130  
  1131  			}
  1132  			if usesFWChain {
  1133  				filterRules.Write(
  1134  					"-A", string(kubeProxyFirewallChain),
  1135  					"-m", "comment", "--comment", fmt.Sprintf(`"%s traffic not accepted by %s"`, svcPortNameString, svcInfo.firewallChainName),
  1136  					"-m", protocol, "-p", protocol,
  1137  					"-d", lbip,
  1138  					"--dport", strconv.Itoa(svcInfo.Port()),
  1139  					"-j", "DROP")
  1140  			}
  1141  		}
  1142  		if !hasExternalEndpoints {
  1143  			// Either no endpoints at all (REJECT) or no endpoints for
  1144  			// external traffic (DROP anything that didn't get short-circuited
  1145  			// by the EXT chain.)
  1146  			for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
  1147  				filterRules.Write(
  1148  					"-A", string(kubeExternalServicesChain),
  1149  					"-m", "comment", "--comment", externalTrafficFilterComment,
  1150  					"-m", protocol, "-p", protocol,
  1151  					"-d", lbip,
  1152  					"--dport", strconv.Itoa(svcInfo.Port()),
  1153  					"-j", externalTrafficFilterTarget,
  1154  				)
  1155  			}
  1156  		}
  1157  
  1158  		// Capture nodeports.
  1159  		if svcInfo.NodePort() != 0 {
  1160  			if hasEndpoints {
  1161  				// Jump to the external destination chain.  For better or for
  1162  				// worse, nodeports are not subject to loadBalancerSourceRanges,
  1163  				// and we can't change that.
  1164  				natRules.Write(
  1165  					"-A", string(kubeNodePortsChain),
  1166  					"-m", "comment", "--comment", svcPortNameString,
  1167  					"-m", protocol, "-p", protocol,
  1168  					"--dport", strconv.Itoa(svcInfo.NodePort()),
  1169  					"-j", string(externalTrafficChain))
  1170  			}
  1171  			if !hasExternalEndpoints {
  1172  				// Either no endpoints at all (REJECT) or no endpoints for
  1173  				// external traffic (DROP anything that didn't get
  1174  				// short-circuited by the EXT chain.)
  1175  				filterRules.Write(
  1176  					"-A", string(kubeExternalServicesChain),
  1177  					"-m", "comment", "--comment", externalTrafficFilterComment,
  1178  					"-m", "addrtype", "--dst-type", "LOCAL",
  1179  					"-m", protocol, "-p", protocol,
  1180  					"--dport", strconv.Itoa(svcInfo.NodePort()),
  1181  					"-j", externalTrafficFilterTarget,
  1182  				)
  1183  			}
  1184  		}
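        		// Sketch for a hypothetical NodePort 30080/TCP:
        		//   -A KUBE-NODEPORTS -m comment --comment ns/svc:http
        		//     -m tcp -p tcp --dport 30080 -j KUBE-EXT-XXXXXXXXXXXXXXXX
        		// Unlike the per-IP rules above, this matches on port alone;
        		// KUBE-NODEPORTS itself is only reached for traffic addressed to a
        		// node IP (see the tail-call rules written near the end of this sync).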
  1185  
  1186  		// Capture healthCheckNodePorts.
  1187  		if svcInfo.HealthCheckNodePort() != 0 {
  1188  			// No matter whether this node has local endpoints, the
  1189  			// healthCheckNodePort needs a rule to accept incoming connections.
  1190  			filterRules.Write(
  1191  				"-A", string(kubeNodePortsChain),
  1192  				"-m", "comment", "--comment", fmt.Sprintf(`"%s health check node port"`, svcPortNameString),
  1193  				"-m", "tcp", "-p", "tcp",
  1194  				"--dport", strconv.Itoa(svcInfo.HealthCheckNodePort()),
  1195  				"-j", "ACCEPT",
  1196  			)
  1197  		}
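        		// Sketch for a hypothetical healthCheckNodePort 32000:
        		//   -A KUBE-NODEPORTS -m comment --comment "ns/svc:http health check node port"
        		//     -m tcp -p tcp --dport 32000 -j ACCEPT
        		// This is always TCP, since the health check endpoint is an HTTP server.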
  1198  
  1199  		// If the SVC/SVL/EXT/FW/SEP chains have not changed since the last sync
  1200  		// then we can omit them from the restore input. However, we have to still
  1201  		// figure out how many chains we _would_ have written, to make the metrics
  1202  		// come out right, so we just compute them and throw them away.
  1203  		if tryPartialSync && !serviceUpdateResult.UpdatedServices.Has(svcName.NamespacedName) && !endpointUpdateResult.UpdatedServices.Has(svcName.NamespacedName) {
  1204  			natChains = skippedNatChains
  1205  			natRules = skippedNatRules
  1206  		}
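        		// (skippedNatChains and skippedNatRules are count-only buffers: the
        		// Writes below are still tallied for the metrics, but produce no
        		// restore input.)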
  1207  
  1208  		// Set up internal traffic handling.
  1209  		if hasInternalEndpoints {
  1210  			args = append(args[:0],
  1211  				"-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcPortNameString),
  1212  				"-m", protocol, "-p", protocol,
  1213  				"-d", svcInfo.ClusterIP().String(),
  1214  				"--dport", strconv.Itoa(svcInfo.Port()),
  1215  			)
  1216  			if proxier.masqueradeAll {
  1217  				natRules.Write(
  1218  					"-A", string(internalTrafficChain),
  1219  					args,
  1220  					"-j", string(kubeMarkMasqChain))
  1221  			} else if proxier.localDetector.IsImplemented() {
  1222  				// This masquerades off-cluster traffic to a service VIP. The
  1223  				// idea is that you can establish a static route for your
  1224  				// Service range, routing to any node, and that node will
  1225  				// bridge into the Service for you. Since that might bounce
  1226  				// off-node, we masquerade here.
  1227  				natRules.Write(
  1228  					"-A", string(internalTrafficChain),
  1229  					args,
  1230  					proxier.localDetector.IfNotLocal(),
  1231  					"-j", string(kubeMarkMasqChain))
  1232  			}
  1233  		}
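        		// Sketch, assuming a hypothetical cluster CIDR 10.0.0.0/8 known to
        		// the localDetector and cluster IP 172.30.0.10: the non-masqueradeAll
        		// rule above renders roughly as:
        		//   -A KUBE-SVC-XXXXXXXXXXXXXXXX -m comment --comment "ns/svc:http cluster IP"
        		//     -m tcp -p tcp -d 172.30.0.10 --dport 80 ! -s 10.0.0.0/8 -j KUBE-MARK-MASQ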
  1234  
  1235  		// Set up external traffic handling (if any "external" destinations are
  1236  		// enabled). All captured traffic for all external destinations should
  1237  		// jump to externalTrafficChain, which will handle some special cases and
  1238  		// then jump to externalPolicyChain.
  1239  		if usesExternalTrafficChain {
  1240  			natChains.Write(utiliptables.MakeChainLine(externalTrafficChain))
  1241  
  1242  			if !svcInfo.ExternalPolicyLocal() {
  1243  				// If we are using non-local endpoints we need to masquerade,
  1244  				// in case we cross nodes.
  1245  				natRules.Write(
  1246  					"-A", string(externalTrafficChain),
  1247  					"-m", "comment", "--comment", fmt.Sprintf(`"masquerade traffic for %s external destinations"`, svcPortNameString),
  1248  					"-j", string(kubeMarkMasqChain))
  1249  			} else {
  1250  				// If we are only using same-node endpoints, we can retain the
  1251  				// source IP in most cases.
  1252  
  1253  				if proxier.localDetector.IsImplemented() {
  1254  					// Treat all locally-originated pod -> external destination
  1255  					// traffic as a special-case.  It is subject to neither
  1256  					// form of traffic policy, which simulates going up-and-out
  1257  					// to an external load-balancer and coming back in.
  1258  					natRules.Write(
  1259  						"-A", string(externalTrafficChain),
  1260  						"-m", "comment", "--comment", fmt.Sprintf(`"pod traffic for %s external destinations"`, svcPortNameString),
  1261  						proxier.localDetector.IfLocal(),
  1262  						"-j", string(clusterPolicyChain))
  1263  				}
  1264  
  1265  				// Locally originated traffic (not a pod, but the host node)
  1266  				// still needs masquerade because the LBIP itself is a local
  1267  				// address, so that will be the chosen source IP.
  1268  				natRules.Write(
  1269  					"-A", string(externalTrafficChain),
  1270  					"-m", "comment", "--comment", fmt.Sprintf(`"masquerade LOCAL traffic for %s external destinations"`, svcPortNameString),
  1271  					"-m", "addrtype", "--src-type", "LOCAL",
  1272  					"-j", string(kubeMarkMasqChain))
  1273  
  1274  				// Redirect all src-type=LOCAL -> external destination traffic
  1275  				// to the policy=cluster chain, so that traffic originating
  1276  				// from the host reaches the service correctly.
  1277  				natRules.Write(
  1278  					"-A", string(externalTrafficChain),
  1279  					"-m", "comment", "--comment", fmt.Sprintf(`"route LOCAL traffic for %s external destinations"`, svcPortNameString),
  1280  					"-m", "addrtype", "--src-type", "LOCAL",
  1281  					"-j", string(clusterPolicyChain))
  1282  			}
  1283  
  1284  			// Anything else falls thru to the appropriate policy chain.
  1285  			if hasExternalEndpoints {
  1286  				natRules.Write(
  1287  					"-A", string(externalTrafficChain),
  1288  					"-j", string(externalPolicyChain))
  1289  			}
  1290  		}
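        		// Taken together, the KUBE-EXT- chain for an externalTrafficPolicy=Local
        		// Service looks roughly like (placeholder names):
        		//   -A KUBE-EXT-XXX <pod-local src> -j KUBE-SVC-XXX
        		//   -A KUBE-EXT-XXX -m addrtype --src-type LOCAL -j KUBE-MARK-MASQ
        		//   -A KUBE-EXT-XXX -m addrtype --src-type LOCAL -j KUBE-SVC-XXX
        		//   -A KUBE-EXT-XXX -j KUBE-SVL-XXX
        		// where KUBE-SVC-XXX applies Cluster policy and KUBE-SVL-XXX Local policy.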
  1291  
  1292  		// Set up firewall chain, if needed
  1293  		if usesFWChain {
  1294  			natChains.Write(utiliptables.MakeChainLine(fwChain))
  1295  
  1296  			// The service firewall rules are created based on the
  1297  			// loadBalancerSourceRanges field. This only works for VIP-like
  1298  			// loadbalancers that preserve source IPs. For loadbalancers which
  1299  			// direct traffic to service NodePort, the firewall rules will not
  1300  			// apply.
  1301  			args = append(args[:0],
  1302  				"-A", string(fwChain),
  1303  				"-m", "comment", "--comment", fmt.Sprintf(`"%s loadbalancer IP"`, svcPortNameString),
  1304  			)
  1305  
  1306  			// firewall filter based on each source range
  1307  			allowFromNode := false
  1308  			for _, src := range svcInfo.LoadBalancerSourceRanges() {
  1309  				natRules.Write(args, "-s", src, "-j", string(externalTrafficChain))
  1310  				_, cidr, err := netutils.ParseCIDRSloppy(src)
  1311  				if err != nil {
  1312  					klog.ErrorS(err, "Error parsing CIDR in LoadBalancerSourceRanges, dropping it", "cidr", src)
  1313  				} else if cidr.Contains(proxier.nodeIP) {
  1314  					allowFromNode = true
  1315  				}
  1316  			}
  1317  			// For VIP-like LBs, the VIP is often added as a local
  1318  			// address (via an IP route rule).  In that case, a request
  1319  			// from a node to the VIP will not hit the loadbalancer but
  1320  			// will loop back with the source IP set to the VIP.  We
  1321  			// need the following rules to allow requests from this node.
  1322  			if allowFromNode {
  1323  				for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
  1324  					natRules.Write(
  1325  						args,
  1326  						"-s", lbip,
  1327  						"-j", string(externalTrafficChain))
  1328  				}
  1329  			}
  1330  			// If the packet was able to reach the end of the firewall chain,
  1331  			// then it did not get DNATed, so it will match the corresponding
  1332  			// KUBE-PROXY-FIREWALL rule.
  1333  			natRules.Write(
  1334  				"-A", string(fwChain),
  1335  				"-m", "comment", "--comment", fmt.Sprintf(`"other traffic to %s will be dropped by KUBE-PROXY-FIREWALL"`, svcPortNameString),
  1336  			)
  1337  		}
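        		// Note that the rule just written has no "-j" target: it exists only
        		// so the comment shows up in rule listings. A full KUBE-FW- chain for
        		// a hypothetical source range 198.51.100.0/24 looks roughly like:
        		//   -A KUBE-FW-XXX ... -s 198.51.100.0/24 -j KUBE-EXT-XXX
        		//   -A KUBE-FW-XXX -m comment --comment "other traffic to ns/svc:http will be dropped by KUBE-PROXY-FIREWALL"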
  1338  
  1339  		// If Cluster policy is in use, create the chain and create rules jumping
  1340  		// from clusterPolicyChain to the clusterEndpoints
  1341  		if usesClusterPolicyChain {
  1342  			natChains.Write(utiliptables.MakeChainLine(clusterPolicyChain))
  1343  			proxier.writeServiceToEndpointRules(natRules, svcPortNameString, svcInfo, clusterPolicyChain, clusterEndpoints, args)
  1344  		}
  1345  
  1346  		// If Local policy is in use, create the chain and create rules jumping
  1347  		// from localPolicyChain to the localEndpoints
  1348  		if usesLocalPolicyChain {
  1349  			natChains.Write(utiliptables.MakeChainLine(localPolicyChain))
  1350  			proxier.writeServiceToEndpointRules(natRules, svcPortNameString, svcInfo, localPolicyChain, localEndpoints, args)
  1351  		}
  1352  
  1353  		// Generate the per-endpoint chains.
  1354  		for _, ep := range allLocallyReachableEndpoints {
  1355  			epInfo, ok := ep.(*endpointInfo)
  1356  			if !ok {
  1357  				klog.ErrorS(nil, "Failed to cast endpointInfo", "endpointInfo", ep)
  1358  				continue
  1359  			}
  1360  
  1361  			endpointChain := epInfo.ChainName
  1362  
  1363  			// Create the endpoint chain
  1364  			natChains.Write(utiliptables.MakeChainLine(endpointChain))
  1365  			activeNATChains[endpointChain] = true
  1366  
  1367  			args = append(args[:0], "-A", string(endpointChain))
  1368  			args = proxier.appendServiceCommentLocked(args, svcPortNameString)
  1369  			// Handle traffic that loops back to the originator with SNAT.
  1370  			natRules.Write(
  1371  				args,
  1372  				"-s", epInfo.IP(),
  1373  				"-j", string(kubeMarkMasqChain))
  1374  			// Update client-affinity lists.
  1375  			if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
  1376  				args = append(args, "-m", "recent", "--name", string(endpointChain), "--set")
  1377  			}
  1378  			// DNAT to final destination.
  1379  			args = append(args, "-m", protocol, "-p", protocol, "-j", "DNAT", "--to-destination", epInfo.String())
  1380  			natRules.Write(args)
  1381  		}
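        		// A rendered endpoint chain for a hypothetical endpoint 10.0.1.5:8080
        		// looks roughly like:
        		//   :KUBE-SEP-XXXXXXXXXXXXXXXX - [0:0]
        		//   -A KUBE-SEP-XXX ... -s 10.0.1.5 -j KUBE-MARK-MASQ
        		//   -A KUBE-SEP-XXX ... -m tcp -p tcp -j DNAT --to-destination 10.0.1.5:8080
        		// The first rule masquerades hairpin traffic, i.e. an endpoint
        		// reaching its own Service and being load-balanced back to itself.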
  1382  	}
  1383  
  1384  	// Delete chains no longer in use. Since "iptables-save" can take several seconds
  1385  	// to run on hosts with lots of iptables rules, we don't bother to do this on
  1386  	// every sync in large clusters. (Stale chains will not be referenced by any
  1387  	// active rules, so they're harmless other than taking up memory.)
  1388  	deletedChains := 0
  1389  	if !proxier.largeClusterMode || time.Since(proxier.lastIPTablesCleanup) > proxier.syncPeriod {
  1390  		var existingNATChains map[utiliptables.Chain]struct{}
  1391  
  1392  		proxier.iptablesData.Reset()
  1393  		if err := proxier.iptables.SaveInto(utiliptables.TableNAT, proxier.iptablesData); err == nil {
  1394  			existingNATChains = utiliptables.GetChainsFromTable(proxier.iptablesData.Bytes())
  1395  
  1396  			for chain := range existingNATChains {
  1397  				if !activeNATChains[chain] {
  1398  					chainString := string(chain)
  1399  					if !isServiceChainName(chainString) {
  1400  						// Ignore chains that aren't ours.
  1401  						continue
  1402  					}
  1403  					// We must (as per iptables) write a chain-line
  1404  					// for it, which has the nice effect of flushing
  1405  					// the chain. Then we can remove the chain.
  1406  					proxier.natChains.Write(utiliptables.MakeChainLine(chain))
  1407  					proxier.natRules.Write("-X", chainString)
  1408  					deletedChains++
  1409  				}
  1410  			}
  1411  			proxier.lastIPTablesCleanup = time.Now()
  1412  		} else {
  1413  			klog.ErrorS(err, "Failed to execute iptables-save: stale chains will not be deleted")
  1414  		}
  1415  	}
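        	// In the restore input, deleting a stale chain therefore looks like
        	// (placeholder name):
        	//   :KUBE-SEP-STALEXXXXXXXXXXXXX - [0:0]   <- declares and flushes it
        	//   -X KUBE-SEP-STALEXXXXXXXXXXXXX         <- then deletes it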
  1416  
  1417  	// Finally, tail-call to the nodePorts chain.  This needs to be after all
  1418  	// other service portal rules.
  1419  	if proxier.nodePortAddresses.MatchAll() {
  1420  		destinations := []string{"-m", "addrtype", "--dst-type", "LOCAL"}
  1421  		// Block localhost nodePorts if they are not supported. (For IPv6 they never
  1422  		// work, and for IPv4 they only work if we previously set `route_localnet`.)
  1423  		if isIPv6 {
  1424  			destinations = append(destinations, "!", "-d", "::1/128")
  1425  		} else if !proxier.localhostNodePorts {
  1426  			destinations = append(destinations, "!", "-d", "127.0.0.0/8")
  1427  		}
  1428  
  1429  		proxier.natRules.Write(
  1430  			"-A", string(kubeServicesChain),
  1431  			"-m", "comment", "--comment", `"kubernetes service nodeports; NOTE: this must be the last rule in this chain"`,
  1432  			destinations,
  1433  			"-j", string(kubeNodePortsChain))
  1434  	} else {
  1435  		nodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer)
  1436  		if err != nil {
  1437  			klog.ErrorS(err, "Failed to get node IP addresses matching nodeport CIDRs; Services with NodePorts may not work as intended", "CIDRs", proxier.nodePortAddresses)
  1438  		}
  1439  		for _, ip := range nodeIPs {
  1440  			if ip.IsLoopback() {
  1441  				if isIPv6 {
  1442  					klog.ErrorS(nil, "--nodeport-addresses includes localhost but localhost NodePorts are not supported on IPv6", "address", ip.String())
  1443  					continue
  1444  				} else if !proxier.localhostNodePorts {
  1445  					klog.ErrorS(nil, "--nodeport-addresses includes localhost but --iptables-localhost-nodeports=false was passed", "address", ip.String())
  1446  					continue
  1447  				}
  1448  			}
  1449  
  1450  			// Create a nodeport jump rule for each eligible node IP.
  1451  			proxier.natRules.Write(
  1452  				"-A", string(kubeServicesChain),
  1453  				"-m", "comment", "--comment", `"kubernetes service nodeports; NOTE: this must be the last rule in this chain"`,
  1454  				"-d", ip.String(),
  1455  				"-j", string(kubeNodePortsChain))
  1456  		}
  1457  	}
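        	// Either way, each tail-call renders roughly as (MatchAll case shown):
        	//   -A KUBE-SERVICES -m comment --comment "kubernetes service nodeports; ..."
        	//     -m addrtype --dst-type LOCAL -j KUBE-NODEPORTS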
  1458  
  1459  	// Drop packets in the INVALID state, which could otherwise cause
  1460  	// unexpected connection resets if nf_conntrack_tcp_be_liberal is not set.
  1461  	// Ref: https://github.com/kubernetes/kubernetes/issues/74839
  1462  	// Ref: https://github.com/kubernetes/kubernetes/issues/117924
  1463  	if !proxier.conntrackTCPLiberal {
  1464  		proxier.filterRules.Write(
  1465  			"-A", string(kubeForwardChain),
  1466  			"-m", "conntrack",
  1467  			"--ctstate", "INVALID",
  1468  			"-j", "DROP",
  1469  		)
  1470  	}
  1471  
  1472  	// If the masqueradeMark has been added then we want to forward that same
  1473  	// traffic; this allows NodePort traffic to be forwarded even if the default
  1474  	// FORWARD policy is not ACCEPT.
  1475  	proxier.filterRules.Write(
  1476  		"-A", string(kubeForwardChain),
  1477  		"-m", "comment", "--comment", `"kubernetes forwarding rules"`,
  1478  		"-m", "mark", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
  1479  		"-j", "ACCEPT",
  1480  	)
  1481  
  1482  	// The following rule ensures that follow-up traffic on connections whose
  1483  	// initial packet was accepted by the "kubernetes forwarding rules" rule above is also accepted.
  1484  	proxier.filterRules.Write(
  1485  		"-A", string(kubeForwardChain),
  1486  		"-m", "comment", "--comment", `"kubernetes forwarding conntrack rule"`,
  1487  		"-m", "conntrack",
  1488  		"--ctstate", "RELATED,ESTABLISHED",
  1489  		"-j", "ACCEPT",
  1490  	)
  1491  
  1492  	metrics.IptablesRulesTotal.WithLabelValues(string(utiliptables.TableFilter)).Set(float64(proxier.filterRules.Lines()))
  1493  	metrics.IptablesRulesLastSync.WithLabelValues(string(utiliptables.TableFilter)).Set(float64(proxier.filterRules.Lines()))
  1494  	metrics.IptablesRulesTotal.WithLabelValues(string(utiliptables.TableNAT)).Set(float64(proxier.natRules.Lines() + skippedNatRules.Lines() - deletedChains))
  1495  	metrics.IptablesRulesLastSync.WithLabelValues(string(utiliptables.TableNAT)).Set(float64(proxier.natRules.Lines() - deletedChains))
  1496  
  1497  	// Sync rules.
  1498  	proxier.iptablesData.Reset()
  1499  	proxier.iptablesData.WriteString("*filter\n")
  1500  	proxier.iptablesData.Write(proxier.filterChains.Bytes())
  1501  	proxier.iptablesData.Write(proxier.filterRules.Bytes())
  1502  	proxier.iptablesData.WriteString("COMMIT\n")
  1503  	proxier.iptablesData.WriteString("*nat\n")
  1504  	proxier.iptablesData.Write(proxier.natChains.Bytes())
  1505  	proxier.iptablesData.Write(proxier.natRules.Bytes())
  1506  	proxier.iptablesData.WriteString("COMMIT\n")
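        	// The assembled restore input thus has the shape (heavily abridged):
        	//   *filter
        	//   :KUBE-SERVICES - [0:0]
        	//   ...filter rules...
        	//   COMMIT
        	//   *nat
        	//   :KUBE-SERVICES - [0:0]
        	//   ...nat rules...
        	//   COMMIT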
  1507  
  1508  	klog.V(2).InfoS("Reloading service iptables data",
  1509  		"numServices", len(proxier.svcPortMap),
  1510  		"numEndpoints", totalEndpoints,
  1511  		"numFilterChains", proxier.filterChains.Lines(),
  1512  		"numFilterRules", proxier.filterRules.Lines(),
  1513  		"numNATChains", proxier.natChains.Lines(),
  1514  		"numNATRules", proxier.natRules.Lines(),
  1515  	)
  1516  	klog.V(9).InfoS("Restoring iptables", "rules", proxier.iptablesData.Bytes())
  1517  
  1518  	// NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table
  1519  	err := proxier.iptables.RestoreAll(proxier.iptablesData.Bytes(), utiliptables.NoFlushTables, utiliptables.RestoreCounters)
  1520  	if err != nil {
  1521  		if pErr, ok := err.(utiliptables.ParseError); ok {
  1522  			lines := utiliptables.ExtractLines(proxier.iptablesData.Bytes(), pErr.Line(), 3)
  1523  			klog.ErrorS(pErr, "Failed to execute iptables-restore", "rules", lines)
  1524  		} else {
  1525  			klog.ErrorS(err, "Failed to execute iptables-restore")
  1526  		}
  1527  		metrics.IptablesRestoreFailuresTotal.Inc()
  1528  		return
  1529  	}
  1530  	success = true
  1531  	proxier.needFullSync = false
  1532  
  1533  	for name, lastChangeTriggerTimes := range endpointUpdateResult.LastChangeTriggerTimes {
  1534  		for _, lastChangeTriggerTime := range lastChangeTriggerTimes {
  1535  			latency := metrics.SinceInSeconds(lastChangeTriggerTime)
  1536  			metrics.NetworkProgrammingLatency.Observe(latency)
  1537  			klog.V(4).InfoS("Network programming", "endpoint", klog.KRef(name.Namespace, name.Name), "elapsed", latency)
  1538  		}
  1539  	}
  1540  
  1541  	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("internal").Set(float64(serviceNoLocalEndpointsTotalInternal))
  1542  	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("external").Set(float64(serviceNoLocalEndpointsTotalExternal))
  1543  	if proxier.healthzServer != nil {
  1544  		proxier.healthzServer.Updated(proxier.ipFamily)
  1545  	}
  1546  	metrics.SyncProxyRulesLastTimestamp.SetToCurrentTime()
  1547  
  1548  	// Update service healthchecks.  The endpoints list might include services that are
  1549  	// not "OnlyLocal", but the services list will not, and the serviceHealthServer
  1550  	// will just drop those endpoints.
  1551  	if err := proxier.serviceHealthServer.SyncServices(proxier.svcPortMap.HealthCheckNodePorts()); err != nil {
  1552  		klog.ErrorS(err, "Error syncing healthcheck services")
  1553  	}
  1554  	if err := proxier.serviceHealthServer.SyncEndpoints(proxier.endpointsMap.LocalReadyEndpoints()); err != nil {
  1555  		klog.ErrorS(err, "Error syncing healthcheck endpoints")
  1556  	}
  1557  
  1558  	// Finish housekeeping: clear stale conntrack entries for UDP Services.
  1559  	conntrack.CleanStaleEntries(proxier.iptables.IsIPv6(), proxier.exec, proxier.svcPortMap, serviceUpdateResult, endpointUpdateResult)
  1560  }
  1561  
  1562  func (proxier *Proxier) writeServiceToEndpointRules(natRules proxyutil.LineBuffer, svcPortNameString string, svcInfo proxy.ServicePort, svcChain utiliptables.Chain, endpoints []proxy.Endpoint, args []string) {
  1563  	// First write session affinity rules, if applicable.
  1564  	if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
  1565  		for _, ep := range endpoints {
  1566  			epInfo, ok := ep.(*endpointInfo)
  1567  			if !ok {
  1568  				continue
  1569  			}
  1570  			comment := fmt.Sprintf(`"%s -> %s"`, svcPortNameString, epInfo.String())
  1571  
  1572  			args = append(args[:0],
  1573  				"-A", string(svcChain),
  1574  			)
  1575  			args = proxier.appendServiceCommentLocked(args, comment)
  1576  			args = append(args,
  1577  				"-m", "recent", "--name", string(epInfo.ChainName),
  1578  				"--rcheck", "--seconds", strconv.Itoa(svcInfo.StickyMaxAgeSeconds()), "--reap",
  1579  				"-j", string(epInfo.ChainName),
  1580  			)
  1581  			natRules.Write(args)
  1582  		}
  1583  	}
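        	// Sketch of one affinity rule (placeholder names; 10800 is the default
        	// sticky age): clients recorded by the endpoint chain within the window
        	// are sent straight back to the same endpoint:
        	//   -A KUBE-SVC-XXX ... -m recent --name KUBE-SEP-XXX --rcheck
        	//     --seconds 10800 --reap -j KUBE-SEP-XXX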
  1584  
  1585  	// Now write loadbalancing rules.
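        	// Endpoint i (0-indexed) is matched with probability 1/(numEndpoints-i),
        	// which yields a uniform distribution overall: e.g. with 3 endpoints the
        	// rules match 1/3 of traffic, then 1/2 of the remainder, then everything
        	// left, i.e. 1/3 each.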
  1586  	numEndpoints := len(endpoints)
  1587  	for i, ep := range endpoints {
  1588  		epInfo, ok := ep.(*endpointInfo)
  1589  		if !ok {
  1590  			continue
  1591  		}
  1592  		comment := fmt.Sprintf(`"%s -> %s"`, svcPortNameString, epInfo.String())
  1593  
  1594  		args = append(args[:0], "-A", string(svcChain))
  1595  		args = proxier.appendServiceCommentLocked(args, comment)
  1596  		if i < (numEndpoints - 1) {
  1597  			// Each rule is a probabilistic match.
  1598  			args = append(args,
  1599  				"-m", "statistic",
  1600  				"--mode", "random",
  1601  				"--probability", proxier.probability(numEndpoints-i))
  1602  		}
  1603  		// The final rule (the only rule, when numEndpoints == 1) is a guaranteed match.
  1604  		natRules.Write(args, "-j", string(epInfo.ChainName))
  1605  	}
  1606  }