k8s.io/kubernetes@v1.29.3/pkg/proxy/iptables/proxier.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package iptables 18 19 // 20 // NOTE: this needs to be tested in e2e since it uses iptables for everything. 21 // 22 23 import ( 24 "bytes" 25 "crypto/sha256" 26 "encoding/base32" 27 "fmt" 28 "net" 29 "reflect" 30 "strconv" 31 "strings" 32 "sync" 33 "sync/atomic" 34 "time" 35 36 v1 "k8s.io/api/core/v1" 37 discovery "k8s.io/api/discovery/v1" 38 "k8s.io/apimachinery/pkg/types" 39 "k8s.io/apimachinery/pkg/util/wait" 40 "k8s.io/client-go/tools/events" 41 utilsysctl "k8s.io/component-helpers/node/util/sysctl" 42 "k8s.io/klog/v2" 43 "k8s.io/kubernetes/pkg/proxy" 44 "k8s.io/kubernetes/pkg/proxy/conntrack" 45 "k8s.io/kubernetes/pkg/proxy/healthcheck" 46 "k8s.io/kubernetes/pkg/proxy/metaproxier" 47 "k8s.io/kubernetes/pkg/proxy/metrics" 48 proxyutil "k8s.io/kubernetes/pkg/proxy/util" 49 proxyutiliptables "k8s.io/kubernetes/pkg/proxy/util/iptables" 50 "k8s.io/kubernetes/pkg/util/async" 51 utiliptables "k8s.io/kubernetes/pkg/util/iptables" 52 utilexec "k8s.io/utils/exec" 53 netutils "k8s.io/utils/net" 54 ) 55 56 const ( 57 // the services chain 58 kubeServicesChain utiliptables.Chain = "KUBE-SERVICES" 59 60 // the external services chain 61 kubeExternalServicesChain utiliptables.Chain = "KUBE-EXTERNAL-SERVICES" 62 63 // the nodeports chain 64 kubeNodePortsChain utiliptables.Chain = "KUBE-NODEPORTS" 65 66 // the kubernetes postrouting chain 67 kubePostroutingChain utiliptables.Chain = "KUBE-POSTROUTING" 68 69 // kubeMarkMasqChain is the mark-for-masquerade chain 70 kubeMarkMasqChain utiliptables.Chain = "KUBE-MARK-MASQ" 71 72 // the kubernetes forward chain 73 kubeForwardChain utiliptables.Chain = "KUBE-FORWARD" 74 75 // kubeProxyFirewallChain is the kube-proxy firewall chain 76 kubeProxyFirewallChain utiliptables.Chain = "KUBE-PROXY-FIREWALL" 77 78 // kube proxy canary chain is used for monitoring rule reload 79 kubeProxyCanaryChain utiliptables.Chain = "KUBE-PROXY-CANARY" 80 81 // kubeletFirewallChain is a duplicate of kubelet's firewall containing 82 // the anti-martian-packet rule. It should not be used for any other 83 // rules. 84 kubeletFirewallChain utiliptables.Chain = "KUBE-FIREWALL" 85 86 // largeClusterEndpointsThreshold is the number of endpoints at which 87 // we switch into "large cluster mode" and optimize for iptables 88 // performance over iptables debuggability 89 largeClusterEndpointsThreshold = 1000 90 ) 91 92 const sysctlRouteLocalnet = "net/ipv4/conf/all/route_localnet" 93 const sysctlNFConntrackTCPBeLiberal = "net/netfilter/nf_conntrack_tcp_be_liberal" 94 95 // internal struct for string service information 96 type servicePortInfo struct { 97 *proxy.BaseServicePortInfo 98 // The following fields are computed and stored for performance reasons. 
99 nameString string 100 clusterPolicyChainName utiliptables.Chain 101 localPolicyChainName utiliptables.Chain 102 firewallChainName utiliptables.Chain 103 externalChainName utiliptables.Chain 104 } 105 106 // returns a new proxy.ServicePort which abstracts a serviceInfo 107 func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *proxy.BaseServicePortInfo) proxy.ServicePort { 108 svcPort := &servicePortInfo{BaseServicePortInfo: bsvcPortInfo} 109 110 // Store the following for performance reasons. 111 svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name} 112 svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name} 113 protocol := strings.ToLower(string(svcPort.Protocol())) 114 svcPort.nameString = svcPortName.String() 115 svcPort.clusterPolicyChainName = servicePortPolicyClusterChain(svcPort.nameString, protocol) 116 svcPort.localPolicyChainName = servicePortPolicyLocalChainName(svcPort.nameString, protocol) 117 svcPort.firewallChainName = serviceFirewallChainName(svcPort.nameString, protocol) 118 svcPort.externalChainName = serviceExternalChainName(svcPort.nameString, protocol) 119 120 return svcPort 121 } 122 123 // internal struct for endpoints information 124 type endpointInfo struct { 125 *proxy.BaseEndpointInfo 126 127 ChainName utiliptables.Chain 128 } 129 130 // returns a new proxy.Endpoint which abstracts an endpointInfo 131 func newEndpointInfo(baseInfo *proxy.BaseEndpointInfo, svcPortName *proxy.ServicePortName) proxy.Endpoint { 132 return &endpointInfo{ 133 BaseEndpointInfo: baseInfo, 134 ChainName: servicePortEndpointChainName(svcPortName.String(), strings.ToLower(string(svcPortName.Protocol)), baseInfo.String()), 135 } 136 } 137 138 // Proxier is an iptables-based proxy for connections between a localhost:lport 139 // and services that provide the actual backends. 140 type Proxier struct { 141 // ipFamily defines the IP family which this proxier is tracking. 142 ipFamily v1.IPFamily 143 144 // endpointsChanges and serviceChanges contain all changes to endpoints and 145 // services that happened since iptables was synced. For a single object, 146 // changes are accumulated, i.e. previous is the state from before all of them, 147 // and current is the state after applying all of them. 148 endpointsChanges *proxy.EndpointsChangeTracker 149 serviceChanges *proxy.ServiceChangeTracker 150 151 mu sync.Mutex // protects the following fields 152 svcPortMap proxy.ServicePortMap 153 endpointsMap proxy.EndpointsMap 154 nodeLabels map[string]string 155 // endpointSlicesSynced and servicesSynced are set to true 156 // when corresponding objects are synced after startup. This is used to avoid 157 // updating iptables with some partial data after kube-proxy restart. 158 endpointSlicesSynced bool 159 servicesSynced bool 160 needFullSync bool 161 initialized int32 162 syncRunner *async.BoundedFrequencyRunner // governs calls to syncProxyRules 163 syncPeriod time.Duration 164 lastIPTablesCleanup time.Time 165 166 // These are effectively const and do not need the mutex to be held.
167 iptables utiliptables.Interface 168 masqueradeAll bool 169 masqueradeMark string 170 exec utilexec.Interface 171 localDetector proxyutiliptables.LocalTrafficDetector 172 hostname string 173 nodeIP net.IP 174 recorder events.EventRecorder 175 176 serviceHealthServer healthcheck.ServiceHealthServer 177 healthzServer *healthcheck.ProxierHealthServer 178 179 // Since converting probabilities (floats) to strings is expensive 180 // and we are using only probabilities in the format of 1/n, we are 181 // precomputing some number of those and cache for future reuse. 182 precomputedProbabilities []string 183 184 // The following buffers are used to reuse memory and avoid allocations 185 // that are significantly impacting performance. 186 iptablesData *bytes.Buffer 187 existingFilterChainsData *bytes.Buffer 188 filterChains proxyutil.LineBuffer 189 filterRules proxyutil.LineBuffer 190 natChains proxyutil.LineBuffer 191 natRules proxyutil.LineBuffer 192 193 // largeClusterMode is set at the beginning of syncProxyRules if we are 194 // going to end up outputting "lots" of iptables rules and so we need to 195 // optimize for performance over debuggability. 196 largeClusterMode bool 197 198 // localhostNodePorts indicates whether we allow NodePort services to be accessed 199 // via localhost. 200 localhostNodePorts bool 201 202 // conntrackTCPLiberal indicates whether the system sets the kernel nf_conntrack_tcp_be_liberal 203 conntrackTCPLiberal bool 204 205 // nodePortAddresses selects the interfaces where nodePort works. 206 nodePortAddresses *proxyutil.NodePortAddresses 207 // networkInterfacer defines an interface for several net library functions. 208 // Inject for test purpose. 209 networkInterfacer proxyutil.NetworkInterfacer 210 } 211 212 // Proxier implements proxy.Provider 213 var _ proxy.Provider = &Proxier{} 214 215 // NewProxier returns a new Proxier given an iptables Interface instance. 216 // Because of the iptables logic, it is assumed that there is only a single Proxier active on a machine. 217 // An error will be returned if iptables fails to update or acquire the initial lock. 218 // Once a proxier is created, it will keep iptables up to date in the background and 219 // will not terminate if a particular iptables call fails. 
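// A minimal construction sketch (illustrative only, not from the original
// source; argument names follow the parameters below and all values are
// hypothetical):
//
//	proxier, err := NewProxier(v1.IPv4Protocol, ipt, sysctl, execer,
//		30*time.Second, 1*time.Second, false /* masqueradeAll */,
//		true /* localhostNodePorts */, 14 /* masqueradeBit */,
//		localDetector, hostname, nodeIP, recorder, healthzServer,
//		nil /* nodePortAddressStrings */, false /* initOnly */)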
220 func NewProxier(ipFamily v1.IPFamily, 221 ipt utiliptables.Interface, 222 sysctl utilsysctl.Interface, 223 exec utilexec.Interface, 224 syncPeriod time.Duration, 225 minSyncPeriod time.Duration, 226 masqueradeAll bool, 227 localhostNodePorts bool, 228 masqueradeBit int, 229 localDetector proxyutiliptables.LocalTrafficDetector, 230 hostname string, 231 nodeIP net.IP, 232 recorder events.EventRecorder, 233 healthzServer *healthcheck.ProxierHealthServer, 234 nodePortAddressStrings []string, 235 initOnly bool, 236 ) (*Proxier, error) { 237 nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings) 238 239 if !nodePortAddresses.ContainsIPv4Loopback() { 240 localhostNodePorts = false 241 } 242 if localhostNodePorts { 243 // Set the route_localnet sysctl we need for exposing NodePorts on loopback addresses 244 // Refer to https://issues.k8s.io/90259 245 klog.InfoS("Setting route_localnet=1 to allow node-ports on localhost; to change this either disable iptables.localhostNodePorts (--iptables-localhost-nodeports) or set nodePortAddresses (--nodeport-addresses) to filter loopback addresses") 246 if err := proxyutil.EnsureSysctl(sysctl, sysctlRouteLocalnet, 1); err != nil { 247 return nil, err 248 } 249 } 250 251 // Be conservative in what you do, be liberal in what you accept from others. 252 // If it's non-zero, we mark only out of window RST segments as INVALID. 253 // Ref: https://docs.kernel.org/networking/nf_conntrack-sysctl.html 254 conntrackTCPLiberal := false 255 if val, err := sysctl.GetSysctl(sysctlNFConntrackTCPBeLiberal); err == nil && val != 0 { 256 conntrackTCPLiberal = true 257 klog.InfoS("nf_conntrack_tcp_be_liberal set, not installing DROP rules for INVALID packets") 258 } 259 260 if initOnly { 261 klog.InfoS("System initialized and --init-only specified") 262 return nil, nil 263 } 264 265 // Generate the masquerade mark to use for SNAT rules. 
266 masqueradeValue := 1 << uint(masqueradeBit) 267 masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue) 268 klog.V(2).InfoS("Using iptables mark for masquerade", "ipFamily", ipt.Protocol(), "mark", masqueradeMark) 269 270 serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer) 271 272 proxier := &Proxier{ 273 ipFamily: ipFamily, 274 svcPortMap: make(proxy.ServicePortMap), 275 serviceChanges: proxy.NewServiceChangeTracker(newServiceInfo, ipFamily, recorder, nil), 276 endpointsMap: make(proxy.EndpointsMap), 277 endpointsChanges: proxy.NewEndpointsChangeTracker(hostname, newEndpointInfo, ipFamily, recorder, nil), 278 needFullSync: true, 279 syncPeriod: syncPeriod, 280 iptables: ipt, 281 masqueradeAll: masqueradeAll, 282 masqueradeMark: masqueradeMark, 283 exec: exec, 284 localDetector: localDetector, 285 hostname: hostname, 286 nodeIP: nodeIP, 287 recorder: recorder, 288 serviceHealthServer: serviceHealthServer, 289 healthzServer: healthzServer, 290 precomputedProbabilities: make([]string, 0, 1001), 291 iptablesData: bytes.NewBuffer(nil), 292 existingFilterChainsData: bytes.NewBuffer(nil), 293 filterChains: proxyutil.NewLineBuffer(), 294 filterRules: proxyutil.NewLineBuffer(), 295 natChains: proxyutil.NewLineBuffer(), 296 natRules: proxyutil.NewLineBuffer(), 297 localhostNodePorts: localhostNodePorts, 298 nodePortAddresses: nodePortAddresses, 299 networkInterfacer: proxyutil.RealNetwork{}, 300 conntrackTCPLiberal: conntrackTCPLiberal, 301 } 302 303 burstSyncs := 2 304 klog.V(2).InfoS("Iptables sync params", "ipFamily", ipt.Protocol(), "minSyncPeriod", minSyncPeriod, "syncPeriod", syncPeriod, "burstSyncs", burstSyncs) 305 // We pass syncPeriod to ipt.Monitor, which will call us only if it needs to. 306 // We need to pass *some* maxInterval to NewBoundedFrequencyRunner anyway though. 307 // time.Hour is arbitrary. 308 proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, time.Hour, burstSyncs) 309 310 go ipt.Monitor(kubeProxyCanaryChain, []utiliptables.Table{utiliptables.TableMangle, utiliptables.TableNAT, utiliptables.TableFilter}, 311 proxier.forceSyncProxyRules, syncPeriod, wait.NeverStop) 312 313 if ipt.HasRandomFully() { 314 klog.V(2).InfoS("Iptables supports --random-fully", "ipFamily", ipt.Protocol()) 315 } else { 316 klog.V(2).InfoS("Iptables does not support --random-fully", "ipFamily", ipt.Protocol()) 317 } 318 319 return proxier, nil 320 } 321 322 // NewDualStackProxier creates a MetaProxier instance, with IPv4 and IPv6 proxies. 
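// Illustrative note (not from the original source): with the default
// masqueradeBit of 14, masqueradeValue is 1<<14 = 0x4000 and masqueradeMark
// is its zero-padded hex form, so the generated rules refer to the mark as
// 0x4000 (for example, matching on the mark/mask pair 0x4000/0x4000).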
323 func NewDualStackProxier( 324 ipt [2]utiliptables.Interface, 325 sysctl utilsysctl.Interface, 326 exec utilexec.Interface, 327 syncPeriod time.Duration, 328 minSyncPeriod time.Duration, 329 masqueradeAll bool, 330 localhostNodePorts bool, 331 masqueradeBit int, 332 localDetectors [2]proxyutiliptables.LocalTrafficDetector, 333 hostname string, 334 nodeIPs map[v1.IPFamily]net.IP, 335 recorder events.EventRecorder, 336 healthzServer *healthcheck.ProxierHealthServer, 337 nodePortAddresses []string, 338 initOnly bool, 339 ) (proxy.Provider, error) { 340 // Create an ipv4 instance of the single-stack proxier 341 ipv4Proxier, err := NewProxier(v1.IPv4Protocol, ipt[0], sysctl, 342 exec, syncPeriod, minSyncPeriod, masqueradeAll, localhostNodePorts, masqueradeBit, localDetectors[0], hostname, 343 nodeIPs[v1.IPv4Protocol], recorder, healthzServer, nodePortAddresses, initOnly) 344 if err != nil { 345 return nil, fmt.Errorf("unable to create ipv4 proxier: %v", err) 346 } 347 348 ipv6Proxier, err := NewProxier(v1.IPv6Protocol, ipt[1], sysctl, 349 exec, syncPeriod, minSyncPeriod, masqueradeAll, false, masqueradeBit, localDetectors[1], hostname, 350 nodeIPs[v1.IPv6Protocol], recorder, healthzServer, nodePortAddresses, initOnly) 351 if err != nil { 352 return nil, fmt.Errorf("unable to create ipv6 proxier: %v", err) 353 } 354 if initOnly { 355 return nil, nil 356 } 357 return metaproxier.NewMetaProxier(ipv4Proxier, ipv6Proxier), nil 358 } 359 360 type iptablesJumpChain struct { 361 table utiliptables.Table 362 dstChain utiliptables.Chain 363 srcChain utiliptables.Chain 364 comment string 365 extraArgs []string 366 } 367 368 var iptablesJumpChains = []iptablesJumpChain{ 369 {utiliptables.TableFilter, kubeExternalServicesChain, utiliptables.ChainInput, "kubernetes externally-visible service portals", []string{"-m", "conntrack", "--ctstate", "NEW"}}, 370 {utiliptables.TableFilter, kubeExternalServicesChain, utiliptables.ChainForward, "kubernetes externally-visible service portals", []string{"-m", "conntrack", "--ctstate", "NEW"}}, 371 {utiliptables.TableFilter, kubeNodePortsChain, utiliptables.ChainInput, "kubernetes health check service ports", nil}, 372 {utiliptables.TableFilter, kubeServicesChain, utiliptables.ChainForward, "kubernetes service portals", []string{"-m", "conntrack", "--ctstate", "NEW"}}, 373 {utiliptables.TableFilter, kubeServicesChain, utiliptables.ChainOutput, "kubernetes service portals", []string{"-m", "conntrack", "--ctstate", "NEW"}}, 374 {utiliptables.TableFilter, kubeForwardChain, utiliptables.ChainForward, "kubernetes forwarding rules", nil}, 375 {utiliptables.TableFilter, kubeProxyFirewallChain, utiliptables.ChainInput, "kubernetes load balancer firewall", []string{"-m", "conntrack", "--ctstate", "NEW"}}, 376 {utiliptables.TableFilter, kubeProxyFirewallChain, utiliptables.ChainOutput, "kubernetes load balancer firewall", []string{"-m", "conntrack", "--ctstate", "NEW"}}, 377 {utiliptables.TableFilter, kubeProxyFirewallChain, utiliptables.ChainForward, "kubernetes load balancer firewall", []string{"-m", "conntrack", "--ctstate", "NEW"}}, 378 {utiliptables.TableNAT, kubeServicesChain, utiliptables.ChainOutput, "kubernetes service portals", nil}, 379 {utiliptables.TableNAT, kubeServicesChain, utiliptables.ChainPrerouting, "kubernetes service portals", nil}, 380 {utiliptables.TableNAT, kubePostroutingChain, utiliptables.ChainPostrouting, "kubernetes postrouting rules", nil}, 381 } 382 383 // Duplicates of chains created in pkg/kubelet/kubelet_network_linux.go; we create these 384 // 
on startup but do not delete them in CleanupLeftovers. 385 var iptablesKubeletJumpChains = []iptablesJumpChain{ 386 {utiliptables.TableFilter, kubeletFirewallChain, utiliptables.ChainInput, "", nil}, 387 {utiliptables.TableFilter, kubeletFirewallChain, utiliptables.ChainOutput, "", nil}, 388 } 389 390 // When chains get removed from iptablesJumpChains, add them here so they get cleaned up 391 // on upgrade. 392 var iptablesCleanupOnlyChains = []iptablesJumpChain{} 393 394 // CleanupLeftovers removes all iptables rules and chains created by the Proxier 395 // It returns true if an error was encountered. Errors are logged. 396 func CleanupLeftovers(ipt utiliptables.Interface) (encounteredError bool) { 397 // Unlink our chains 398 for _, jump := range append(iptablesJumpChains, iptablesCleanupOnlyChains...) { 399 args := append(jump.extraArgs, 400 "-m", "comment", "--comment", jump.comment, 401 "-j", string(jump.dstChain), 402 ) 403 if err := ipt.DeleteRule(jump.table, jump.srcChain, args...); err != nil { 404 if !utiliptables.IsNotFoundError(err) { 405 klog.ErrorS(err, "Error removing pure-iptables proxy rule") 406 encounteredError = true 407 } 408 } 409 } 410 411 // Flush and remove all of our "-t nat" chains. 412 iptablesData := bytes.NewBuffer(nil) 413 if err := ipt.SaveInto(utiliptables.TableNAT, iptablesData); err != nil { 414 klog.ErrorS(err, "Failed to execute iptables-save", "table", utiliptables.TableNAT) 415 encounteredError = true 416 } else { 417 existingNATChains := utiliptables.GetChainsFromTable(iptablesData.Bytes()) 418 natChains := proxyutil.NewLineBuffer() 419 natRules := proxyutil.NewLineBuffer() 420 natChains.Write("*nat") 421 // Start with chains we know we need to remove. 422 for _, chain := range []utiliptables.Chain{kubeServicesChain, kubeNodePortsChain, kubePostroutingChain} { 423 if _, found := existingNATChains[chain]; found { 424 chainString := string(chain) 425 natChains.Write(utiliptables.MakeChainLine(chain)) // flush 426 natRules.Write("-X", chainString) // delete 427 } 428 } 429 // Hunt for service and endpoint chains. 430 for chain := range existingNATChains { 431 chainString := string(chain) 432 if isServiceChainName(chainString) { 433 natChains.Write(utiliptables.MakeChainLine(chain)) // flush 434 natRules.Write("-X", chainString) // delete 435 } 436 } 437 natRules.Write("COMMIT") 438 natLines := append(natChains.Bytes(), natRules.Bytes()...) 439 // Write it. 440 err = ipt.Restore(utiliptables.TableNAT, natLines, utiliptables.NoFlushTables, utiliptables.RestoreCounters) 441 if err != nil { 442 klog.ErrorS(err, "Failed to execute iptables-restore", "table", utiliptables.TableNAT) 443 metrics.IptablesRestoreFailuresTotal.Inc() 444 encounteredError = true 445 } 446 } 447 448 // Flush and remove all of our "-t filter" chains. 
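// Illustrative sketch (not from the original source): for both the nat and
// filter tables, the cleanup builds an iptables-restore payload that
// redeclares each stale chain (which flushes it) and then deletes it, e.g.
//
//	*nat
//	:KUBE-SERVICES - [0:0]
//	-X KUBE-SERVICES
//	COMMIT
//
// Restoring with NoFlushTables leaves every chain not mentioned in the
// payload untouched.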
449 iptablesData.Reset() 450 if err := ipt.SaveInto(utiliptables.TableFilter, iptablesData); err != nil { 451 klog.ErrorS(err, "Failed to execute iptables-save", "table", utiliptables.TableFilter) 452 encounteredError = true 453 } else { 454 existingFilterChains := utiliptables.GetChainsFromTable(iptablesData.Bytes()) 455 filterChains := proxyutil.NewLineBuffer() 456 filterRules := proxyutil.NewLineBuffer() 457 filterChains.Write("*filter") 458 for _, chain := range []utiliptables.Chain{kubeServicesChain, kubeExternalServicesChain, kubeForwardChain, kubeNodePortsChain} { 459 if _, found := existingFilterChains[chain]; found { 460 chainString := string(chain) 461 filterChains.Write(utiliptables.MakeChainLine(chain)) 462 filterRules.Write("-X", chainString) 463 } 464 } 465 filterRules.Write("COMMIT") 466 filterLines := append(filterChains.Bytes(), filterRules.Bytes()...) 467 // Write it. 468 if err := ipt.Restore(utiliptables.TableFilter, filterLines, utiliptables.NoFlushTables, utiliptables.RestoreCounters); err != nil { 469 klog.ErrorS(err, "Failed to execute iptables-restore", "table", utiliptables.TableFilter) 470 metrics.IptablesRestoreFailuresTotal.Inc() 471 encounteredError = true 472 } 473 } 474 return encounteredError 475 } 476 477 func computeProbability(n int) string { 478 return fmt.Sprintf("%0.10f", 1.0/float64(n)) 479 } 480 481 // This assumes proxier.mu is held 482 func (proxier *Proxier) precomputeProbabilities(numberOfPrecomputed int) { 483 if len(proxier.precomputedProbabilities) == 0 { 484 proxier.precomputedProbabilities = append(proxier.precomputedProbabilities, "<bad value>") 485 } 486 for i := len(proxier.precomputedProbabilities); i <= numberOfPrecomputed; i++ { 487 proxier.precomputedProbabilities = append(proxier.precomputedProbabilities, computeProbability(i)) 488 } 489 } 490 491 // This assumes proxier.mu is held 492 func (proxier *Proxier) probability(n int) string { 493 if n >= len(proxier.precomputedProbabilities) { 494 proxier.precomputeProbabilities(n) 495 } 496 return proxier.precomputedProbabilities[n] 497 } 498 499 // Sync is called to synchronize the proxier state to iptables as soon as possible. 500 func (proxier *Proxier) Sync() { 501 if proxier.healthzServer != nil { 502 proxier.healthzServer.QueuedUpdate(proxier.ipFamily) 503 } 504 metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime() 505 proxier.syncRunner.Run() 506 } 507 508 // SyncLoop runs periodic work. This is expected to run as a goroutine or as the main loop of the app. It does not return. 509 func (proxier *Proxier) SyncLoop() { 510 // Update healthz timestamp at beginning in case Sync() never succeeds. 511 if proxier.healthzServer != nil { 512 proxier.healthzServer.Updated(proxier.ipFamily) 513 } 514 515 // synthesize "last change queued" time as the informers are syncing. 516 metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime() 517 proxier.syncRunner.Loop(wait.NeverStop) 518 } 519 520 func (proxier *Proxier) setInitialized(value bool) { 521 var initialized int32 522 if value { 523 initialized = 1 524 } 525 atomic.StoreInt32(&proxier.initialized, initialized) 526 } 527 528 func (proxier *Proxier) isInitialized() bool { 529 return atomic.LoadInt32(&proxier.initialized) > 0 530 } 531 532 // OnServiceAdd is called whenever creation of new service object 533 // is observed. 
534 func (proxier *Proxier) OnServiceAdd(service *v1.Service) { 535 proxier.OnServiceUpdate(nil, service) 536 } 537 538 // OnServiceUpdate is called whenever modification of an existing 539 // service object is observed. 540 func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) { 541 if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() { 542 proxier.Sync() 543 } 544 } 545 546 // OnServiceDelete is called whenever deletion of an existing service 547 // object is observed. 548 func (proxier *Proxier) OnServiceDelete(service *v1.Service) { 549 proxier.OnServiceUpdate(service, nil) 550 551 } 552 553 // OnServiceSynced is called once all the initial event handlers were 554 // called and the state is fully propagated to local cache. 555 func (proxier *Proxier) OnServiceSynced() { 556 proxier.mu.Lock() 557 proxier.servicesSynced = true 558 proxier.setInitialized(proxier.endpointSlicesSynced) 559 proxier.mu.Unlock() 560 561 // Sync unconditionally - this is called once per lifetime. 562 proxier.syncProxyRules() 563 } 564 565 // OnEndpointSliceAdd is called whenever creation of a new endpoint slice object 566 // is observed. 567 func (proxier *Proxier) OnEndpointSliceAdd(endpointSlice *discovery.EndpointSlice) { 568 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() { 569 proxier.Sync() 570 } 571 } 572 573 // OnEndpointSliceUpdate is called whenever modification of an existing endpoint 574 // slice object is observed. 575 func (proxier *Proxier) OnEndpointSliceUpdate(_, endpointSlice *discovery.EndpointSlice) { 576 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() { 577 proxier.Sync() 578 } 579 } 580 581 // OnEndpointSliceDelete is called whenever deletion of an existing endpoint slice 582 // object is observed. 583 func (proxier *Proxier) OnEndpointSliceDelete(endpointSlice *discovery.EndpointSlice) { 584 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, true) && proxier.isInitialized() { 585 proxier.Sync() 586 } 587 } 588 589 // OnEndpointSlicesSynced is called once all the initial event handlers were 590 // called and the state is fully propagated to local cache. 591 func (proxier *Proxier) OnEndpointSlicesSynced() { 592 proxier.mu.Lock() 593 proxier.endpointSlicesSynced = true 594 proxier.setInitialized(proxier.servicesSynced) 595 proxier.mu.Unlock() 596 597 // Sync unconditionally - this is called once per lifetime. 598 proxier.syncProxyRules() 599 } 600 601 // OnNodeAdd is called whenever creation of new node object 602 // is observed. 603 func (proxier *Proxier) OnNodeAdd(node *v1.Node) { 604 if node.Name != proxier.hostname { 605 klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node", 606 "eventNode", node.Name, "currentNode", proxier.hostname) 607 return 608 } 609 610 if reflect.DeepEqual(proxier.nodeLabels, node.Labels) { 611 return 612 } 613 614 proxier.mu.Lock() 615 proxier.nodeLabels = map[string]string{} 616 for k, v := range node.Labels { 617 proxier.nodeLabels[k] = v 618 } 619 proxier.needFullSync = true 620 proxier.mu.Unlock() 621 klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels) 622 623 proxier.Sync() 624 } 625 626 // OnNodeUpdate is called whenever modification of an existing 627 // node object is observed. 
628 func (proxier *Proxier) OnNodeUpdate(oldNode, node *v1.Node) { 629 if node.Name != proxier.hostname { 630 klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node", 631 "eventNode", node.Name, "currentNode", proxier.hostname) 632 return 633 } 634 635 if reflect.DeepEqual(proxier.nodeLabels, node.Labels) { 636 return 637 } 638 639 proxier.mu.Lock() 640 proxier.nodeLabels = map[string]string{} 641 for k, v := range node.Labels { 642 proxier.nodeLabels[k] = v 643 } 644 proxier.needFullSync = true 645 proxier.mu.Unlock() 646 klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels) 647 648 proxier.Sync() 649 } 650 651 // OnNodeDelete is called whenever deletion of an existing node 652 // object is observed. 653 func (proxier *Proxier) OnNodeDelete(node *v1.Node) { 654 if node.Name != proxier.hostname { 655 klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node", 656 "eventNode", node.Name, "currentNode", proxier.hostname) 657 return 658 } 659 660 proxier.mu.Lock() 661 proxier.nodeLabels = nil 662 proxier.needFullSync = true 663 proxier.mu.Unlock() 664 665 proxier.Sync() 666 } 667 668 // OnNodeSynced is called once all the initial event handlers were 669 // called and the state is fully propagated to local cache. 670 func (proxier *Proxier) OnNodeSynced() { 671 } 672 673 // portProtoHash takes the ServicePortName and protocol for a service 674 // returns the associated 16 character hash. This is computed by hashing (sha256) 675 // then encoding to base32 and truncating to 16 chars. We do this because IPTables 676 // Chain Names must be <= 28 chars long, and the longer they are the harder they are to read. 677 func portProtoHash(servicePortName string, protocol string) string { 678 hash := sha256.Sum256([]byte(servicePortName + protocol)) 679 encoded := base32.StdEncoding.EncodeToString(hash[:]) 680 return encoded[:16] 681 } 682 683 const ( 684 servicePortPolicyClusterChainNamePrefix = "KUBE-SVC-" 685 servicePortPolicyLocalChainNamePrefix = "KUBE-SVL-" 686 serviceFirewallChainNamePrefix = "KUBE-FW-" 687 serviceExternalChainNamePrefix = "KUBE-EXT-" 688 servicePortEndpointChainNamePrefix = "KUBE-SEP-" 689 ) 690 691 // servicePortPolicyClusterChain returns the name of the KUBE-SVC-XXXX chain for a service, which is the 692 // main iptables chain for that service, used for dispatching to endpoints when using `Cluster` 693 // traffic policy. 694 func servicePortPolicyClusterChain(servicePortName string, protocol string) utiliptables.Chain { 695 return utiliptables.Chain(servicePortPolicyClusterChainNamePrefix + portProtoHash(servicePortName, protocol)) 696 } 697 698 // servicePortPolicyLocalChainName returns the name of the KUBE-SVL-XXXX chain for a service, which 699 // handles dispatching to local endpoints when using `Local` traffic policy. This chain only 700 // exists if the service has `Local` internal or external traffic policy. 701 func servicePortPolicyLocalChainName(servicePortName string, protocol string) utiliptables.Chain { 702 return utiliptables.Chain(servicePortPolicyLocalChainNamePrefix + portProtoHash(servicePortName, protocol)) 703 } 704 705 // serviceFirewallChainName returns the name of the KUBE-FW-XXXX chain for a service, which 706 // is used to implement the filtering for the LoadBalancerSourceRanges feature. 
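// Illustrative example (hypothetical values, not from the original source):
// for the service port "ns1/web:http" over "tcp", portProtoHash hashes
// "ns1/web:httptcp" with SHA-256, base32-encodes the digest, and keeps the
// first 16 characters, yielding names such as "KUBE-SVC-ABCDEFGHIJKLMNOP"
// and "KUBE-SVL-ABCDEFGHIJKLMNOP" (25 characters, under the 28-character limit).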
707 func serviceFirewallChainName(servicePortName string, protocol string) utiliptables.Chain { 708 return utiliptables.Chain(serviceFirewallChainNamePrefix + portProtoHash(servicePortName, protocol)) 709 } 710 711 // serviceExternalChainName returns the name of the KUBE-EXT-XXXX chain for a service, which 712 // implements "short-circuiting" for internally-originated external-destination traffic when using 713 // `Local` external traffic policy. It forwards traffic from local sources to the KUBE-SVC-XXXX 714 // chain and traffic from external sources to the KUBE-SVL-XXXX chain. 715 func serviceExternalChainName(servicePortName string, protocol string) utiliptables.Chain { 716 return utiliptables.Chain(serviceExternalChainNamePrefix + portProtoHash(servicePortName, protocol)) 717 } 718 719 // servicePortEndpointChainName returns the name of the KUBE-SEP-XXXX chain for a particular 720 // service endpoint. 721 func servicePortEndpointChainName(servicePortName string, protocol string, endpoint string) utiliptables.Chain { 722 hash := sha256.Sum256([]byte(servicePortName + protocol + endpoint)) 723 encoded := base32.StdEncoding.EncodeToString(hash[:]) 724 return utiliptables.Chain(servicePortEndpointChainNamePrefix + encoded[:16]) 725 } 726 727 func isServiceChainName(chainString string) bool { 728 prefixes := []string{ 729 servicePortPolicyClusterChainNamePrefix, 730 servicePortPolicyLocalChainNamePrefix, 731 servicePortEndpointChainNamePrefix, 732 serviceFirewallChainNamePrefix, 733 serviceExternalChainNamePrefix, 734 } 735 736 for _, p := range prefixes { 737 if strings.HasPrefix(chainString, p) { 738 return true 739 } 740 } 741 return false 742 } 743 744 // Assumes proxier.mu is held. 745 func (proxier *Proxier) appendServiceCommentLocked(args []string, svcName string) []string { 746 // Not printing these comments, can reduce size of iptables (in case of large 747 // number of endpoints) even by 40%+. So if total number of endpoint chains 748 // is large enough, we simply drop those comments. 749 if proxier.largeClusterMode { 750 return args 751 } 752 return append(args, "-m", "comment", "--comment", svcName) 753 } 754 755 // Called by the iptables.Monitor, and in response to topology changes; this calls 756 // syncProxyRules() and tells it to resync all services, regardless of whether the 757 // Service or Endpoints/EndpointSlice objects themselves have changed 758 func (proxier *Proxier) forceSyncProxyRules() { 759 proxier.mu.Lock() 760 proxier.needFullSync = true 761 proxier.mu.Unlock() 762 763 proxier.syncProxyRules() 764 } 765 766 // This is where all of the iptables-save/restore calls happen. 767 // The only other iptables rules are those that are setup in iptablesInit() 768 // This assumes proxier.mu is NOT held 769 func (proxier *Proxier) syncProxyRules() { 770 proxier.mu.Lock() 771 defer proxier.mu.Unlock() 772 773 // don't sync rules till we've received services and endpoints 774 if !proxier.isInitialized() { 775 klog.V(2).InfoS("Not syncing iptables until Services and Endpoints have been received from master") 776 return 777 } 778 779 // The value of proxier.needFullSync may change before the defer funcs run, so 780 // we need to keep track of whether it was set at the *start* of the sync. 781 tryPartialSync := !proxier.needFullSync 782 783 // Keep track of how long syncs take. 
784 start := time.Now() 785 defer func() { 786 metrics.SyncProxyRulesLatency.Observe(metrics.SinceInSeconds(start)) 787 if tryPartialSync { 788 metrics.SyncPartialProxyRulesLatency.Observe(metrics.SinceInSeconds(start)) 789 } else { 790 metrics.SyncFullProxyRulesLatency.Observe(metrics.SinceInSeconds(start)) 791 } 792 klog.V(2).InfoS("SyncProxyRules complete", "elapsed", time.Since(start)) 793 }() 794 795 serviceUpdateResult := proxier.svcPortMap.Update(proxier.serviceChanges) 796 endpointUpdateResult := proxier.endpointsMap.Update(proxier.endpointsChanges) 797 798 klog.V(2).InfoS("Syncing iptables rules") 799 800 success := false 801 defer func() { 802 if !success { 803 klog.InfoS("Sync failed", "retryingTime", proxier.syncPeriod) 804 proxier.syncRunner.RetryAfter(proxier.syncPeriod) 805 if tryPartialSync { 806 metrics.IptablesPartialRestoreFailuresTotal.Inc() 807 } 808 // proxier.serviceChanges and proxier.endpointChanges have already 809 // been flushed, so we've lost the state needed to be able to do 810 // a partial sync. 811 proxier.needFullSync = true 812 } 813 }() 814 815 if !tryPartialSync { 816 // Ensure that our jump rules (eg from PREROUTING to KUBE-SERVICES) exist. 817 // We can't do this as part of the iptables-restore because we don't want 818 // to specify/replace *all* of the rules in PREROUTING, etc. 819 // 820 // We need to create these rules when kube-proxy first starts, and we need 821 // to recreate them if the utiliptables Monitor detects that iptables has 822 // been flushed. In both of those cases, the code will force a full sync. 823 // In all other cases, it ought to be safe to assume that the rules 824 // already exist, so we'll skip this step when doing a partial sync, to 825 // save us from having to invoke /sbin/iptables 20 times on each sync 826 // (which will be very slow on hosts with lots of iptables rules). 827 for _, jump := range append(iptablesJumpChains, iptablesKubeletJumpChains...) { 828 if _, err := proxier.iptables.EnsureChain(jump.table, jump.dstChain); err != nil { 829 klog.ErrorS(err, "Failed to ensure chain exists", "table", jump.table, "chain", jump.dstChain) 830 return 831 } 832 args := jump.extraArgs 833 if jump.comment != "" { 834 args = append(args, "-m", "comment", "--comment", jump.comment) 835 } 836 args = append(args, "-j", string(jump.dstChain)) 837 if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, jump.table, jump.srcChain, args...); err != nil { 838 klog.ErrorS(err, "Failed to ensure chain jumps", "table", jump.table, "srcChain", jump.srcChain, "dstChain", jump.dstChain) 839 return 840 } 841 } 842 } 843 844 // 845 // Below this point we will not return until we try to write the iptables rules. 846 // 847 848 // Reset all buffers used later. 849 // This is to avoid memory reallocations and thus improve performance. 
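// Illustrative sketch (not from the original source): the jump rules ensured
// above in the full-sync branch are roughly equivalent to
//
//	iptables -t nat -I PREROUTING -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
//	iptables -t nat -I OUTPUT -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
//	iptables -t filter -I FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes service portals" -j KUBE-SERVICES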
850 proxier.filterChains.Reset() 851 proxier.filterRules.Reset() 852 proxier.natChains.Reset() 853 proxier.natRules.Reset() 854 855 skippedNatChains := proxyutil.NewDiscardLineBuffer() 856 skippedNatRules := proxyutil.NewDiscardLineBuffer() 857 858 // Write chain lines for all the "top-level" chains we'll be filling in 859 for _, chainName := range []utiliptables.Chain{kubeServicesChain, kubeExternalServicesChain, kubeForwardChain, kubeNodePortsChain, kubeProxyFirewallChain} { 860 proxier.filterChains.Write(utiliptables.MakeChainLine(chainName)) 861 } 862 for _, chainName := range []utiliptables.Chain{kubeServicesChain, kubeNodePortsChain, kubePostroutingChain, kubeMarkMasqChain} { 863 proxier.natChains.Write(utiliptables.MakeChainLine(chainName)) 864 } 865 866 // Install the kubernetes-specific postrouting rules. We use a whole chain for 867 // this so that it is easier to flush and change, for example if the mark 868 // value should ever change. 869 870 proxier.natRules.Write( 871 "-A", string(kubePostroutingChain), 872 "-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark), 873 "-j", "RETURN", 874 ) 875 // Clear the mark to avoid re-masquerading if the packet re-traverses the network stack. 876 proxier.natRules.Write( 877 "-A", string(kubePostroutingChain), 878 "-j", "MARK", "--xor-mark", proxier.masqueradeMark, 879 ) 880 masqRule := []string{ 881 "-A", string(kubePostroutingChain), 882 "-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`, 883 "-j", "MASQUERADE", 884 } 885 if proxier.iptables.HasRandomFully() { 886 masqRule = append(masqRule, "--random-fully") 887 } 888 proxier.natRules.Write(masqRule) 889 890 // Install the kubernetes-specific masquerade mark rule. We use a whole chain for 891 // this so that it is easier to flush and change, for example if the mark 892 // value should ever change. 893 proxier.natRules.Write( 894 "-A", string(kubeMarkMasqChain), 895 "-j", "MARK", "--or-mark", proxier.masqueradeMark, 896 ) 897 898 isIPv6 := proxier.iptables.IsIPv6() 899 if !isIPv6 && proxier.localhostNodePorts { 900 // Kube-proxy's use of `route_localnet` to enable NodePorts on localhost 901 // creates a security hole (https://issue.k8s.io/90259) which this 902 // iptables rule mitigates. 903 904 // NOTE: kubelet creates an identical copy of this rule. If you want to 905 // change this rule in the future, you MUST do so in a way that will 906 // interoperate correctly with skewed versions of the rule created by 907 // kubelet. (Actually, kubelet uses "--dst"/"--src" rather than "-d"/"-s" 908 // but that's just a command-line thing and results in the same rule being 909 // created in the kernel.) 910 proxier.filterChains.Write(utiliptables.MakeChainLine(kubeletFirewallChain)) 911 proxier.filterRules.Write( 912 "-A", string(kubeletFirewallChain), 913 "-m", "comment", "--comment", `"block incoming localnet connections"`, 914 "-d", "127.0.0.0/8", 915 "!", "-s", "127.0.0.0/8", 916 "-m", "conntrack", 917 "!", "--ctstate", "RELATED,ESTABLISHED,DNAT", 918 "-j", "DROP", 919 ) 920 } 921 922 // Accumulate NAT chains to keep. 923 activeNATChains := map[utiliptables.Chain]bool{} // use a map as a set 924 925 // To avoid growing this slice, we arbitrarily set its size to 64, 926 // there is never more than that many arguments for a single line. 927 // Note that even if we go over 64, it will still be correct - it 928 // is just for efficiency, not correctness. 
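// Illustrative rendering (not from the original source; the mark value is
// shown as 0x4000 and --random-fully is assumed to be supported): the
// KUBE-POSTROUTING and KUBE-MARK-MASQ rules written above amount to
//
//	-A KUBE-POSTROUTING -m mark ! --mark 0x4000/0x4000 -j RETURN
//	-A KUBE-POSTROUTING -j MARK --xor-mark 0x4000
//	-A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -j MASQUERADE --random-fully
//	-A KUBE-MARK-MASQ -j MARK --or-mark 0x4000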
929 args := make([]string, 64) 930 931 // Compute total number of endpoint chains across all services 932 // to get a sense of how big the cluster is. 933 totalEndpoints := 0 934 for svcName := range proxier.svcPortMap { 935 totalEndpoints += len(proxier.endpointsMap[svcName]) 936 } 937 proxier.largeClusterMode = (totalEndpoints > largeClusterEndpointsThreshold) 938 939 // These two variables are used to publish the sync_proxy_rules_no_endpoints_total 940 // metric. 941 serviceNoLocalEndpointsTotalInternal := 0 942 serviceNoLocalEndpointsTotalExternal := 0 943 944 // Build rules for each service-port. 945 for svcName, svc := range proxier.svcPortMap { 946 svcInfo, ok := svc.(*servicePortInfo) 947 if !ok { 948 klog.ErrorS(nil, "Failed to cast serviceInfo", "serviceName", svcName) 949 continue 950 } 951 protocol := strings.ToLower(string(svcInfo.Protocol())) 952 svcPortNameString := svcInfo.nameString 953 954 // Figure out the endpoints for Cluster and Local traffic policy. 955 // allLocallyReachableEndpoints is the set of all endpoints that can be routed to 956 // from this node, given the service's traffic policies. hasEndpoints is true 957 // if the service has any usable endpoints on any node, not just this one. 958 allEndpoints := proxier.endpointsMap[svcName] 959 clusterEndpoints, localEndpoints, allLocallyReachableEndpoints, hasEndpoints := proxy.CategorizeEndpoints(allEndpoints, svcInfo, proxier.nodeLabels) 960 961 // Note the endpoint chains that will be used 962 for _, ep := range allLocallyReachableEndpoints { 963 if epInfo, ok := ep.(*endpointInfo); ok { 964 activeNATChains[epInfo.ChainName] = true 965 } 966 } 967 968 // clusterPolicyChain contains the endpoints used with "Cluster" traffic policy 969 clusterPolicyChain := svcInfo.clusterPolicyChainName 970 usesClusterPolicyChain := len(clusterEndpoints) > 0 && svcInfo.UsesClusterEndpoints() 971 if usesClusterPolicyChain { 972 activeNATChains[clusterPolicyChain] = true 973 } 974 975 // localPolicyChain contains the endpoints used with "Local" traffic policy 976 localPolicyChain := svcInfo.localPolicyChainName 977 usesLocalPolicyChain := len(localEndpoints) > 0 && svcInfo.UsesLocalEndpoints() 978 if usesLocalPolicyChain { 979 activeNATChains[localPolicyChain] = true 980 } 981 982 // internalPolicyChain is the chain containing the endpoints for 983 // "internal" (ClusterIP) traffic. internalTrafficChain is the chain that 984 // internal traffic is routed to (which is always the same as 985 // internalPolicyChain). hasInternalEndpoints is true if we should 986 // generate rules pointing to internalTrafficChain, or false if there are 987 // no available internal endpoints. 988 internalPolicyChain := clusterPolicyChain 989 hasInternalEndpoints := hasEndpoints 990 if svcInfo.InternalPolicyLocal() { 991 internalPolicyChain = localPolicyChain 992 if len(localEndpoints) == 0 { 993 hasInternalEndpoints = false 994 } 995 } 996 internalTrafficChain := internalPolicyChain 997 998 // Similarly, externalPolicyChain is the chain containing the endpoints 999 // for "external" (NodePort, LoadBalancer, and ExternalIP) traffic. 1000 // externalTrafficChain is the chain that external traffic is routed to 1001 // (which is always the service's "EXT" chain). hasExternalEndpoints is 1002 // true if there are endpoints that will be reached by external traffic. 1003 // (But we may still have to generate externalTrafficChain even if there 1004 // are no external endpoints, to ensure that the short-circuit rules for 1005 // local traffic are set up.) 
1006 externalPolicyChain := clusterPolicyChain 1007 hasExternalEndpoints := hasEndpoints 1008 if svcInfo.ExternalPolicyLocal() { 1009 externalPolicyChain = localPolicyChain 1010 if len(localEndpoints) == 0 { 1011 hasExternalEndpoints = false 1012 } 1013 } 1014 externalTrafficChain := svcInfo.externalChainName // eventually jumps to externalPolicyChain 1015 1016 // usesExternalTrafficChain is based on hasEndpoints, not hasExternalEndpoints, 1017 // because we need the local-traffic-short-circuiting rules even when there 1018 // are no externally-usable endpoints. 1019 usesExternalTrafficChain := hasEndpoints && svcInfo.ExternallyAccessible() 1020 if usesExternalTrafficChain { 1021 activeNATChains[externalTrafficChain] = true 1022 } 1023 1024 // Traffic to LoadBalancer IPs can go directly to externalTrafficChain 1025 // unless LoadBalancerSourceRanges is in use in which case we will 1026 // create a firewall chain. 1027 loadBalancerTrafficChain := externalTrafficChain 1028 fwChain := svcInfo.firewallChainName 1029 usesFWChain := hasEndpoints && len(svcInfo.LoadBalancerVIPStrings()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0 1030 if usesFWChain { 1031 activeNATChains[fwChain] = true 1032 loadBalancerTrafficChain = fwChain 1033 } 1034 1035 var internalTrafficFilterTarget, internalTrafficFilterComment string 1036 var externalTrafficFilterTarget, externalTrafficFilterComment string 1037 if !hasEndpoints { 1038 // The service has no endpoints at all; hasInternalEndpoints and 1039 // hasExternalEndpoints will also be false, and we will not 1040 // generate any chains in the "nat" table for the service; only 1041 // rules in the "filter" table rejecting incoming packets for 1042 // the service's IPs. 1043 internalTrafficFilterTarget = "REJECT" 1044 internalTrafficFilterComment = fmt.Sprintf(`"%s has no endpoints"`, svcPortNameString) 1045 externalTrafficFilterTarget = "REJECT" 1046 externalTrafficFilterComment = internalTrafficFilterComment 1047 } else { 1048 if !hasInternalEndpoints { 1049 // The internalTrafficPolicy is "Local" but there are no local 1050 // endpoints. Traffic to the clusterIP will be dropped, but 1051 // external traffic may still be accepted. 1052 internalTrafficFilterTarget = "DROP" 1053 internalTrafficFilterComment = fmt.Sprintf(`"%s has no local endpoints"`, svcPortNameString) 1054 serviceNoLocalEndpointsTotalInternal++ 1055 } 1056 if !hasExternalEndpoints { 1057 // The externalTrafficPolicy is "Local" but there are no 1058 // local endpoints. Traffic to "external" IPs from outside 1059 // the cluster will be dropped, but traffic from inside 1060 // the cluster may still be accepted. 1061 externalTrafficFilterTarget = "DROP" 1062 externalTrafficFilterComment = fmt.Sprintf(`"%s has no local endpoints"`, svcPortNameString) 1063 serviceNoLocalEndpointsTotalExternal++ 1064 } 1065 } 1066 1067 filterRules := proxier.filterRules 1068 natChains := proxier.natChains 1069 natRules := proxier.natRules 1070 1071 // Capture the clusterIP. 1072 if hasInternalEndpoints { 1073 natRules.Write( 1074 "-A", string(kubeServicesChain), 1075 "-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcPortNameString), 1076 "-m", protocol, "-p", protocol, 1077 "-d", svcInfo.ClusterIP().String(), 1078 "--dport", strconv.Itoa(svcInfo.Port()), 1079 "-j", string(internalTrafficChain)) 1080 } else { 1081 // No endpoints. 
1082 filterRules.Write( 1083 "-A", string(kubeServicesChain), 1084 "-m", "comment", "--comment", internalTrafficFilterComment, 1085 "-m", protocol, "-p", protocol, 1086 "-d", svcInfo.ClusterIP().String(), 1087 "--dport", strconv.Itoa(svcInfo.Port()), 1088 "-j", internalTrafficFilterTarget, 1089 ) 1090 } 1091 1092 // Capture externalIPs. 1093 for _, externalIP := range svcInfo.ExternalIPStrings() { 1094 if hasEndpoints { 1095 // Send traffic bound for external IPs to the "external 1096 // destinations" chain. 1097 natRules.Write( 1098 "-A", string(kubeServicesChain), 1099 "-m", "comment", "--comment", fmt.Sprintf(`"%s external IP"`, svcPortNameString), 1100 "-m", protocol, "-p", protocol, 1101 "-d", externalIP, 1102 "--dport", strconv.Itoa(svcInfo.Port()), 1103 "-j", string(externalTrafficChain)) 1104 } 1105 if !hasExternalEndpoints { 1106 // Either no endpoints at all (REJECT) or no endpoints for 1107 // external traffic (DROP anything that didn't get 1108 // short-circuited by the EXT chain.) 1109 filterRules.Write( 1110 "-A", string(kubeExternalServicesChain), 1111 "-m", "comment", "--comment", externalTrafficFilterComment, 1112 "-m", protocol, "-p", protocol, 1113 "-d", externalIP, 1114 "--dport", strconv.Itoa(svcInfo.Port()), 1115 "-j", externalTrafficFilterTarget, 1116 ) 1117 } 1118 } 1119 1120 // Capture load-balancer ingress. 1121 for _, lbip := range svcInfo.LoadBalancerVIPStrings() { 1122 if hasEndpoints { 1123 natRules.Write( 1124 "-A", string(kubeServicesChain), 1125 "-m", "comment", "--comment", fmt.Sprintf(`"%s loadbalancer IP"`, svcPortNameString), 1126 "-m", protocol, "-p", protocol, 1127 "-d", lbip, 1128 "--dport", strconv.Itoa(svcInfo.Port()), 1129 "-j", string(loadBalancerTrafficChain)) 1130 1131 } 1132 if usesFWChain { 1133 filterRules.Write( 1134 "-A", string(kubeProxyFirewallChain), 1135 "-m", "comment", "--comment", fmt.Sprintf(`"%s traffic not accepted by %s"`, svcPortNameString, svcInfo.firewallChainName), 1136 "-m", protocol, "-p", protocol, 1137 "-d", lbip, 1138 "--dport", strconv.Itoa(svcInfo.Port()), 1139 "-j", "DROP") 1140 } 1141 } 1142 if !hasExternalEndpoints { 1143 // Either no endpoints at all (REJECT) or no endpoints for 1144 // external traffic (DROP anything that didn't get short-circuited 1145 // by the EXT chain.) 1146 for _, lbip := range svcInfo.LoadBalancerVIPStrings() { 1147 filterRules.Write( 1148 "-A", string(kubeExternalServicesChain), 1149 "-m", "comment", "--comment", externalTrafficFilterComment, 1150 "-m", protocol, "-p", protocol, 1151 "-d", lbip, 1152 "--dport", strconv.Itoa(svcInfo.Port()), 1153 "-j", externalTrafficFilterTarget, 1154 ) 1155 } 1156 } 1157 1158 // Capture nodeports. 1159 if svcInfo.NodePort() != 0 { 1160 if hasEndpoints { 1161 // Jump to the external destination chain. For better or for 1162 // worse, nodeports are not subject to loadBalancerSourceRanges, 1163 // and we can't change that. 1164 natRules.Write( 1165 "-A", string(kubeNodePortsChain), 1166 "-m", "comment", "--comment", svcPortNameString, 1167 "-m", protocol, "-p", protocol, 1168 "--dport", strconv.Itoa(svcInfo.NodePort()), 1169 "-j", string(externalTrafficChain)) 1170 } 1171 if !hasExternalEndpoints { 1172 // Either no endpoints at all (REJECT) or no endpoints for 1173 // external traffic (DROP anything that didn't get 1174 // short-circuited by the EXT chain.)
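// Illustrative rendering (hypothetical service, not from the original source):
// for "ns1/web:http" with ClusterIP 10.96.0.10, port 80/TCP and NodePort 30080,
// the capture rules written above look roughly like
//
//	-A KUBE-SERVICES -m comment --comment "ns1/web:http cluster IP" -m tcp -p tcp -d 10.96.0.10 --dport 80 -j KUBE-SVC-XXXXXXXXXXXXXXXX
//	-A KUBE-NODEPORTS -m comment --comment ns1/web:http -m tcp -p tcp --dport 30080 -j KUBE-EXT-XXXXXXXXXXXXXXXX
//
// while a service with no endpoints at all gets only a filter-table rule such as
//
//	-A KUBE-SERVICES -m comment --comment "ns1/web:http has no endpoints" -m tcp -p tcp -d 10.96.0.10 --dport 80 -j REJECT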
1175 filterRules.Write( 1176 "-A", string(kubeExternalServicesChain), 1177 "-m", "comment", "--comment", externalTrafficFilterComment, 1178 "-m", "addrtype", "--dst-type", "LOCAL", 1179 "-m", protocol, "-p", protocol, 1180 "--dport", strconv.Itoa(svcInfo.NodePort()), 1181 "-j", externalTrafficFilterTarget, 1182 ) 1183 } 1184 } 1185 1186 // Capture healthCheckNodePorts. 1187 if svcInfo.HealthCheckNodePort() != 0 { 1188 // no matter if node has local endpoints, healthCheckNodePorts 1189 // need to add a rule to accept the incoming connection 1190 filterRules.Write( 1191 "-A", string(kubeNodePortsChain), 1192 "-m", "comment", "--comment", fmt.Sprintf(`"%s health check node port"`, svcPortNameString), 1193 "-m", "tcp", "-p", "tcp", 1194 "--dport", strconv.Itoa(svcInfo.HealthCheckNodePort()), 1195 "-j", "ACCEPT", 1196 ) 1197 } 1198 1199 // If the SVC/SVL/EXT/FW/SEP chains have not changed since the last sync 1200 // then we can omit them from the restore input. However, we have to still 1201 // figure out how many chains we _would_ have written, to make the metrics 1202 // come out right, so we just compute them and throw them away. 1203 if tryPartialSync && !serviceUpdateResult.UpdatedServices.Has(svcName.NamespacedName) && !endpointUpdateResult.UpdatedServices.Has(svcName.NamespacedName) { 1204 natChains = skippedNatChains 1205 natRules = skippedNatRules 1206 } 1207 1208 // Set up internal traffic handling. 1209 if hasInternalEndpoints { 1210 args = append(args[:0], 1211 "-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcPortNameString), 1212 "-m", protocol, "-p", protocol, 1213 "-d", svcInfo.ClusterIP().String(), 1214 "--dport", strconv.Itoa(svcInfo.Port()), 1215 ) 1216 if proxier.masqueradeAll { 1217 natRules.Write( 1218 "-A", string(internalTrafficChain), 1219 args, 1220 "-j", string(kubeMarkMasqChain)) 1221 } else if proxier.localDetector.IsImplemented() { 1222 // This masquerades off-cluster traffic to a service VIP. The 1223 // idea is that you can establish a static route for your 1224 // Service range, routing to any node, and that node will 1225 // bridge into the Service for you. Since that might bounce 1226 // off-node, we masquerade here. 1227 natRules.Write( 1228 "-A", string(internalTrafficChain), 1229 args, 1230 proxier.localDetector.IfNotLocal(), 1231 "-j", string(kubeMarkMasqChain)) 1232 } 1233 } 1234 1235 // Set up external traffic handling (if any "external" destinations are 1236 // enabled). All captured traffic for all external destinations should 1237 // jump to externalTrafficChain, which will handle some special cases and 1238 // then jump to externalPolicyChain. 1239 if usesExternalTrafficChain { 1240 natChains.Write(utiliptables.MakeChainLine(externalTrafficChain)) 1241 1242 if !svcInfo.ExternalPolicyLocal() { 1243 // If we are using non-local endpoints we need to masquerade, 1244 // in case we cross nodes. 1245 natRules.Write( 1246 "-A", string(externalTrafficChain), 1247 "-m", "comment", "--comment", fmt.Sprintf(`"masquerade traffic for %s external destinations"`, svcPortNameString), 1248 "-j", string(kubeMarkMasqChain)) 1249 } else { 1250 // If we are only using same-node endpoints, we can retain the 1251 // source IP in most cases. 1252 1253 if proxier.localDetector.IsImplemented() { 1254 // Treat all locally-originated pod -> external destination 1255 // traffic as a special-case. It is subject to neither 1256 // form of traffic policy, which simulates going up-and-out 1257 // to an external load-balancer and coming back in. 
1258 natRules.Write( 1259 "-A", string(externalTrafficChain), 1260 "-m", "comment", "--comment", fmt.Sprintf(`"pod traffic for %s external destinations"`, svcPortNameString), 1261 proxier.localDetector.IfLocal(), 1262 "-j", string(clusterPolicyChain)) 1263 } 1264 1265 // Locally originated traffic (not a pod, but the host node) 1266 // still needs masquerade because the LBIP itself is a local 1267 // address, so that will be the chosen source IP. 1268 natRules.Write( 1269 "-A", string(externalTrafficChain), 1270 "-m", "comment", "--comment", fmt.Sprintf(`"masquerade LOCAL traffic for %s external destinations"`, svcPortNameString), 1271 "-m", "addrtype", "--src-type", "LOCAL", 1272 "-j", string(kubeMarkMasqChain)) 1273 1274 // Redirect all src-type=LOCAL -> external destination to the 1275 // policy=cluster chain. This allows traffic originating 1276 // from the host to be redirected to the service correctly. 1277 natRules.Write( 1278 "-A", string(externalTrafficChain), 1279 "-m", "comment", "--comment", fmt.Sprintf(`"route LOCAL traffic for %s external destinations"`, svcPortNameString), 1280 "-m", "addrtype", "--src-type", "LOCAL", 1281 "-j", string(clusterPolicyChain)) 1282 } 1283 1284 // Anything else falls thru to the appropriate policy chain. 1285 if hasExternalEndpoints { 1286 natRules.Write( 1287 "-A", string(externalTrafficChain), 1288 "-j", string(externalPolicyChain)) 1289 } 1290 } 1291 1292 // Set up firewall chain, if needed 1293 if usesFWChain { 1294 natChains.Write(utiliptables.MakeChainLine(fwChain)) 1295 1296 // The service firewall rules are created based on the 1297 // loadBalancerSourceRanges field. This only works for VIP-like 1298 // loadbalancers that preserve source IPs. For loadbalancers which 1299 // direct traffic to service NodePort, the firewall rules will not 1300 // apply. 1301 args = append(args[:0], 1302 "-A", string(fwChain), 1303 "-m", "comment", "--comment", fmt.Sprintf(`"%s loadbalancer IP"`, svcPortNameString), 1304 ) 1305 1306 // firewall filter based on each source range 1307 allowFromNode := false 1308 for _, src := range svcInfo.LoadBalancerSourceRanges() { 1309 natRules.Write(args, "-s", src, "-j", string(externalTrafficChain)) 1310 _, cidr, err := netutils.ParseCIDRSloppy(src) 1311 if err != nil { 1312 klog.ErrorS(err, "Error parsing CIDR in LoadBalancerSourceRanges, dropping it", "cidr", cidr) 1313 } else if cidr.Contains(proxier.nodeIP) { 1314 allowFromNode = true 1315 } 1316 } 1317 // For VIP-like LBs, the VIP is often added as a local 1318 // address (via an IP route rule). In that case, a request 1319 // from a node to the VIP will not hit the loadbalancer but 1320 // will loop back with the source IP set to the VIP. We 1321 // need the following rules to allow requests from this node. 1322 if allowFromNode { 1323 for _, lbip := range svcInfo.LoadBalancerVIPStrings() { 1324 natRules.Write( 1325 args, 1326 "-s", lbip, 1327 "-j", string(externalTrafficChain)) 1328 } 1329 } 1330 // If the packet was able to reach the end of firewall chain, 1331 // then it did not get DNATed, so it will match the 1332 // corresponding KUBE-PROXY-FIREWALL rule. 
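// Illustrative rendering (hypothetical names, assuming a local-traffic detector
// configured with pod CIDR 10.244.0.0/16; not from the original source): for a
// service with externalTrafficPolicy=Local, the KUBE-EXT chain built above
// renders roughly as
//
//	-A KUBE-EXT-XXXXXXXXXXXXXXXX -m comment --comment "pod traffic for ns1/web:http external destinations" -s 10.244.0.0/16 -j KUBE-SVC-XXXXXXXXXXXXXXXX
//	-A KUBE-EXT-XXXXXXXXXXXXXXXX -m comment --comment "masquerade LOCAL traffic for ns1/web:http external destinations" -m addrtype --src-type LOCAL -j KUBE-MARK-MASQ
//	-A KUBE-EXT-XXXXXXXXXXXXXXXX -m comment --comment "route LOCAL traffic for ns1/web:http external destinations" -m addrtype --src-type LOCAL -j KUBE-SVC-XXXXXXXXXXXXXXXX
//	-A KUBE-EXT-XXXXXXXXXXXXXXXX -j KUBE-SVL-XXXXXXXXXXXXXXXX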
1333 natRules.Write( 1334 "-A", string(fwChain), 1335 "-m", "comment", "--comment", fmt.Sprintf(`"other traffic to %s will be dropped by KUBE-PROXY-FIREWALL"`, svcPortNameString), 1336 ) 1337 } 1338 1339 // If Cluster policy is in use, create the chain and create rules jumping 1340 // from clusterPolicyChain to the clusterEndpoints 1341 if usesClusterPolicyChain { 1342 natChains.Write(utiliptables.MakeChainLine(clusterPolicyChain)) 1343 proxier.writeServiceToEndpointRules(natRules, svcPortNameString, svcInfo, clusterPolicyChain, clusterEndpoints, args) 1344 } 1345 1346 // If Local policy is in use, create the chain and create rules jumping 1347 // from localPolicyChain to the localEndpoints 1348 if usesLocalPolicyChain { 1349 natChains.Write(utiliptables.MakeChainLine(localPolicyChain)) 1350 proxier.writeServiceToEndpointRules(natRules, svcPortNameString, svcInfo, localPolicyChain, localEndpoints, args) 1351 } 1352 1353 // Generate the per-endpoint chains. 1354 for _, ep := range allLocallyReachableEndpoints { 1355 epInfo, ok := ep.(*endpointInfo) 1356 if !ok { 1357 klog.ErrorS(nil, "Failed to cast endpointInfo", "endpointInfo", ep) 1358 continue 1359 } 1360 1361 endpointChain := epInfo.ChainName 1362 1363 // Create the endpoint chain 1364 natChains.Write(utiliptables.MakeChainLine(endpointChain)) 1365 activeNATChains[endpointChain] = true 1366 1367 args = append(args[:0], "-A", string(endpointChain)) 1368 args = proxier.appendServiceCommentLocked(args, svcPortNameString) 1369 // Handle traffic that loops back to the originator with SNAT. 1370 natRules.Write( 1371 args, 1372 "-s", epInfo.IP(), 1373 "-j", string(kubeMarkMasqChain)) 1374 // Update client-affinity lists. 1375 if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP { 1376 args = append(args, "-m", "recent", "--name", string(endpointChain), "--set") 1377 } 1378 // DNAT to final destination. 1379 args = append(args, "-m", protocol, "-p", protocol, "-j", "DNAT", "--to-destination", epInfo.String()) 1380 natRules.Write(args) 1381 } 1382 } 1383 1384 // Delete chains no longer in use. Since "iptables-save" can take several seconds 1385 // to run on hosts with lots of iptables rules, we don't bother to do this on 1386 // every sync in large clusters. (Stale chains will not be referenced by any 1387 // active rules, so they're harmless other than taking up memory.) 1388 deletedChains := 0 1389 if !proxier.largeClusterMode || time.Since(proxier.lastIPTablesCleanup) > proxier.syncPeriod { 1390 var existingNATChains map[utiliptables.Chain]struct{} 1391 1392 proxier.iptablesData.Reset() 1393 if err := proxier.iptables.SaveInto(utiliptables.TableNAT, proxier.iptablesData); err == nil { 1394 existingNATChains = utiliptables.GetChainsFromTable(proxier.iptablesData.Bytes()) 1395 1396 for chain := range existingNATChains { 1397 if !activeNATChains[chain] { 1398 chainString := string(chain) 1399 if !isServiceChainName(chainString) { 1400 // Ignore chains that aren't ours. 1401 continue 1402 } 1403 // We must (as per iptables) write a chain-line 1404 // for it, which has the nice effect of flushing 1405 // the chain. Then we can remove the chain. 1406 proxier.natChains.Write(utiliptables.MakeChainLine(chain)) 1407 proxier.natRules.Write("-X", chainString) 1408 deletedChains++ 1409 } 1410 } 1411 proxier.lastIPTablesCleanup = time.Now() 1412 } else { 1413 klog.ErrorS(err, "Failed to execute iptables-save: stale chains will not be deleted") 1414 } 1415 } 1416 1417 // Finally, tail-call to the nodePorts chain. 
1417     // Finally, tail-call to the nodePorts chain. This needs to be after all
1418     // other service portal rules.
1419     if proxier.nodePortAddresses.MatchAll() {
1420         destinations := []string{"-m", "addrtype", "--dst-type", "LOCAL"}
1421         // Block localhost nodePorts if they are not supported. (For IPv6 they never
1422         // work, and for IPv4 they only work if we previously set `route_localnet`.)
1423         if isIPv6 {
1424             destinations = append(destinations, "!", "-d", "::1/128")
1425         } else if !proxier.localhostNodePorts {
1426             destinations = append(destinations, "!", "-d", "127.0.0.0/8")
1427         }
1428 
1429         proxier.natRules.Write(
1430             "-A", string(kubeServicesChain),
1431             "-m", "comment", "--comment", `"kubernetes service nodeports; NOTE: this must be the last rule in this chain"`,
1432             destinations,
1433             "-j", string(kubeNodePortsChain))
1434     } else {
1435         nodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer)
1436         if err != nil {
1437             klog.ErrorS(err, "Failed to get node ip address matching nodeport cidrs, services with nodeport may not work as intended", "CIDRs", proxier.nodePortAddresses)
1438         }
1439         for _, ip := range nodeIPs {
1440             if ip.IsLoopback() {
1441                 if isIPv6 {
1442                     klog.ErrorS(nil, "--nodeport-addresses includes localhost but localhost NodePorts are not supported on IPv6", "address", ip.String())
1443                     continue
1444                 } else if !proxier.localhostNodePorts {
1445                     klog.ErrorS(nil, "--nodeport-addresses includes localhost but --iptables-localhost-nodeports=false was passed", "address", ip.String())
1446                     continue
1447                 }
1448             }
1449 
1450             // create nodeport rules for each IP one by one
1451             proxier.natRules.Write(
1452                 "-A", string(kubeServicesChain),
1453                 "-m", "comment", "--comment", `"kubernetes service nodeports; NOTE: this must be the last rule in this chain"`,
1454                 "-d", ip.String(),
1455                 "-j", string(kubeNodePortsChain))
1456         }
1457     }
1458 
1459     // Drop the packets in INVALID state, which would potentially cause
1460     // unexpected connection reset if nf_conntrack_tcp_be_liberal is not set.
1461     // Ref: https://github.com/kubernetes/kubernetes/issues/74839
1462     // Ref: https://github.com/kubernetes/kubernetes/issues/117924
1463     if !proxier.conntrackTCPLiberal {
1464         proxier.filterRules.Write(
1465             "-A", string(kubeForwardChain),
1466             "-m", "conntrack",
1467             "--ctstate", "INVALID",
1468             "-j", "DROP",
1469         )
1470     }
1471 
1472     // If the masqueradeMark has been added, we want to forward that same
1473     // traffic; this allows NodePort traffic to be forwarded even if the default
1474     // FORWARD policy is not ACCEPT.
1475     proxier.filterRules.Write(
1476         "-A", string(kubeForwardChain),
1477         "-m", "comment", "--comment", `"kubernetes forwarding rules"`,
1478         "-m", "mark", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
1479         "-j", "ACCEPT",
1480     )
1481 
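    // For reference, assuming the default masquerade bit of 14 (mark 0x4000; the
    // mark value is a configuration assumption, not fixed here), the two
    // KUBE-FORWARD rules above render roughly as:
    //   -A KUBE-FORWARD -m conntrack --ctstate INVALID -j DROP
    //   -A KUBE-FORWARD -m comment --comment "kubernetes forwarding rules" -m mark --mark 0x4000/0x4000 -j ACCEPT
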
    // The following rule ensures that traffic belonging to connections whose
    // initial packet was accepted by the "kubernetes forwarding rules" rule
    // above is also accepted.
1484     proxier.filterRules.Write(
1485         "-A", string(kubeForwardChain),
1486         "-m", "comment", "--comment", `"kubernetes forwarding conntrack rule"`,
1487         "-m", "conntrack",
1488         "--ctstate", "RELATED,ESTABLISHED",
1489         "-j", "ACCEPT",
1490     )
1491 
1492     metrics.IptablesRulesTotal.WithLabelValues(string(utiliptables.TableFilter)).Set(float64(proxier.filterRules.Lines()))
1493     metrics.IptablesRulesLastSync.WithLabelValues(string(utiliptables.TableFilter)).Set(float64(proxier.filterRules.Lines()))
1494     metrics.IptablesRulesTotal.WithLabelValues(string(utiliptables.TableNAT)).Set(float64(proxier.natRules.Lines() + skippedNatRules.Lines() - deletedChains))
1495     metrics.IptablesRulesLastSync.WithLabelValues(string(utiliptables.TableNAT)).Set(float64(proxier.natRules.Lines() - deletedChains))
1496 
1497     // Sync rules.
1498     proxier.iptablesData.Reset()
1499     proxier.iptablesData.WriteString("*filter\n")
1500     proxier.iptablesData.Write(proxier.filterChains.Bytes())
1501     proxier.iptablesData.Write(proxier.filterRules.Bytes())
1502     proxier.iptablesData.WriteString("COMMIT\n")
1503     proxier.iptablesData.WriteString("*nat\n")
1504     proxier.iptablesData.Write(proxier.natChains.Bytes())
1505     proxier.iptablesData.Write(proxier.natRules.Bytes())
1506     proxier.iptablesData.WriteString("COMMIT\n")
1507 
1508     klog.V(2).InfoS("Reloading service iptables data",
1509         "numServices", len(proxier.svcPortMap),
1510         "numEndpoints", totalEndpoints,
1511         "numFilterChains", proxier.filterChains.Lines(),
1512         "numFilterRules", proxier.filterRules.Lines(),
1513         "numNATChains", proxier.natChains.Lines(),
1514         "numNATRules", proxier.natRules.Lines(),
1515     )
1516     klog.V(9).InfoS("Restoring iptables", "rules", proxier.iptablesData.Bytes())
1517 
1518     // NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table
1519     err := proxier.iptables.RestoreAll(proxier.iptablesData.Bytes(), utiliptables.NoFlushTables, utiliptables.RestoreCounters)
1520     if err != nil {
1521         if pErr, ok := err.(utiliptables.ParseError); ok {
1522             lines := utiliptables.ExtractLines(proxier.iptablesData.Bytes(), pErr.Line(), 3)
1523             klog.ErrorS(pErr, "Failed to execute iptables-restore", "rules", lines)
1524         } else {
1525             klog.ErrorS(err, "Failed to execute iptables-restore")
1526         }
1527         metrics.IptablesRestoreFailuresTotal.Inc()
1528         return
1529     }
1530     success = true
1531     proxier.needFullSync = false
1532 
1533     for name, lastChangeTriggerTimes := range endpointUpdateResult.LastChangeTriggerTimes {
1534         for _, lastChangeTriggerTime := range lastChangeTriggerTimes {
1535             latency := metrics.SinceInSeconds(lastChangeTriggerTime)
1536             metrics.NetworkProgrammingLatency.Observe(latency)
1537             klog.V(4).InfoS("Network programming", "endpoint", klog.KRef(name.Namespace, name.Name), "elapsed", latency)
1538         }
1539     }
1540 
1541     metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("internal").Set(float64(serviceNoLocalEndpointsTotalInternal))
1542     metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("external").Set(float64(serviceNoLocalEndpointsTotalExternal))
1543     if proxier.healthzServer != nil {
1544         proxier.healthzServer.Updated(proxier.ipFamily)
1545     }
1546     metrics.SyncProxyRulesLastTimestamp.SetToCurrentTime()
1547 
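    // Worked example for the NetworkProgrammingLatency observations above (the
    // timestamps are hypothetical): if an endpoint change carried a
    // last-change-trigger-time of 12:00:00.0 and the restore finished at
    // 12:00:01.2, the observed programming latency for that endpoint is 1.2s.
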
    // Update service healthchecks. The endpoints list might include services that are
    // not "OnlyLocal", but the services list will not, and the serviceHealthServer
    // will just drop those endpoints.
1551     if err := proxier.serviceHealthServer.SyncServices(proxier.svcPortMap.HealthCheckNodePorts()); err != nil {
1552         klog.ErrorS(err, "Error syncing healthcheck services")
1553     }
1554     if err := proxier.serviceHealthServer.SyncEndpoints(proxier.endpointsMap.LocalReadyEndpoints()); err != nil {
1555         klog.ErrorS(err, "Error syncing healthcheck endpoints")
1556     }
1557 
1558     // Finish housekeeping, clear stale conntrack entries for UDP Services
1559     conntrack.CleanStaleEntries(proxier.iptables.IsIPv6(), proxier.exec, proxier.svcPortMap, serviceUpdateResult, endpointUpdateResult)
1560 }
1561 
1562 func (proxier *Proxier) writeServiceToEndpointRules(natRules proxyutil.LineBuffer, svcPortNameString string, svcInfo proxy.ServicePort, svcChain utiliptables.Chain, endpoints []proxy.Endpoint, args []string) {
1563     // First write session affinity rules, if applicable.
1564     if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
1565         for _, ep := range endpoints {
1566             epInfo, ok := ep.(*endpointInfo)
1567             if !ok {
1568                 continue
1569             }
1570             comment := fmt.Sprintf(`"%s -> %s"`, svcPortNameString, epInfo.String())
1571 
1572             args = append(args[:0],
1573                 "-A", string(svcChain),
1574             )
1575             args = proxier.appendServiceCommentLocked(args, comment)
1576             args = append(args,
1577                 "-m", "recent", "--name", string(epInfo.ChainName),
1578                 "--rcheck", "--seconds", strconv.Itoa(svcInfo.StickyMaxAgeSeconds()), "--reap",
1579                 "-j", string(epInfo.ChainName),
1580             )
1581             natRules.Write(args)
1582         }
1583     }
1584 
1585     // Now write loadbalancing rules.
1586     numEndpoints := len(endpoints)
1587     for i, ep := range endpoints {
1588         epInfo, ok := ep.(*endpointInfo)
1589         if !ok {
1590             continue
1591         }
1592         comment := fmt.Sprintf(`"%s -> %s"`, svcPortNameString, epInfo.String())
1593 
1594         args = append(args[:0], "-A", string(svcChain))
1595         args = proxier.appendServiceCommentLocked(args, comment)
1596         if i < (numEndpoints - 1) {
1597             // Each rule is a probabilistic match.
1598             args = append(args,
1599                 "-m", "statistic",
1600                 "--mode", "random",
1601                 "--probability", proxier.probability(numEndpoints-i))
1602         }
1603         // The final (or only if n == 1) rule is a guaranteed match.
1604         natRules.Write(args, "-j", string(epInfo.ChainName))
1605     }
1606 }
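
// Worked example of the load-balancing math above (the chain names are
// hypothetical hashes, and the rendered rules are approximate): with three
// endpoints, the emitted rules look roughly like
//
//	-A KUBE-SVC-XXXX ... -m statistic --mode random --probability 0.3333333333 -j KUBE-SEP-AAAA
//	-A KUBE-SVC-XXXX ... -m statistic --mode random --probability 0.5000000000 -j KUBE-SEP-BBBB
//	-A KUBE-SVC-XXXX ... -j KUBE-SEP-CCCC
//
// Each rule only sees traffic that the earlier rules did not claim, so endpoint i
// (0-indexed) is selected with probability 1/(n-i) of the remaining (n-i)/n of
// the traffic, i.e. 1/n overall.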