// Source: k8s.io/kubernetes@v1.29.3/pkg/proxy/ipvs/proxier.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ipvs

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"net"
	"reflect"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"k8s.io/klog/v2"
	utilexec "k8s.io/utils/exec"
	netutils "k8s.io/utils/net"

	v1 "k8s.io/api/core/v1"
	discovery "k8s.io/api/discovery/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/version"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/events"
	utilsysctl "k8s.io/component-helpers/node/util/sysctl"
	"k8s.io/kubernetes/pkg/proxy"
	"k8s.io/kubernetes/pkg/proxy/conntrack"
	"k8s.io/kubernetes/pkg/proxy/healthcheck"
	utilipset "k8s.io/kubernetes/pkg/proxy/ipvs/ipset"
	utilipvs "k8s.io/kubernetes/pkg/proxy/ipvs/util"
	"k8s.io/kubernetes/pkg/proxy/metaproxier"
	"k8s.io/kubernetes/pkg/proxy/metrics"
	proxyutil "k8s.io/kubernetes/pkg/proxy/util"
	proxyutiliptables "k8s.io/kubernetes/pkg/proxy/util/iptables"
	"k8s.io/kubernetes/pkg/util/async"
	utiliptables "k8s.io/kubernetes/pkg/util/iptables"
	utilkernel "k8s.io/kubernetes/pkg/util/kernel"
)

// Names of the iptables chains managed by the ipvs proxier, plus the defaults
// used when no scheduler or dummy device is configured.
const (
	// kubeServicesChain is the services portal chain.
	kubeServicesChain utiliptables.Chain = "KUBE-SERVICES"

	// kubeProxyFirewallChain is the kube-proxy firewall chain.
	kubeProxyFirewallChain utiliptables.Chain = "KUBE-PROXY-FIREWALL"

	// kubeSourceRangesFirewallChain is the firewall subchain for LoadBalancerSourceRanges.
	kubeSourceRangesFirewallChain utiliptables.Chain = "KUBE-SOURCE-RANGES-FIREWALL"

	// kubePostroutingChain is the kubernetes postrouting chain.
	kubePostroutingChain utiliptables.Chain = "KUBE-POSTROUTING"

	// kubeMarkMasqChain is the mark-for-masquerade chain.
	kubeMarkMasqChain utiliptables.Chain = "KUBE-MARK-MASQ"

	// kubeNodePortChain is the kubernetes node port chain.
	kubeNodePortChain utiliptables.Chain = "KUBE-NODE-PORT"

	// kubeForwardChain is the kubernetes forward chain.
	kubeForwardChain utiliptables.Chain = "KUBE-FORWARD"

	// kubeLoadBalancerChain is the kubernetes chain for loadbalancer type services.
	kubeLoadBalancerChain utiliptables.Chain = "KUBE-LOAD-BALANCER"

	// kubeIPVSFilterChain filters external access to main netns.
	// https://github.com/kubernetes/kubernetes/issues/72236
	kubeIPVSFilterChain utiliptables.Chain = "KUBE-IPVS-FILTER"

	// kubeIPVSOutFilterChain filters access to load balancer services from the node.
	// https://github.com/kubernetes/kubernetes/issues/119656
	kubeIPVSOutFilterChain utiliptables.Chain = "KUBE-IPVS-OUT-FILTER"

	// defaultScheduler is the default ipvs scheduler algorithm - round robin.
	defaultScheduler = "rr"

	// defaultDummyDevice is the name of the default dummy interface that ipvs
	// service addresses are bound to.
	defaultDummyDevice = "kube-ipvs0"
)

// iptablesJumpChain is tables of iptables chains that ipvs proxier used to install iptables or cleanup iptables.
// `to` is the iptables chain we want to operate.
// `from` is the source iptables chain
var iptablesJumpChain = []struct {
	table   utiliptables.Table
	from    utiliptables.Chain
	to      utiliptables.Chain
	comment string
}{
	{utiliptables.TableNAT, utiliptables.ChainOutput, kubeServicesChain, "kubernetes service portals"},
	{utiliptables.TableNAT, utiliptables.ChainPrerouting, kubeServicesChain, "kubernetes service portals"},
	{utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, "kubernetes postrouting rules"},
	{utiliptables.TableFilter, utiliptables.ChainForward, kubeForwardChain, "kubernetes forwarding rules"},
	{utiliptables.TableFilter, utiliptables.ChainInput, kubeNodePortChain, "kubernetes health check rules"},
	{utiliptables.TableFilter, utiliptables.ChainInput, kubeProxyFirewallChain, "kube-proxy firewall rules"},
	{utiliptables.TableFilter, utiliptables.ChainForward, kubeProxyFirewallChain, "kube-proxy firewall rules"},
	{utiliptables.TableFilter, utiliptables.ChainInput, kubeIPVSFilterChain, "kubernetes ipvs access filter"},
	{utiliptables.TableFilter, utiliptables.ChainOutput, kubeIPVSOutFilterChain, "kubernetes ipvs access filter"},
}

// iptablesChains lists the (table, chain) pairs owned by the ipvs proxier.
// NOTE(review): presumably these are the chains ensured during sync; the
// consumer is not visible in this part of the file - confirm.
var iptablesChains = []struct {
	table utiliptables.Table
	chain utiliptables.Chain
}{
	{utiliptables.TableNAT, kubeServicesChain},
	{utiliptables.TableNAT, kubePostroutingChain},
	{utiliptables.TableNAT, kubeNodePortChain},
	{utiliptables.TableNAT, kubeLoadBalancerChain},
	{utiliptables.TableNAT, kubeMarkMasqChain},
	{utiliptables.TableFilter, kubeForwardChain},
	{utiliptables.TableFilter, kubeNodePortChain},
	{utiliptables.TableFilter, kubeProxyFirewallChain},
	{utiliptables.TableFilter, kubeSourceRangesFirewallChain},
	{utiliptables.TableFilter, kubeIPVSFilterChain},
	{utiliptables.TableFilter, kubeIPVSOutFilterChain},
}

// iptablesCleanupChains lists the (table, chain) pairs that
// cleanupIptablesLeftovers flushes and then deletes. It has the same entries
// as iptablesChains except for the NAT kubeMarkMasqChain, which is not listed.
var iptablesCleanupChains = []struct {
	table utiliptables.Table
	chain utiliptables.Chain
}{
	{utiliptables.TableNAT, kubeServicesChain},
	{utiliptables.TableNAT, kubePostroutingChain},
	{utiliptables.TableNAT, kubeNodePortChain},
	{utiliptables.TableNAT, kubeLoadBalancerChain},
	{utiliptables.TableFilter, kubeForwardChain},
	{utiliptables.TableFilter, kubeNodePortChain},
	{utiliptables.TableFilter, kubeProxyFirewallChain},
	{utiliptables.TableFilter, kubeSourceRangesFirewallChain},
	{utiliptables.TableFilter, kubeIPVSFilterChain},
	{utiliptables.TableFilter, kubeIPVSOutFilterChain},
}

// ipsetInfo is all ipset we needed in ipvs proxier.
// NewProxier creates one IPSet per entry, and CleanupLeftovers destroys the
// set named by each entry.
var ipsetInfo = []struct {
	name    string
	setType utilipset.Type
	comment string
}{
	{kubeLoopBackIPSet, utilipset.HashIPPortIP, kubeLoopBackIPSetComment},
	{kubeClusterIPSet, utilipset.HashIPPort, kubeClusterIPSetComment},
	{kubeExternalIPSet, utilipset.HashIPPort, kubeExternalIPSetComment},
	{kubeExternalIPLocalSet, utilipset.HashIPPort, kubeExternalIPLocalSetComment},
	{kubeLoadBalancerSet, utilipset.HashIPPort, kubeLoadBalancerSetComment},
	{kubeLoadBalancerFWSet, utilipset.HashIPPort, kubeLoadBalancerFWSetComment},
	{kubeLoadBalancerLocalSet, utilipset.HashIPPort, kubeLoadBalancerLocalSetComment},
	{kubeLoadBalancerSourceIPSet, utilipset.HashIPPortIP, kubeLoadBalancerSourceIPSetComment},
	{kubeLoadBalancerSourceCIDRSet, utilipset.HashIPPortNet, kubeLoadBalancerSourceCIDRSetComment},
	{kubeNodePortSetTCP, utilipset.BitmapPort, kubeNodePortSetTCPComment},
	{kubeNodePortLocalSetTCP, utilipset.BitmapPort, kubeNodePortLocalSetTCPComment},
	{kubeNodePortSetUDP, utilipset.BitmapPort, kubeNodePortSetUDPComment},
	{kubeNodePortLocalSetUDP, utilipset.BitmapPort, kubeNodePortLocalSetUDPComment},
	{kubeNodePortSetSCTP, utilipset.HashIPPort, kubeNodePortSetSCTPComment},
	{kubeNodePortLocalSetSCTP, utilipset.HashIPPort, kubeNodePortLocalSetSCTPComment},
	{kubeHealthCheckNodePortSet, utilipset.BitmapPort, kubeHealthCheckNodePortSetComment},
	{kubeIPVSSet, utilipset.HashIP, kubeIPVSSetComment},
}

// ipsetWithIptablesChain is the ipsets list with iptables source chain and the chain jump to
// `iptables -t nat -A <from> -m set --match-set <name> <matchType> -j <to>`
// example: iptables -t nat -A KUBE-SERVICES -m set --match-set KUBE-NODE-PORT-TCP dst -j KUBE-NODE-PORT
// ipsets with other match rules will be created individually.
// Note: kubeNodePortLocalSetTCP must be prior to kubeNodePortSetTCP, the same for UDP.
var ipsetWithIptablesChain = []struct {
	name          string
	table         utiliptables.Table
	from          string
	to            string
	matchType     string
	protocolMatch string
}{
	{kubeLoopBackIPSet, utiliptables.TableNAT, string(kubePostroutingChain), "MASQUERADE", "dst,dst,src", ""},
	{kubeLoadBalancerSet, utiliptables.TableNAT, string(kubeServicesChain), string(kubeLoadBalancerChain), "dst,dst", ""},
	{kubeLoadBalancerLocalSet, utiliptables.TableNAT, string(kubeLoadBalancerChain), "RETURN", "dst,dst", ""},
	{kubeNodePortLocalSetTCP, utiliptables.TableNAT, string(kubeNodePortChain), "RETURN", "dst", utilipset.ProtocolTCP},
	{kubeNodePortSetTCP, utiliptables.TableNAT, string(kubeNodePortChain), string(kubeMarkMasqChain), "dst", utilipset.ProtocolTCP},
	{kubeNodePortLocalSetUDP, utiliptables.TableNAT, string(kubeNodePortChain), "RETURN", "dst", utilipset.ProtocolUDP},
	{kubeNodePortSetUDP, utiliptables.TableNAT, string(kubeNodePortChain), string(kubeMarkMasqChain), "dst", utilipset.ProtocolUDP},
	{kubeNodePortLocalSetSCTP, utiliptables.TableNAT, string(kubeNodePortChain), "RETURN", "dst,dst", utilipset.ProtocolSCTP},
	{kubeNodePortSetSCTP, utiliptables.TableNAT, string(kubeNodePortChain), string(kubeMarkMasqChain), "dst,dst", utilipset.ProtocolSCTP},

	{kubeLoadBalancerFWSet, utiliptables.TableFilter, string(kubeProxyFirewallChain), string(kubeSourceRangesFirewallChain), "dst,dst", ""},
	{kubeLoadBalancerSourceCIDRSet, utiliptables.TableFilter, string(kubeSourceRangesFirewallChain), "RETURN", "dst,dst,src", ""},
	{kubeLoadBalancerSourceIPSet, utiliptables.TableFilter, string(kubeSourceRangesFirewallChain), "RETURN", "dst,dst,src", ""},
}

// In IPVS proxy mode, the following flags need to be set.
// NewProxier ensures each of them; the two ARP settings are only applied when
// strictARP is requested.
const (
	sysctlVSConnTrack             = "net/ipv4/vs/conntrack"
	sysctlConnReuse               = "net/ipv4/vs/conn_reuse_mode"
	sysctlExpireNoDestConn        = "net/ipv4/vs/expire_nodest_conn"
	sysctlExpireQuiescentTemplate = "net/ipv4/vs/expire_quiescent_template"
	sysctlForward                 = "net/ipv4/ip_forward"
	sysctlArpIgnore               = "net/ipv4/conf/all/arp_ignore"
	sysctlArpAnnounce             = "net/ipv4/conf/all/arp_announce"
)

// Proxier is an ipvs based proxy for connections between a localhost:lport
// and services that provide the actual backends.
type Proxier struct {
	// the ipfamily on which this proxy is operating on.
	ipFamily v1.IPFamily
	// endpointsChanges and serviceChanges contains all changes to endpoints and
	// services that happened since last syncProxyRules call. For a single object,
	// changes are accumulated, i.e. previous is state from before all of them,
	// current is state after applying all of those.
	endpointsChanges *proxy.EndpointsChangeTracker
	serviceChanges   *proxy.ServiceChangeTracker

	mu           sync.Mutex // protects the following fields
	svcPortMap   proxy.ServicePortMap
	endpointsMap proxy.EndpointsMap
	nodeLabels   map[string]string
	// initialSync is a bool indicating if the proxier is syncing for the first time.
	// It is set to true when a new proxier is initialized and then set to false on all
	// future syncs.
	// This lets us run specific logic that's required only during proxy startup.
	// For eg: it enables us to update weights of existing destinations only on startup
	// saving us the cost of querying and updating real servers during every sync.
	initialSync bool
	// endpointSlicesSynced, and servicesSynced are set to true when
	// corresponding objects are synced after startup. This is used to avoid updating
	// ipvs rules with some partial data after kube-proxy restart.
	endpointSlicesSynced bool
	servicesSynced       bool
	// initialized is read and written atomically (see setInitialized and
	// isInitialized); a value greater than zero means initialized.
	initialized int32
	syncRunner  *async.BoundedFrequencyRunner // governs calls to syncProxyRules

	// These are effectively const and do not need the mutex to be held.
	syncPeriod    time.Duration
	minSyncPeriod time.Duration
	// Values are CIDR's to exclude when cleaning up IPVS rules.
	excludeCIDRs []*net.IPNet
	// Set to true to set sysctls arp_ignore and arp_announce
	strictARP      bool
	iptables       utiliptables.Interface
	ipvs           utilipvs.Interface
	ipset          utilipset.Interface
	exec           utilexec.Interface
	masqueradeAll  bool
	masqueradeMark string
	localDetector  proxyutiliptables.LocalTrafficDetector
	hostname       string
	nodeIP         net.IP
	recorder       events.EventRecorder

	serviceHealthServer healthcheck.ServiceHealthServer
	healthzServer       *healthcheck.ProxierHealthServer

	ipvsScheduler string
	// The following buffers are used to reuse memory and avoid allocations
	// that are significantly impacting performance.
	iptablesData     *bytes.Buffer
	filterChainsData *bytes.Buffer
	natChains        proxyutil.LineBuffer
	filterChains     proxyutil.LineBuffer
	natRules         proxyutil.LineBuffer
	filterRules      proxyutil.LineBuffer
	// Added as a member to the struct to allow injection for testing.
	netlinkHandle NetLinkHandle
	// ipsetList is the list of ipsets that ipvs proxier used.
	ipsetList map[string]*IPSet
	// nodePortAddresses selects the interfaces where nodePort works.
	nodePortAddresses *proxyutil.NodePortAddresses
	// networkInterfacer defines an interface for several net library functions.
	// Inject for test purpose.
	networkInterfacer     proxyutil.NetworkInterfacer
	gracefuldeleteManager *GracefulTerminationManager
	// serviceNoLocalEndpointsInternal represents the set of services that couldn't be applied
	// due to the absence of local endpoints when the internal traffic policy is "Local".
	// It is used to publish the sync_proxy_rules_no_endpoints_total
	// metric with the traffic_policy label set to "internal".
	// A Set is used here since we end up calculating endpoint topology multiple times for the same Service
	// if it has multiple ports but each Service should only be counted once.
	serviceNoLocalEndpointsInternal sets.Set[string]
	// serviceNoLocalEndpointsExternal represents the set of services that couldn't be applied
	// due to the absence of any endpoints when the external traffic policy is "Local".
	// It is used to publish the sync_proxy_rules_no_endpoints_total
	// metric with the traffic_policy label set to "external".
	// A Set is used here since we end up calculating endpoint topology multiple times for the same Service
	// if it has multiple ports but each Service should only be counted once.
	serviceNoLocalEndpointsExternal sets.Set[string]
	// lbNoNodeAccessIPPortProtocolEntries represents the set of loadBalancers IP + Port + Protocol that should not be accessible from K8s nodes
	// We cannot directly restrict LB access from node using LoadBalancerSourceRanges, we need to install
	// additional iptables rules.
	// (ref: https://github.com/kubernetes/kubernetes/issues/119656)
	lbNoNodeAccessIPPortProtocolEntries []*utilipset.Entry
}

// Proxier implements proxy.Provider
var _ proxy.Provider = &Proxier{}

// NewProxier returns a new Proxier given an iptables and ipvs Interface instance.
// Because of the iptables and ipvs logic, it is assumed that there is only a single Proxier active on a machine.
312 // An error will be returned if it fails to update or acquire the initial lock. 313 // Once a proxier is created, it will keep iptables and ipvs rules up to date in the background and 314 // will not terminate if a particular iptables or ipvs call fails. 315 func NewProxier(ipFamily v1.IPFamily, 316 ipt utiliptables.Interface, 317 ipvs utilipvs.Interface, 318 ipset utilipset.Interface, 319 sysctl utilsysctl.Interface, 320 exec utilexec.Interface, 321 syncPeriod time.Duration, 322 minSyncPeriod time.Duration, 323 excludeCIDRs []string, 324 strictARP bool, 325 tcpTimeout time.Duration, 326 tcpFinTimeout time.Duration, 327 udpTimeout time.Duration, 328 masqueradeAll bool, 329 masqueradeBit int, 330 localDetector proxyutiliptables.LocalTrafficDetector, 331 hostname string, 332 nodeIP net.IP, 333 recorder events.EventRecorder, 334 healthzServer *healthcheck.ProxierHealthServer, 335 scheduler string, 336 nodePortAddressStrings []string, 337 initOnly bool, 338 ) (*Proxier, error) { 339 // Set the conntrack sysctl we need for 340 if err := proxyutil.EnsureSysctl(sysctl, sysctlVSConnTrack, 1); err != nil { 341 return nil, err 342 } 343 344 kernelVersion, err := utilkernel.GetVersion() 345 if err != nil { 346 return nil, fmt.Errorf("failed to get kernel version: %w", err) 347 } 348 349 if kernelVersion.LessThan(version.MustParseGeneric(utilkernel.IPVSConnReuseModeMinSupportedKernelVersion)) { 350 klog.ErrorS(nil, "Can't set sysctl, kernel version doesn't satisfy minimum version requirements", "sysctl", sysctlConnReuse, "minimumKernelVersion", utilkernel.IPVSConnReuseModeMinSupportedKernelVersion) 351 } else if kernelVersion.AtLeast(version.MustParseGeneric(utilkernel.IPVSConnReuseModeFixedKernelVersion)) { 352 // https://github.com/kubernetes/kubernetes/issues/93297 353 klog.V(2).InfoS("Left as-is", "sysctl", sysctlConnReuse) 354 } else { 355 // Set the connection reuse mode 356 if err := proxyutil.EnsureSysctl(sysctl, sysctlConnReuse, 0); err != nil { 357 return nil, 
err 358 } 359 } 360 361 // Set the expire_nodest_conn sysctl we need for 362 if err := proxyutil.EnsureSysctl(sysctl, sysctlExpireNoDestConn, 1); err != nil { 363 return nil, err 364 } 365 366 // Set the expire_quiescent_template sysctl we need for 367 if err := proxyutil.EnsureSysctl(sysctl, sysctlExpireQuiescentTemplate, 1); err != nil { 368 return nil, err 369 } 370 371 // Set the ip_forward sysctl we need for 372 if err := proxyutil.EnsureSysctl(sysctl, sysctlForward, 1); err != nil { 373 return nil, err 374 } 375 376 if strictARP { 377 // Set the arp_ignore sysctl we need for 378 if err := proxyutil.EnsureSysctl(sysctl, sysctlArpIgnore, 1); err != nil { 379 return nil, err 380 } 381 382 // Set the arp_announce sysctl we need for 383 if err := proxyutil.EnsureSysctl(sysctl, sysctlArpAnnounce, 2); err != nil { 384 return nil, err 385 } 386 } 387 388 // Configure IPVS timeouts if any one of the timeout parameters have been set. 389 // This is the equivalent to running ipvsadm --set, a value of 0 indicates the 390 // current system timeout should be preserved 391 if tcpTimeout > 0 || tcpFinTimeout > 0 || udpTimeout > 0 { 392 if err := ipvs.ConfigureTimeouts(tcpTimeout, tcpFinTimeout, udpTimeout); err != nil { 393 klog.ErrorS(err, "Failed to configure IPVS timeouts") 394 } 395 } 396 397 if initOnly { 398 klog.InfoS("System initialized and --init-only specified") 399 return nil, nil 400 } 401 402 // Generate the masquerade mark to use for SNAT rules. 
403 masqueradeValue := 1 << uint(masqueradeBit) 404 masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue) 405 406 klog.V(2).InfoS("Record nodeIP and family", "nodeIP", nodeIP, "family", ipFamily) 407 408 if len(scheduler) == 0 { 409 klog.InfoS("IPVS scheduler not specified, use rr by default") 410 scheduler = defaultScheduler 411 } 412 413 nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings) 414 415 serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer) 416 417 // excludeCIDRs has been validated before, here we just parse it to IPNet list 418 parsedExcludeCIDRs, _ := netutils.ParseCIDRs(excludeCIDRs) 419 420 proxier := &Proxier{ 421 ipFamily: ipFamily, 422 svcPortMap: make(proxy.ServicePortMap), 423 serviceChanges: proxy.NewServiceChangeTracker(newServiceInfo, ipFamily, recorder, nil), 424 endpointsMap: make(proxy.EndpointsMap), 425 endpointsChanges: proxy.NewEndpointsChangeTracker(hostname, nil, ipFamily, recorder, nil), 426 initialSync: true, 427 syncPeriod: syncPeriod, 428 minSyncPeriod: minSyncPeriod, 429 excludeCIDRs: parsedExcludeCIDRs, 430 iptables: ipt, 431 masqueradeAll: masqueradeAll, 432 masqueradeMark: masqueradeMark, 433 exec: exec, 434 localDetector: localDetector, 435 hostname: hostname, 436 nodeIP: nodeIP, 437 recorder: recorder, 438 serviceHealthServer: serviceHealthServer, 439 healthzServer: healthzServer, 440 ipvs: ipvs, 441 ipvsScheduler: scheduler, 442 iptablesData: bytes.NewBuffer(nil), 443 filterChainsData: bytes.NewBuffer(nil), 444 natChains: proxyutil.NewLineBuffer(), 445 natRules: proxyutil.NewLineBuffer(), 446 filterChains: proxyutil.NewLineBuffer(), 447 filterRules: proxyutil.NewLineBuffer(), 448 netlinkHandle: NewNetLinkHandle(ipFamily == v1.IPv6Protocol), 449 ipset: ipset, 450 nodePortAddresses: nodePortAddresses, 451 networkInterfacer: proxyutil.RealNetwork{}, 452 gracefuldeleteManager: NewGracefulTerminationManager(ipvs), 453 } 454 // 
initialize ipsetList with all sets we needed 455 proxier.ipsetList = make(map[string]*IPSet) 456 for _, is := range ipsetInfo { 457 proxier.ipsetList[is.name] = NewIPSet(ipset, is.name, is.setType, (ipFamily == v1.IPv6Protocol), is.comment) 458 } 459 burstSyncs := 2 460 klog.V(2).InfoS("ipvs sync params", "ipFamily", ipt.Protocol(), "minSyncPeriod", minSyncPeriod, "syncPeriod", syncPeriod, "burstSyncs", burstSyncs) 461 proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs) 462 proxier.gracefuldeleteManager.Run() 463 return proxier, nil 464 } 465 466 // NewDualStackProxier returns a new Proxier for dual-stack operation 467 func NewDualStackProxier( 468 ipt [2]utiliptables.Interface, 469 ipvs utilipvs.Interface, 470 ipset utilipset.Interface, 471 sysctl utilsysctl.Interface, 472 exec utilexec.Interface, 473 syncPeriod time.Duration, 474 minSyncPeriod time.Duration, 475 excludeCIDRs []string, 476 strictARP bool, 477 tcpTimeout time.Duration, 478 tcpFinTimeout time.Duration, 479 udpTimeout time.Duration, 480 masqueradeAll bool, 481 masqueradeBit int, 482 localDetectors [2]proxyutiliptables.LocalTrafficDetector, 483 hostname string, 484 nodeIPs map[v1.IPFamily]net.IP, 485 recorder events.EventRecorder, 486 healthzServer *healthcheck.ProxierHealthServer, 487 scheduler string, 488 nodePortAddresses []string, 489 initOnly bool, 490 ) (proxy.Provider, error) { 491 492 safeIpset := newSafeIpset(ipset) 493 494 // Create an ipv4 instance of the single-stack proxier 495 ipv4Proxier, err := NewProxier(v1.IPv4Protocol, ipt[0], ipvs, safeIpset, sysctl, 496 exec, syncPeriod, minSyncPeriod, filterCIDRs(false, excludeCIDRs), strictARP, 497 tcpTimeout, tcpFinTimeout, udpTimeout, masqueradeAll, masqueradeBit, 498 localDetectors[0], hostname, nodeIPs[v1.IPv4Protocol], recorder, 499 healthzServer, scheduler, nodePortAddresses, initOnly) 500 if err != nil { 501 return nil, fmt.Errorf("unable to create ipv4 
proxier: %v", err) 502 } 503 504 ipv6Proxier, err := NewProxier(v1.IPv6Protocol, ipt[1], ipvs, safeIpset, sysctl, 505 exec, syncPeriod, minSyncPeriod, filterCIDRs(true, excludeCIDRs), strictARP, 506 tcpTimeout, tcpFinTimeout, udpTimeout, masqueradeAll, masqueradeBit, 507 localDetectors[1], hostname, nodeIPs[v1.IPv6Protocol], recorder, 508 healthzServer, scheduler, nodePortAddresses, initOnly) 509 if err != nil { 510 return nil, fmt.Errorf("unable to create ipv6 proxier: %v", err) 511 } 512 if initOnly { 513 return nil, nil 514 } 515 516 // Return a meta-proxier that dispatch calls between the two 517 // single-stack proxier instances 518 return metaproxier.NewMetaProxier(ipv4Proxier, ipv6Proxier), nil 519 } 520 521 func filterCIDRs(wantIPv6 bool, cidrs []string) []string { 522 var filteredCIDRs []string 523 for _, cidr := range cidrs { 524 if netutils.IsIPv6CIDRString(cidr) == wantIPv6 { 525 filteredCIDRs = append(filteredCIDRs, cidr) 526 } 527 } 528 return filteredCIDRs 529 } 530 531 // internal struct for string service information 532 type servicePortInfo struct { 533 *proxy.BaseServicePortInfo 534 // The following fields are computed and stored for performance reasons. 535 nameString string 536 } 537 538 // returns a new proxy.ServicePort which abstracts a serviceInfo 539 func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *proxy.BaseServicePortInfo) proxy.ServicePort { 540 svcPort := &servicePortInfo{BaseServicePortInfo: bsvcPortInfo} 541 542 // Store the following for performance reasons. 543 svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name} 544 svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name} 545 svcPort.nameString = svcPortName.String() 546 547 return svcPort 548 } 549 550 // getFirstColumn reads all the content from r into memory and return a 551 // slice which consists of the first word from each line. 
552 func getFirstColumn(r io.Reader) ([]string, error) { 553 b, err := io.ReadAll(r) 554 if err != nil { 555 return nil, err 556 } 557 558 lines := strings.Split(string(b), "\n") 559 words := make([]string, 0, len(lines)) 560 for i := range lines { 561 fields := strings.Fields(lines[i]) 562 if len(fields) > 0 { 563 words = append(words, fields[0]) 564 } 565 } 566 return words, nil 567 } 568 569 // CanUseIPVSProxier checks if we can use the ipvs Proxier. 570 // The ipset version and the scheduler are checked. If any virtual servers (VS) 571 // already exist with the configured scheduler, we just return. Otherwise 572 // we check if a dummy VS can be configured with the configured scheduler. 573 // Kernel modules will be loaded automatically if necessary. 574 func CanUseIPVSProxier(ipvs utilipvs.Interface, ipsetver IPSetVersioner, scheduler string) error { 575 // BUG: https://github.com/moby/ipvs/issues/27 576 // If ipvs is not compiled into the kernel no error is returned and handle==nil. 577 // This in turn causes ipvs.GetVirtualServers and ipvs.AddVirtualServer 578 // to return ok (err==nil). If/when this bug is fixed parameter "ipvs" will be nil 579 // if ipvs is not supported by the kernel. Until then a re-read work-around is used. 580 if ipvs == nil { 581 return fmt.Errorf("Ipvs not supported by the kernel") 582 } 583 584 // Check ipset version 585 versionString, err := ipsetver.GetVersion() 586 if err != nil { 587 return fmt.Errorf("error getting ipset version, error: %v", err) 588 } 589 if !checkMinVersion(versionString) { 590 return fmt.Errorf("ipset version: %s is less than min required version: %s", versionString, MinIPSetCheckVersion) 591 } 592 593 if scheduler == "" { 594 scheduler = defaultScheduler 595 } 596 597 // If any virtual server (VS) using the scheduler exist we skip the checks. 
598 vservers, err := ipvs.GetVirtualServers() 599 if err != nil { 600 klog.ErrorS(err, "Can't read the ipvs") 601 return err 602 } 603 klog.V(5).InfoS("Virtual Servers", "count", len(vservers)) 604 if len(vservers) > 0 { 605 // This is most likely a kube-proxy re-start. We know that ipvs works 606 // and if any VS uses the configured scheduler, we are done. 607 for _, vs := range vservers { 608 if vs.Scheduler == scheduler { 609 klog.V(5).InfoS("VS exist, Skipping checks") 610 return nil 611 } 612 } 613 klog.V(5).InfoS("No existing VS uses the configured scheduler", "scheduler", scheduler) 614 } 615 616 // Try to insert a dummy VS with the passed scheduler. 617 // We should use a VIP address that is not used on the node. 618 // An address "198.51.100.0" from the TEST-NET-2 rage in https://datatracker.ietf.org/doc/html/rfc5737 619 // is used. These addresses are reserved for documentation. If the user is using 620 // this address for a VS anyway we *will* mess up, but that would be an invalid configuration. 621 // If the user have configured the address to an interface on the node (but not a VS) 622 // then traffic will temporary be routed to ipvs during the probe and dropped. 623 // The later case is also and invalid configuration, but the traffic impact will be minor. 624 // This should not be a problem if users honors reserved addresses, but cut/paste 625 // from documentation is not unheard of, so the restriction to not use the TEST-NET-2 range 626 // must be documented. 627 vs := utilipvs.VirtualServer{ 628 Address: netutils.ParseIPSloppy("198.51.100.0"), 629 Protocol: "TCP", 630 Port: 20000, 631 Scheduler: scheduler, 632 } 633 if err := ipvs.AddVirtualServer(&vs); err != nil { 634 klog.ErrorS(err, "Could not create dummy VS", "scheduler", scheduler) 635 return err 636 } 637 638 // To overcome the BUG described above we check that the VS is *really* added. 
639 vservers, err = ipvs.GetVirtualServers() 640 if err != nil { 641 klog.ErrorS(err, "ipvs.GetVirtualServers") 642 return err 643 } 644 klog.V(5).InfoS("Virtual Servers after adding dummy", "count", len(vservers)) 645 if len(vservers) == 0 { 646 klog.InfoS("Dummy VS not created", "scheduler", scheduler) 647 return fmt.Errorf("Ipvs not supported") // This is a BUG work-around 648 } 649 klog.V(5).InfoS("Dummy VS created", "vs", vs) 650 651 if err := ipvs.DeleteVirtualServer(&vs); err != nil { 652 klog.ErrorS(err, "Could not delete dummy VS") 653 return err 654 } 655 656 return nil 657 } 658 659 // CleanupIptablesLeftovers removes all iptables rules and chains created by the Proxier 660 // It returns true if an error was encountered. Errors are logged. 661 func cleanupIptablesLeftovers(ipt utiliptables.Interface) (encounteredError bool) { 662 // Unlink the iptables chains created by ipvs Proxier 663 for _, jc := range iptablesJumpChain { 664 args := []string{ 665 "-m", "comment", "--comment", jc.comment, 666 "-j", string(jc.to), 667 } 668 if err := ipt.DeleteRule(jc.table, jc.from, args...); err != nil { 669 if !utiliptables.IsNotFoundError(err) { 670 klog.ErrorS(err, "Error removing iptables rules in ipvs proxier") 671 encounteredError = true 672 } 673 } 674 } 675 676 // Flush and remove all of our chains. Flushing all chains before removing them also removes all links between chains first. 677 for _, ch := range iptablesCleanupChains { 678 if err := ipt.FlushChain(ch.table, ch.chain); err != nil { 679 if !utiliptables.IsNotFoundError(err) { 680 klog.ErrorS(err, "Error removing iptables rules in ipvs proxier") 681 encounteredError = true 682 } 683 } 684 } 685 686 // Remove all of our chains. 
687 for _, ch := range iptablesCleanupChains { 688 if err := ipt.DeleteChain(ch.table, ch.chain); err != nil { 689 if !utiliptables.IsNotFoundError(err) { 690 klog.ErrorS(err, "Error removing iptables rules in ipvs proxier") 691 encounteredError = true 692 } 693 } 694 } 695 696 return encounteredError 697 } 698 699 // CleanupLeftovers clean up all ipvs and iptables rules created by ipvs Proxier. 700 func CleanupLeftovers(ipvs utilipvs.Interface, ipt utiliptables.Interface, ipset utilipset.Interface) (encounteredError bool) { 701 // Clear all ipvs rules 702 if ipvs != nil { 703 err := ipvs.Flush() 704 if err != nil { 705 klog.ErrorS(err, "Error flushing ipvs rules") 706 encounteredError = true 707 } 708 } 709 // Delete dummy interface created by ipvs Proxier. 710 nl := NewNetLinkHandle(false) 711 err := nl.DeleteDummyDevice(defaultDummyDevice) 712 if err != nil { 713 klog.ErrorS(err, "Error deleting dummy device created by ipvs proxier", "device", defaultDummyDevice) 714 encounteredError = true 715 } 716 // Clear iptables created by ipvs Proxier. 717 encounteredError = cleanupIptablesLeftovers(ipt) || encounteredError 718 // Destroy ip sets created by ipvs Proxier. We should call it after cleaning up 719 // iptables since we can NOT delete ip set which is still referenced by iptables. 720 for _, set := range ipsetInfo { 721 err = ipset.DestroySet(set.name) 722 if err != nil { 723 if !utilipset.IsNotFoundError(err) { 724 klog.ErrorS(err, "Error removing ipset", "ipset", set.name) 725 encounteredError = true 726 } 727 } 728 } 729 return encounteredError 730 } 731 732 // Sync is called to synchronize the proxier state to iptables and ipvs as soon as possible. 733 func (proxier *Proxier) Sync() { 734 if proxier.healthzServer != nil { 735 proxier.healthzServer.QueuedUpdate(proxier.ipFamily) 736 } 737 metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime() 738 proxier.syncRunner.Run() 739 } 740 741 // SyncLoop runs periodic work. 
This is expected to run as a goroutine or as the main loop of the app. It does not return. 742 func (proxier *Proxier) SyncLoop() { 743 // Update healthz timestamp at beginning in case Sync() never succeeds. 744 if proxier.healthzServer != nil { 745 proxier.healthzServer.Updated(proxier.ipFamily) 746 } 747 // synthesize "last change queued" time as the informers are syncing. 748 metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime() 749 proxier.syncRunner.Loop(wait.NeverStop) 750 } 751 752 func (proxier *Proxier) setInitialized(value bool) { 753 var initialized int32 754 if value { 755 initialized = 1 756 } 757 atomic.StoreInt32(&proxier.initialized, initialized) 758 } 759 760 func (proxier *Proxier) isInitialized() bool { 761 return atomic.LoadInt32(&proxier.initialized) > 0 762 } 763 764 // OnServiceAdd is called whenever creation of new service object is observed. 765 func (proxier *Proxier) OnServiceAdd(service *v1.Service) { 766 proxier.OnServiceUpdate(nil, service) 767 } 768 769 // OnServiceUpdate is called whenever modification of an existing service object is observed. 770 func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) { 771 if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() { 772 proxier.Sync() 773 } 774 } 775 776 // OnServiceDelete is called whenever deletion of an existing service object is observed. 777 func (proxier *Proxier) OnServiceDelete(service *v1.Service) { 778 proxier.OnServiceUpdate(service, nil) 779 } 780 781 // OnServiceSynced is called once all the initial event handlers were called and the state is fully propagated to local cache. 782 func (proxier *Proxier) OnServiceSynced() { 783 proxier.mu.Lock() 784 proxier.servicesSynced = true 785 proxier.setInitialized(proxier.endpointSlicesSynced) 786 proxier.mu.Unlock() 787 788 // Sync unconditionally - this is called once per lifetime. 
789 proxier.syncProxyRules() 790 } 791 792 // OnEndpointSliceAdd is called whenever creation of a new endpoint slice object 793 // is observed. 794 func (proxier *Proxier) OnEndpointSliceAdd(endpointSlice *discovery.EndpointSlice) { 795 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() { 796 proxier.Sync() 797 } 798 } 799 800 // OnEndpointSliceUpdate is called whenever modification of an existing endpoint 801 // slice object is observed. 802 func (proxier *Proxier) OnEndpointSliceUpdate(_, endpointSlice *discovery.EndpointSlice) { 803 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() { 804 proxier.Sync() 805 } 806 } 807 808 // OnEndpointSliceDelete is called whenever deletion of an existing endpoint slice 809 // object is observed. 810 func (proxier *Proxier) OnEndpointSliceDelete(endpointSlice *discovery.EndpointSlice) { 811 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, true) && proxier.isInitialized() { 812 proxier.Sync() 813 } 814 } 815 816 // OnEndpointSlicesSynced is called once all the initial event handlers were 817 // called and the state is fully propagated to local cache. 818 func (proxier *Proxier) OnEndpointSlicesSynced() { 819 proxier.mu.Lock() 820 proxier.endpointSlicesSynced = true 821 proxier.setInitialized(proxier.servicesSynced) 822 proxier.mu.Unlock() 823 824 // Sync unconditionally - this is called once per lifetime. 825 proxier.syncProxyRules() 826 } 827 828 // OnNodeAdd is called whenever creation of new node object 829 // is observed. 
830 func (proxier *Proxier) OnNodeAdd(node *v1.Node) { 831 if node.Name != proxier.hostname { 832 klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node", "eventNode", node.Name, "currentNode", proxier.hostname) 833 return 834 } 835 836 if reflect.DeepEqual(proxier.nodeLabels, node.Labels) { 837 return 838 } 839 840 proxier.mu.Lock() 841 proxier.nodeLabels = map[string]string{} 842 for k, v := range node.Labels { 843 proxier.nodeLabels[k] = v 844 } 845 proxier.mu.Unlock() 846 klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels) 847 848 proxier.Sync() 849 } 850 851 // OnNodeUpdate is called whenever modification of an existing 852 // node object is observed. 853 func (proxier *Proxier) OnNodeUpdate(oldNode, node *v1.Node) { 854 if node.Name != proxier.hostname { 855 klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node", "eventNode", node.Name, "currentNode", proxier.hostname) 856 return 857 } 858 859 if reflect.DeepEqual(proxier.nodeLabels, node.Labels) { 860 return 861 } 862 863 proxier.mu.Lock() 864 proxier.nodeLabels = map[string]string{} 865 for k, v := range node.Labels { 866 proxier.nodeLabels[k] = v 867 } 868 proxier.mu.Unlock() 869 klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels) 870 871 proxier.Sync() 872 } 873 874 // OnNodeDelete is called whenever deletion of an existing node 875 // object is observed. 876 func (proxier *Proxier) OnNodeDelete(node *v1.Node) { 877 if node.Name != proxier.hostname { 878 klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node", "eventNode", node.Name, "currentNode", proxier.hostname) 879 return 880 } 881 882 proxier.mu.Lock() 883 proxier.nodeLabels = nil 884 proxier.mu.Unlock() 885 886 proxier.Sync() 887 } 888 889 // OnNodeSynced is called once all the initial event handlers were 890 // called and the state is fully propagated to local cache. 
func (proxier *Proxier) OnNodeSynced() {
}

// syncProxyRules is where all of the ipvs calls happen.
//
// It holds proxier.mu for its whole duration and does nothing until
// isInitialized() reports true. One run applies the pending service and
// endpoint changes, rebuilds the ipset entries and IPVS virtual servers for
// every service port (cluster IP, external IPs, load-balancer VIPs, node
// ports), restores the generated iptables rules, unbinds addresses that are
// no longer wanted on the dummy device, cleans up legacy IPVS services and
// finally updates health checks, metrics and conntrack entries.
//
// It returns early if the dummy device cannot be ensured, if an ipset cannot
// be ensured, or if iptables-restore fails; in those cases the later steps
// (including the healthz update) are skipped.
func (proxier *Proxier) syncProxyRules() {
	proxier.mu.Lock()
	defer proxier.mu.Unlock()

	// don't sync rules till we've received services and endpoints
	if !proxier.isInitialized() {
		klog.V(2).InfoS("Not syncing ipvs rules until Services and Endpoints have been received from master")
		return
	}

	// it's safe to set initialSync to false as it acts as a flag for startup actions
	// and the mutex is held.
	defer func() {
		proxier.initialSync = false
	}()

	// Keep track of how long syncs take.
	start := time.Now()
	defer func() {
		metrics.SyncProxyRulesLatency.Observe(metrics.SinceInSeconds(start))
		klog.V(4).InfoS("syncProxyRules complete", "elapsed", time.Since(start))
	}()

	// We assume that if this was called, we really want to sync them,
	// even if nothing changed in the meantime. In other words, callers are
	// responsible for detecting no-op changes and not calling this function.
	serviceUpdateResult := proxier.svcPortMap.Update(proxier.serviceChanges)
	endpointUpdateResult := proxier.endpointsMap.Update(proxier.endpointsChanges)

	klog.V(3).InfoS("Syncing ipvs proxier rules")

	// Per-sync state: reset on every run.
	proxier.serviceNoLocalEndpointsInternal = sets.New[string]()
	proxier.serviceNoLocalEndpointsExternal = sets.New[string]()

	proxier.lbNoNodeAccessIPPortProtocolEntries = make([]*utilipset.Entry, 0)

	// Begin install iptables

	// Reset all buffers used later.
	// This is to avoid memory reallocations and thus improve performance.
	proxier.natChains.Reset()
	proxier.natRules.Reset()
	proxier.filterChains.Reset()
	proxier.filterRules.Reset()

	// Write table headers.
	proxier.filterChains.Write("*filter")
	proxier.natChains.Write("*nat")

	proxier.createAndLinkKubeChain()

	// make sure dummy interface exists in the system where ipvs Proxier will bind service address on it
	_, err := proxier.netlinkHandle.EnsureDummyDevice(defaultDummyDevice)
	if err != nil {
		klog.ErrorS(err, "Failed to create dummy interface", "interface", defaultDummyDevice)
		return
	}

	// make sure ip sets exist in the system.
	// NOTE(review): a failure aborts the sync without logging here; presumably
	// ensureIPSet logs the error itself - confirm.
	for _, set := range proxier.ipsetList {
		if err := ensureIPSet(set); err != nil {
			return
		}
		set.resetEntries()
	}

	// activeIPVSServices represents IPVS service successfully created in this round of sync
	activeIPVSServices := sets.New[string]()
	// activeBindAddrs represents addresses we want on the defaultDummyDevice after this round of sync
	activeBindAddrs := sets.New[string]()
	// alreadyBoundAddrs represents addresses currently assigned to the dummy interface.
	// A listing error is only logged; the sync continues with whatever was returned.
	alreadyBoundAddrs, err := proxier.netlinkHandle.GetLocalAddresses(defaultDummyDevice)
	if err != nil {
		klog.ErrorS(err, "Error listing addresses binded to dummy interface")
	}
	// nodeAddressSet holds all addresses *except* those on the dummy interface.
	// A listing error is only logged; the sync continues with whatever was returned.
	nodeAddressSet, err := proxier.netlinkHandle.GetAllLocalAddressesExcept(defaultDummyDevice)
	if err != nil {
		klog.ErrorS(err, "Error listing node addresses")
	}

	// Node IPs are only computed when at least one service uses a node port.
	hasNodePort := false
	for _, svc := range proxier.svcPortMap {
		svcInfo, ok := svc.(*servicePortInfo)
		if ok && svcInfo.NodePort() != 0 {
			hasNodePort = true
			break
		}
	}

	// List of node IP addresses to be used as IPVS services if nodePort is set. This
	// can be reused for all nodePort services.
	var nodeIPs []net.IP
	if hasNodePort {
		if proxier.nodePortAddresses.MatchAll() {
			for _, ipStr := range nodeAddressSet.UnsortedList() {
				nodeIPs = append(nodeIPs, netutils.ParseIPSloppy(ipStr))
			}
		} else {
			allNodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer)
			if err != nil {
				klog.ErrorS(err, "Failed to get node IP address matching nodeport cidr")
			} else {
				// Loopback addresses are left out in this branch.
				for _, ip := range allNodeIPs {
					if !ip.IsLoopback() {
						nodeIPs = append(nodeIPs, ip)
					}
				}
			}
		}
	}

	// Build IPVS rules for each service.
	// NOTE: a bare `continue` directly inside this loop skips ALL remaining
	// handling (external IPs, load balancer, node port, health check) for
	// the current service port.
	for svcPortName, svcPort := range proxier.svcPortMap {
		svcInfo, ok := svcPort.(*servicePortInfo)
		if !ok {
			klog.ErrorS(nil, "Failed to cast serviceInfo", "servicePortName", svcPortName)
			continue
		}

		protocol := strings.ToLower(string(svcInfo.Protocol()))
		// Precompute svcNameString; with many services the many calls
		// to ServicePortName.String() show up in CPU profiles.
		svcPortNameString := svcPortName.String()

		// Handle traffic that loops back to the originator with SNAT.
		// Only local endpoints are added to the loop-back ip set.
		for _, e := range proxier.endpointsMap[svcPortName] {
			ep, ok := e.(*proxy.BaseEndpointInfo)
			if !ok {
				klog.ErrorS(nil, "Failed to cast BaseEndpointInfo", "endpoint", e)
				continue
			}
			if !ep.IsLocal() {
				continue
			}
			epIP := ep.IP()
			epPort := ep.Port()
			// Error parsing this endpoint has been logged. Skip to next endpoint.
			if epIP == "" || epPort == 0 {
				continue
			}
			entry := &utilipset.Entry{
				IP:       epIP,
				Port:     epPort,
				Protocol: protocol,
				IP2:      epIP,
				SetType:  utilipset.HashIPPortIP,
			}
			if valid := proxier.ipsetList[kubeLoopBackIPSet].validateEntry(entry); !valid {
				klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoopBackIPSet].Name)
				continue
			}
			proxier.ipsetList[kubeLoopBackIPSet].activeEntries.Insert(entry.String())
		}

		// Capture the clusterIP.
		// ipset call
		// NOTE: this `entry` variable is reassigned (not redeclared) by the
		// load-balancer section further down.
		entry := &utilipset.Entry{
			IP:       svcInfo.ClusterIP().String(),
			Port:     svcInfo.Port(),
			Protocol: protocol,
			SetType:  utilipset.HashIPPort,
		}
		// add service Cluster IP:Port to kubeServiceAccess ip set for the purpose of solving hairpin.
		// proxier.kubeServiceAccessSet.activeEntries.Insert(entry.String())
		if valid := proxier.ipsetList[kubeClusterIPSet].validateEntry(entry); !valid {
			klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeClusterIPSet].Name)
			continue
		}
		proxier.ipsetList[kubeClusterIPSet].activeEntries.Insert(entry.String())
		// ipvs call
		serv := &utilipvs.VirtualServer{
			Address:   svcInfo.ClusterIP(),
			Port:      uint16(svcInfo.Port()),
			Protocol:  string(svcInfo.Protocol()),
			Scheduler: proxier.ipvsScheduler,
		}
		// Set session affinity flag and timeout for IPVS service
		if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
			serv.Flags |= utilipvs.FlagPersistent
			serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds())
		}
		// Set the source hash flag needed for the distribution method "mh"
		if proxier.ipvsScheduler == "mh" {
			serv.Flags |= utilipvs.FlagSourceHash
		}
		// We need to bind ClusterIP to dummy interface, so set `bindAddr` parameter to `true` in syncService()
		if err := proxier.syncService(svcPortNameString, serv, true, alreadyBoundAddrs); err == nil {
			activeIPVSServices.Insert(serv.String())
			activeBindAddrs.Insert(serv.Address.String())
			// ExternalTrafficPolicy only works for NodePort and external LB traffic, does not affect ClusterIP
			// So we still need clusterIP rules in onlyNodeLocalEndpoints mode.
			internalNodeLocal := false
			if svcInfo.InternalPolicyLocal() {
				internalNodeLocal = true
			}
			if err := proxier.syncEndpoint(svcPortName, internalNodeLocal, serv); err != nil {
				klog.ErrorS(err, "Failed to sync endpoint for service", "servicePortName", svcPortName, "virtualServer", serv)
			}
		} else {
			klog.ErrorS(err, "Failed to sync service", "servicePortName", svcPortName, "virtualServer", serv)
		}

		// Capture externalIPs.
		for _, externalIP := range svcInfo.ExternalIPStrings() {
			// ipset call
			entry := &utilipset.Entry{
				IP:       externalIP,
				Port:     svcInfo.Port(),
				Protocol: protocol,
				SetType:  utilipset.HashIPPort,
			}

			if svcInfo.ExternalPolicyLocal() {
				if valid := proxier.ipsetList[kubeExternalIPLocalSet].validateEntry(entry); !valid {
					klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeExternalIPLocalSet].Name)
					continue
				}
				proxier.ipsetList[kubeExternalIPLocalSet].activeEntries.Insert(entry.String())
			} else {
				// We have to SNAT packets to external IPs.
				if valid := proxier.ipsetList[kubeExternalIPSet].validateEntry(entry); !valid {
					klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeExternalIPSet].Name)
					continue
				}
				proxier.ipsetList[kubeExternalIPSet].activeEntries.Insert(entry.String())
			}

			// ipvs call
			serv := &utilipvs.VirtualServer{
				Address:   netutils.ParseIPSloppy(externalIP),
				Port:      uint16(svcInfo.Port()),
				Protocol:  string(svcInfo.Protocol()),
				Scheduler: proxier.ipvsScheduler,
			}
			if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
				serv.Flags |= utilipvs.FlagPersistent
				serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds())
			}
			// Set the source hash flag needed for the distribution method "mh"
			if proxier.ipvsScheduler == "mh" {
				serv.Flags |= utilipvs.FlagSourceHash
			}
			// We must not add the address to the dummy device if it exists on another interface
			shouldBind := !nodeAddressSet.Has(serv.Address.String())
			if err := proxier.syncService(svcPortNameString, serv, shouldBind, alreadyBoundAddrs); err == nil {
				activeIPVSServices.Insert(serv.String())
				if shouldBind {
					activeBindAddrs.Insert(serv.Address.String())
				}
				if err := proxier.syncEndpoint(svcPortName, svcInfo.ExternalPolicyLocal(), serv); err != nil {
					klog.ErrorS(err, "Failed to sync endpoint for service", "servicePortName", svcPortName, "virtualServer", serv)
				}
			} else {
				klog.ErrorS(err, "Failed to sync service", "servicePortName", svcPortName, "virtualServer", serv)
			}
		}

		// Capture load-balancer ingress.
		// NOTE: a `continue` directly inside this loop moves on to the next
		// ingress VIP, so no IPVS virtual server is synced for the current one.
		for _, ingress := range svcInfo.LoadBalancerVIPStrings() {
			// ipset call
			// (assigns to the `entry` declared in the clusterIP section above)
			entry = &utilipset.Entry{
				IP:       ingress,
				Port:     svcInfo.Port(),
				Protocol: protocol,
				SetType:  utilipset.HashIPPort,
			}
			// add service load balancer ingressIP:Port to kubeServiceAccess ip set for the purpose of solving hairpin.
			// proxier.kubeServiceAccessSet.activeEntries.Insert(entry.String())
			// If we are proxying globally, we need to masquerade in case we cross nodes.
			// If we are proxying only locally, we can retain the source IP.
			if valid := proxier.ipsetList[kubeLoadBalancerSet].validateEntry(entry); !valid {
				klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerSet].Name)
				continue
			}
			proxier.ipsetList[kubeLoadBalancerSet].activeEntries.Insert(entry.String())
			// insert loadbalancer entry to lbIngressLocalSet if service externaltrafficpolicy=local
			if svcInfo.ExternalPolicyLocal() {
				if valid := proxier.ipsetList[kubeLoadBalancerLocalSet].validateEntry(entry); !valid {
					klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerLocalSet].Name)
					continue
				}
				proxier.ipsetList[kubeLoadBalancerLocalSet].activeEntries.Insert(entry.String())
			}
			if len(svcInfo.LoadBalancerSourceRanges()) != 0 {
				// The service firewall rules are created based on ServiceSpec.loadBalancerSourceRanges field.
				// This currently works for loadbalancers that preserves source ips.
				// For loadbalancers which direct traffic to service NodePort, the firewall rules will not apply.
				if valid := proxier.ipsetList[kubeLoadBalancerFWSet].validateEntry(entry); !valid {
					klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerFWSet].Name)
					continue
				}
				proxier.ipsetList[kubeLoadBalancerFWSet].activeEntries.Insert(entry.String())
				allowFromNode := false
				for _, src := range svcInfo.LoadBalancerSourceRanges() {
					// ipset call
					entry = &utilipset.Entry{
						IP:       ingress,
						Port:     svcInfo.Port(),
						Protocol: protocol,
						Net:      src,
						SetType:  utilipset.HashIPPortNet,
					}
					// enumerate all white list source cidr
					// (this `continue` only skips the current source range)
					if valid := proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].validateEntry(entry); !valid {
						klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].Name)
						continue
					}
					proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].activeEntries.Insert(entry.String())

					// ignore error because it has been validated
					_, cidr, _ := netutils.ParseCIDRSloppy(src)
					if cidr.Contains(proxier.nodeIP) {
						allowFromNode = true
					}
				}
				// generally, ip route rule was added to intercept request to loadbalancer vip from the
				// loadbalancer's backend hosts. In this case, request will not hit the loadbalancer but loop back directly.
				// Need to add the following rule to allow request on host.
				if allowFromNode {
					entry = &utilipset.Entry{
						IP:       ingress,
						Port:     svcInfo.Port(),
						Protocol: protocol,
						IP2:      ingress,
						SetType:  utilipset.HashIPPortIP,
					}
					// enumerate all white list source ip
					if valid := proxier.ipsetList[kubeLoadBalancerSourceIPSet].validateEntry(entry); !valid {
						klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerSourceIPSet].Name)
						continue
					}
					proxier.ipsetList[kubeLoadBalancerSourceIPSet].activeEntries.Insert(entry.String())
				} else {
					// since nodeIP is not covered in any of SourceRange we need to explicitly block the lbIP access from k8s nodes.
					// At this point `entry` is the one built for the last source
					// range above; it carries this ingress IP, port and protocol.
					proxier.lbNoNodeAccessIPPortProtocolEntries = append(proxier.lbNoNodeAccessIPPortProtocolEntries, entry)

				}
			}
			// ipvs call
			serv := &utilipvs.VirtualServer{
				Address:   netutils.ParseIPSloppy(ingress),
				Port:      uint16(svcInfo.Port()),
				Protocol:  string(svcInfo.Protocol()),
				Scheduler: proxier.ipvsScheduler,
			}
			if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
				serv.Flags |= utilipvs.FlagPersistent
				serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds())
			}
			// Set the source hash flag needed for the distribution method "mh"
			if proxier.ipvsScheduler == "mh" {
				serv.Flags |= utilipvs.FlagSourceHash
			}
			// We must not add the address to the dummy device if it exists on another interface
			shouldBind := !nodeAddressSet.Has(serv.Address.String())
			if err := proxier.syncService(svcPortNameString, serv, shouldBind, alreadyBoundAddrs); err == nil {
				activeIPVSServices.Insert(serv.String())
				if shouldBind {
					activeBindAddrs.Insert(serv.Address.String())
				}
				if err := proxier.syncEndpoint(svcPortName, svcInfo.ExternalPolicyLocal(), serv); err != nil {
					klog.ErrorS(err, "Failed to sync endpoint for service", "servicePortName", svcPortName, "virtualServer", serv)
				}
			} else {
				klog.ErrorS(err, "Failed to sync service", "servicePortName", svcPortName, "virtualServer", serv)
			}
		}

		if svcInfo.NodePort() != 0 {
			if len(nodeIPs) == 0 {
				// Skip nodePort configuration since an error occurred when
				// computing nodeAddresses or nodeIPs.
				// NOTE(review): this also skips the HealthCheckNodePort
				// handling below for this service port.
				continue
			}

			// Nodeports need SNAT, unless they're local.
			// ipset call

			var (
				nodePortSet *IPSet
				entries     []*utilipset.Entry
			)

			// TCP and UDP use a single port-only entry; SCTP needs one
			// ip:port entry per node IP.
			switch protocol {
			case utilipset.ProtocolTCP:
				nodePortSet = proxier.ipsetList[kubeNodePortSetTCP]
				entries = []*utilipset.Entry{{
					// No need to provide ip info
					Port:     svcInfo.NodePort(),
					Protocol: protocol,
					SetType:  utilipset.BitmapPort,
				}}
			case utilipset.ProtocolUDP:
				nodePortSet = proxier.ipsetList[kubeNodePortSetUDP]
				entries = []*utilipset.Entry{{
					// No need to provide ip info
					Port:     svcInfo.NodePort(),
					Protocol: protocol,
					SetType:  utilipset.BitmapPort,
				}}
			case utilipset.ProtocolSCTP:
				nodePortSet = proxier.ipsetList[kubeNodePortSetSCTP]
				// Since hash ip:port is used for SCTP, all the nodeIPs to be used in the SCTP ipset entries.
				entries = []*utilipset.Entry{}
				for _, nodeIP := range nodeIPs {
					entries = append(entries, &utilipset.Entry{
						IP:       nodeIP.String(),
						Port:     svcInfo.NodePort(),
						Protocol: protocol,
						SetType:  utilipset.HashIPPort,
					})
				}
			default:
				// It should never hit
				klog.ErrorS(nil, "Unsupported protocol type", "protocol", protocol)
			}
			if nodePortSet != nil {
				entryInvalidErr := false
				for _, entry := range entries {
					if valid := nodePortSet.validateEntry(entry); !valid {
						klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", nodePortSet.Name)
						entryInvalidErr = true
						break
					}
					nodePortSet.activeEntries.Insert(entry.String())
				}
				// One invalid entry abandons the rest of this service port.
				if entryInvalidErr {
					continue
				}
			}

			// Add externaltrafficpolicy=local type nodeport entry
			if svcInfo.ExternalPolicyLocal() {
				var nodePortLocalSet *IPSet
				switch protocol {
				case utilipset.ProtocolTCP:
					nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetTCP]
				case utilipset.ProtocolUDP:
					nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetUDP]
				case utilipset.ProtocolSCTP:
					nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetSCTP]
				default:
					// It should never hit
					klog.ErrorS(nil, "Unsupported protocol type", "protocol", protocol)
				}
				if nodePortLocalSet != nil {
					entryInvalidErr := false
					for _, entry := range entries {
						if valid := nodePortLocalSet.validateEntry(entry); !valid {
							klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", nodePortLocalSet.Name)
							entryInvalidErr = true
							break
						}
						nodePortLocalSet.activeEntries.Insert(entry.String())
					}
					if entryInvalidErr {
						continue
					}
				}
			}

			// Build ipvs kernel routes for each node ip address
			for _, nodeIP := range nodeIPs {
				// ipvs call
				serv := &utilipvs.VirtualServer{
					Address:   nodeIP,
					Port:      uint16(svcInfo.NodePort()),
					Protocol:  string(svcInfo.Protocol()),
					Scheduler: proxier.ipvsScheduler,
				}
				if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
					serv.Flags |= utilipvs.FlagPersistent
					serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds())
				}
				// Set the source hash flag needed for the distribution method "mh"
				if proxier.ipvsScheduler == "mh" {
					serv.Flags |= utilipvs.FlagSourceHash
				}
				// There is no need to bind Node IP to dummy interface, so set parameter `bindAddr` to `false`.
				if err := proxier.syncService(svcPortNameString, serv, false, alreadyBoundAddrs); err == nil {
					activeIPVSServices.Insert(serv.String())
					if err := proxier.syncEndpoint(svcPortName, svcInfo.ExternalPolicyLocal(), serv); err != nil {
						klog.ErrorS(err, "Failed to sync endpoint for service", "servicePortName", svcPortName, "virtualServer", serv)
					}
				} else {
					klog.ErrorS(err, "Failed to sync service", "servicePortName", svcPortName, "virtualServer", serv)
				}
			}
		}

		if svcInfo.HealthCheckNodePort() != 0 {
			nodePortSet := proxier.ipsetList[kubeHealthCheckNodePortSet]
			entry := &utilipset.Entry{
				// No need to provide ip info
				Port:     svcInfo.HealthCheckNodePort(),
				Protocol: "tcp",
				SetType:  utilipset.BitmapPort,
			}

			if valid := nodePortSet.validateEntry(entry); !valid {
				klog.ErrorS(nil, "Error adding entry to ipset", "entry", entry, "ipset", nodePortSet.Name)
				continue
			}
			nodePortSet.activeEntries.Insert(entry.String())
		}
	}

	// Set the KUBE-IPVS-IPS set to the "activeBindAddrs"
	proxier.ipsetList[kubeIPVSSet].activeEntries = activeBindAddrs

	// sync ipset entries
	for _, set := range proxier.ipsetList {
		set.syncIPSetEntries()
	}

	// Tail call iptables rules for ipset, make sure only call iptables once
	// in a single loop per ip set.
	proxier.writeIptablesRules()

	// Sync iptables rules.
	// NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table.
	proxier.iptablesData.Reset()
	proxier.iptablesData.Write(proxier.natChains.Bytes())
	proxier.iptablesData.Write(proxier.natRules.Bytes())
	proxier.iptablesData.Write(proxier.filterChains.Bytes())
	proxier.iptablesData.Write(proxier.filterRules.Bytes())

	klog.V(5).InfoS("Restoring iptables", "rules", proxier.iptablesData.Bytes())
	err = proxier.iptables.RestoreAll(proxier.iptablesData.Bytes(), utiliptables.NoFlushTables, utiliptables.RestoreCounters)
	if err != nil {
		// For parse errors only the lines around the failing line are logged
		// instead of the full rule set.
		if pErr, ok := err.(utiliptables.ParseError); ok {
			lines := utiliptables.ExtractLines(proxier.iptablesData.Bytes(), pErr.Line(), 3)
			klog.ErrorS(pErr, "Failed to execute iptables-restore", "rules", lines)
		} else {
			klog.ErrorS(err, "Failed to execute iptables-restore", "rules", proxier.iptablesData.Bytes())
		}
		metrics.IptablesRestoreFailuresTotal.Inc()
		return
	}
	// The rules are programmed: record how long each endpoint change took
	// from its trigger time until now.
	for name, lastChangeTriggerTimes := range endpointUpdateResult.LastChangeTriggerTimes {
		for _, lastChangeTriggerTime := range lastChangeTriggerTimes {
			latency := metrics.SinceInSeconds(lastChangeTriggerTime)
			metrics.NetworkProgrammingLatency.Observe(latency)
			klog.V(4).InfoS("Network programming", "endpoint", klog.KRef(name.Namespace, name.Name), "elapsed", latency)
		}
	}

	// Remove superfluous addresses from the dummy device
	superfluousAddresses := alreadyBoundAddrs.Difference(activeBindAddrs)
	if superfluousAddresses.Len() > 0 {
		klog.V(2).InfoS("Removing addresses", "interface", defaultDummyDevice, "addresses", superfluousAddresses)
		for adr := range superfluousAddresses {
			if err := proxier.netlinkHandle.UnbindAddress(adr, defaultDummyDevice); err != nil {
				klog.ErrorS(err, "UnbindAddress", "interface", defaultDummyDevice, "address", adr)
			}
		}
	}

	// currentIPVSServices represent IPVS services listed from the system
	// (including any we have created in this sync)
	currentIPVSServices := make(map[string]*utilipvs.VirtualServer)
	appliedSvcs, err := proxier.ipvs.GetVirtualServers()
	if err == nil {
		for _, appliedSvc := range appliedSvcs {
			currentIPVSServices[appliedSvc.String()] = appliedSvc
		}
	} else {
		klog.ErrorS(err, "Failed to get ipvs service")
	}
	// On a listing error this is called with an empty currentIPVSServices map.
	proxier.cleanLegacyService(activeIPVSServices, currentIPVSServices)

	if proxier.healthzServer != nil {
		proxier.healthzServer.Updated(proxier.ipFamily)
	}
	metrics.SyncProxyRulesLastTimestamp.SetToCurrentTime()

	// Update service healthchecks. The endpoints list might include services that are
	// not "OnlyLocal", but the services list will not, and the serviceHealthServer
	// will just drop those endpoints.
	if err := proxier.serviceHealthServer.SyncServices(proxier.svcPortMap.HealthCheckNodePorts()); err != nil {
		klog.ErrorS(err, "Error syncing healthcheck services")
	}
	if err := proxier.serviceHealthServer.SyncEndpoints(proxier.endpointsMap.LocalReadyEndpoints()); err != nil {
		klog.ErrorS(err, "Error syncing healthcheck endpoints")
	}

	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("internal").Set(float64(proxier.serviceNoLocalEndpointsInternal.Len()))
	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("external").Set(float64(proxier.serviceNoLocalEndpointsExternal.Len()))

	// Finish housekeeping, clear stale conntrack entries for UDP Services
	conntrack.CleanStaleEntries(proxier.ipFamily == v1.IPv6Protocol, proxier.exec, proxier.svcPortMap, serviceUpdateResult, endpointUpdateResult)
}

// writeIptablesRules write all iptables rules to proxier.natRules or proxier.FilterRules that ipvs proxier needed
// according to proxier.ipsetList information and the
ipset match relationship that `ipsetWithIptablesChain` specified. 1491 // some ipset(kubeClusterIPSet for example) have particular match rules and iptables jump relation should be sync separately. 1492 func (proxier *Proxier) writeIptablesRules() { 1493 1494 // Dismiss connects to localhost early in the service chain 1495 loAddr := "127.0.0.0/8" 1496 if proxier.ipFamily == v1.IPv6Protocol { 1497 loAddr = "::1/128" 1498 } 1499 proxier.natRules.Write("-A", string(kubeServicesChain), "-s", loAddr, "-j", "RETURN") 1500 1501 // We are creating those slices ones here to avoid memory reallocations 1502 // in every loop. Note that reuse the memory, instead of doing: 1503 // slice = <some new slice> 1504 // you should always do one of the below: 1505 // slice = slice[:0] // and then append to it 1506 // slice = append(slice[:0], ...) 1507 // To avoid growing this slice, we arbitrarily set its size to 64, 1508 // there is never more than that many arguments for a single line. 1509 // Note that even if we go over 64, it will still be correct - it 1510 // is just for efficiency, not correctness. 
1511 args := make([]string, 64) 1512 1513 for _, set := range ipsetWithIptablesChain { 1514 if _, find := proxier.ipsetList[set.name]; find && !proxier.ipsetList[set.name].isEmpty() { 1515 args = append(args[:0], "-A", set.from) 1516 if set.protocolMatch != "" { 1517 args = append(args, "-p", set.protocolMatch) 1518 } 1519 args = append(args, 1520 "-m", "comment", "--comment", proxier.ipsetList[set.name].getComment(), 1521 "-m", "set", "--match-set", proxier.ipsetList[set.name].Name, 1522 set.matchType, 1523 ) 1524 if set.table == utiliptables.TableFilter { 1525 proxier.filterRules.Write(args, "-j", set.to) 1526 } else { 1527 proxier.natRules.Write(args, "-j", set.to) 1528 } 1529 } 1530 } 1531 1532 if !proxier.ipsetList[kubeClusterIPSet].isEmpty() { 1533 args = append(args[:0], 1534 "-A", string(kubeServicesChain), 1535 "-m", "comment", "--comment", proxier.ipsetList[kubeClusterIPSet].getComment(), 1536 "-m", "set", "--match-set", proxier.ipsetList[kubeClusterIPSet].Name, 1537 ) 1538 if proxier.masqueradeAll { 1539 proxier.natRules.Write( 1540 args, "dst,dst", 1541 "-j", string(kubeMarkMasqChain)) 1542 } else if proxier.localDetector.IsImplemented() { 1543 // This masquerades off-cluster traffic to a service VIP. The idea 1544 // is that you can establish a static route for your Service range, 1545 // routing to any node, and that node will bridge into the Service 1546 // for you. Since that might bounce off-node, we masquerade here. 1547 // If/when we support "Local" policy for VIPs, we should update this. 1548 proxier.natRules.Write( 1549 args, "dst,dst", 1550 proxier.localDetector.IfNotLocal(), 1551 "-j", string(kubeMarkMasqChain)) 1552 } else { 1553 // Masquerade all OUTPUT traffic coming from a service ip. 1554 // The kube dummy interface has all service VIPs assigned which 1555 // results in the service VIP being picked as the source IP to reach 1556 // a VIP. This leads to a connection from VIP:<random port> to 1557 // VIP:<service port>. 
1558 // Always masquerading OUTPUT (node-originating) traffic with a VIP 1559 // source ip and service port destination fixes the outgoing connections. 1560 proxier.natRules.Write( 1561 args, "src,dst", 1562 "-j", string(kubeMarkMasqChain)) 1563 } 1564 } 1565 1566 // externalIPRules adds iptables rules applies to Service ExternalIPs 1567 externalIPRules := func(args []string) { 1568 // Allow traffic for external IPs that does not come from a bridge (i.e. not from a container) 1569 // nor from a local process to be forwarded to the service. 1570 // This rule roughly translates to "all traffic from off-machine". 1571 // This is imperfect in the face of network plugins that might not use a bridge, but we can revisit that later. 1572 externalTrafficOnlyArgs := append(args, 1573 "-m", "physdev", "!", "--physdev-is-in", 1574 "-m", "addrtype", "!", "--src-type", "LOCAL") 1575 proxier.natRules.Write(externalTrafficOnlyArgs, "-j", "ACCEPT") 1576 dstLocalOnlyArgs := append(args, "-m", "addrtype", "--dst-type", "LOCAL") 1577 // Allow traffic bound for external IPs that happen to be recognized as local IPs to stay local. 1578 // This covers cases like GCE load-balancers which get added to the local routing table. 1579 proxier.natRules.Write(dstLocalOnlyArgs, "-j", "ACCEPT") 1580 } 1581 1582 if !proxier.ipsetList[kubeExternalIPSet].isEmpty() { 1583 // Build masquerade rules for packets to external IPs. 
1584 args = append(args[:0], 1585 "-A", string(kubeServicesChain), 1586 "-m", "comment", "--comment", proxier.ipsetList[kubeExternalIPSet].getComment(), 1587 "-m", "set", "--match-set", proxier.ipsetList[kubeExternalIPSet].Name, 1588 "dst,dst", 1589 ) 1590 proxier.natRules.Write(args, "-j", string(kubeMarkMasqChain)) 1591 externalIPRules(args) 1592 } 1593 1594 if !proxier.ipsetList[kubeExternalIPLocalSet].isEmpty() { 1595 args = append(args[:0], 1596 "-A", string(kubeServicesChain), 1597 "-m", "comment", "--comment", proxier.ipsetList[kubeExternalIPLocalSet].getComment(), 1598 "-m", "set", "--match-set", proxier.ipsetList[kubeExternalIPLocalSet].Name, 1599 "dst,dst", 1600 ) 1601 externalIPRules(args) 1602 } 1603 1604 // -A KUBE-SERVICES -m addrtype --dst-type LOCAL -j KUBE-NODE-PORT 1605 args = append(args[:0], 1606 "-A", string(kubeServicesChain), 1607 "-m", "addrtype", "--dst-type", "LOCAL", 1608 ) 1609 proxier.natRules.Write(args, "-j", string(kubeNodePortChain)) 1610 1611 // mark for masquerading for KUBE-LOAD-BALANCER 1612 proxier.natRules.Write( 1613 "-A", string(kubeLoadBalancerChain), 1614 "-j", string(kubeMarkMasqChain), 1615 ) 1616 1617 // drop packets filtered by KUBE-SOURCE-RANGES-FIREWALL 1618 proxier.filterRules.Write( 1619 "-A", string(kubeSourceRangesFirewallChain), 1620 "-j", "DROP", 1621 ) 1622 1623 // disable LB access from node 1624 // for IPVS src and dst both would be lbIP 1625 for _, entry := range proxier.lbNoNodeAccessIPPortProtocolEntries { 1626 proxier.filterRules.Write( 1627 "-A", string(kubeIPVSOutFilterChain), 1628 "-s", entry.IP, 1629 "-m", "ipvs", "--vaddr", entry.IP, "--vproto", entry.Protocol, "--vport", strconv.Itoa(entry.Port), 1630 "-j", "DROP", 1631 ) 1632 } 1633 1634 // Accept all traffic with destination of ipvs virtual service, in case other iptables rules 1635 // block the traffic, that may result in ipvs rules invalid. 
1636 // Those rules must be in the end of KUBE-SERVICE chain 1637 proxier.acceptIPVSTraffic() 1638 1639 // If the masqueradeMark has been added then we want to forward that same 1640 // traffic, this allows NodePort traffic to be forwarded even if the default 1641 // FORWARD policy is not accept. 1642 proxier.filterRules.Write( 1643 "-A", string(kubeForwardChain), 1644 "-m", "comment", "--comment", `"kubernetes forwarding rules"`, 1645 "-m", "mark", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark), 1646 "-j", "ACCEPT", 1647 ) 1648 1649 // The following rule ensures the traffic after the initial packet accepted 1650 // by the "kubernetes forwarding rules" rule above will be accepted. 1651 proxier.filterRules.Write( 1652 "-A", string(kubeForwardChain), 1653 "-m", "comment", "--comment", `"kubernetes forwarding conntrack rule"`, 1654 "-m", "conntrack", 1655 "--ctstate", "RELATED,ESTABLISHED", 1656 "-j", "ACCEPT", 1657 ) 1658 1659 // Add rule to accept traffic towards health check node port 1660 proxier.filterRules.Write( 1661 "-A", string(kubeNodePortChain), 1662 "-m", "comment", "--comment", proxier.ipsetList[kubeHealthCheckNodePortSet].getComment(), 1663 "-m", "set", "--match-set", proxier.ipsetList[kubeHealthCheckNodePortSet].Name, "dst", 1664 "-j", "ACCEPT", 1665 ) 1666 1667 // Add rules to the filter/KUBE-IPVS-FILTER chain to prevent access to ports on the host through VIP addresses. 
1668 // https://github.com/kubernetes/kubernetes/issues/72236 1669 proxier.filterRules.Write( 1670 "-A", string(kubeIPVSFilterChain), 1671 "-m", "set", "--match-set", proxier.ipsetList[kubeLoadBalancerSet].Name, "dst,dst", "-j", "RETURN") 1672 proxier.filterRules.Write( 1673 "-A", string(kubeIPVSFilterChain), 1674 "-m", "set", "--match-set", proxier.ipsetList[kubeClusterIPSet].Name, "dst,dst", "-j", "RETURN") 1675 proxier.filterRules.Write( 1676 "-A", string(kubeIPVSFilterChain), 1677 "-m", "set", "--match-set", proxier.ipsetList[kubeExternalIPSet].Name, "dst,dst", "-j", "RETURN") 1678 proxier.filterRules.Write( 1679 "-A", string(kubeIPVSFilterChain), 1680 "-m", "set", "--match-set", proxier.ipsetList[kubeExternalIPLocalSet].Name, "dst,dst", "-j", "RETURN") 1681 proxier.filterRules.Write( 1682 "-A", string(kubeIPVSFilterChain), 1683 "-m", "set", "--match-set", proxier.ipsetList[kubeHealthCheckNodePortSet].Name, "dst", "-j", "RETURN") 1684 proxier.filterRules.Write( 1685 "-A", string(kubeIPVSFilterChain), 1686 "-m", "conntrack", "--ctstate", "NEW", 1687 "-m", "set", "--match-set", proxier.ipsetList[kubeIPVSSet].Name, "dst", "-j", "REJECT") 1688 1689 // Install the kubernetes-specific postrouting rules. We use a whole chain for 1690 // this so that it is easier to flush and change, for example if the mark 1691 // value should ever change. 1692 1693 proxier.natRules.Write( 1694 "-A", string(kubePostroutingChain), 1695 "-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark), 1696 "-j", "RETURN", 1697 ) 1698 // Clear the mark to avoid re-masquerading if the packet re-traverses the network stack. 
1699 proxier.natRules.Write( 1700 "-A", string(kubePostroutingChain), 1701 // XOR proxier.masqueradeMark to unset it 1702 "-j", "MARK", "--xor-mark", proxier.masqueradeMark, 1703 ) 1704 masqRule := []string{ 1705 "-A", string(kubePostroutingChain), 1706 "-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`, 1707 "-j", "MASQUERADE", 1708 } 1709 if proxier.iptables.HasRandomFully() { 1710 masqRule = append(masqRule, "--random-fully") 1711 } 1712 proxier.natRules.Write(masqRule) 1713 1714 // Install the kubernetes-specific masquerade mark rule. We use a whole chain for 1715 // this so that it is easier to flush and change, for example if the mark 1716 // value should ever change. 1717 proxier.natRules.Write( 1718 "-A", string(kubeMarkMasqChain), 1719 "-j", "MARK", "--or-mark", proxier.masqueradeMark, 1720 ) 1721 1722 // Write the end-of-table markers. 1723 proxier.filterRules.Write("COMMIT") 1724 proxier.natRules.Write("COMMIT") 1725 } 1726 1727 func (proxier *Proxier) acceptIPVSTraffic() { 1728 sets := []string{kubeClusterIPSet, kubeLoadBalancerSet} 1729 for _, set := range sets { 1730 var matchType string 1731 if !proxier.ipsetList[set].isEmpty() { 1732 switch proxier.ipsetList[set].SetType { 1733 case utilipset.BitmapPort: 1734 matchType = "dst" 1735 default: 1736 matchType = "dst,dst" 1737 } 1738 proxier.natRules.Write( 1739 "-A", string(kubeServicesChain), 1740 "-m", "set", "--match-set", proxier.ipsetList[set].Name, matchType, 1741 "-j", "ACCEPT", 1742 ) 1743 } 1744 } 1745 } 1746 1747 // createAndLinkKubeChain create all kube chains that ipvs proxier need and write basic link. 
1748 func (proxier *Proxier) createAndLinkKubeChain() { 1749 for _, ch := range iptablesChains { 1750 if _, err := proxier.iptables.EnsureChain(ch.table, ch.chain); err != nil { 1751 klog.ErrorS(err, "Failed to ensure chain exists", "table", ch.table, "chain", ch.chain) 1752 return 1753 } 1754 if ch.table == utiliptables.TableNAT { 1755 proxier.natChains.Write(utiliptables.MakeChainLine(ch.chain)) 1756 } else { 1757 proxier.filterChains.Write(utiliptables.MakeChainLine(ch.chain)) 1758 } 1759 } 1760 1761 for _, jc := range iptablesJumpChain { 1762 args := []string{"-m", "comment", "--comment", jc.comment, "-j", string(jc.to)} 1763 if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, jc.table, jc.from, args...); err != nil { 1764 klog.ErrorS(err, "Failed to ensure chain jumps", "table", jc.table, "srcChain", jc.from, "dstChain", jc.to) 1765 } 1766 } 1767 1768 } 1769 1770 func (proxier *Proxier) syncService(svcName string, vs *utilipvs.VirtualServer, bindAddr bool, alreadyBoundAddrs sets.Set[string]) error { 1771 appliedVirtualServer, _ := proxier.ipvs.GetVirtualServer(vs) 1772 if appliedVirtualServer == nil || !appliedVirtualServer.Equal(vs) { 1773 if appliedVirtualServer == nil { 1774 // IPVS service is not found, create a new service 1775 klog.V(3).InfoS("Adding new service", "serviceName", svcName, "virtualServer", vs) 1776 if err := proxier.ipvs.AddVirtualServer(vs); err != nil { 1777 klog.ErrorS(err, "Failed to add IPVS service", "serviceName", svcName) 1778 return err 1779 } 1780 } else { 1781 // IPVS service was changed, update the existing one 1782 // During updates, service VIP will not go down 1783 klog.V(3).InfoS("IPVS service was changed", "serviceName", svcName) 1784 if err := proxier.ipvs.UpdateVirtualServer(vs); err != nil { 1785 klog.ErrorS(err, "Failed to update IPVS service") 1786 return err 1787 } 1788 } 1789 } 1790 1791 // bind service address to dummy interface 1792 if bindAddr { 1793 // always attempt to bind if alreadyBoundAddrs is 
nil, 1794 // otherwise check if it's already binded and return early 1795 if alreadyBoundAddrs != nil && alreadyBoundAddrs.Has(vs.Address.String()) { 1796 return nil 1797 } 1798 1799 klog.V(4).InfoS("Bind address", "address", vs.Address) 1800 _, err := proxier.netlinkHandle.EnsureAddressBind(vs.Address.String(), defaultDummyDevice) 1801 if err != nil { 1802 klog.ErrorS(err, "Failed to bind service address to dummy device", "serviceName", svcName) 1803 return err 1804 } 1805 } 1806 1807 return nil 1808 } 1809 1810 func (proxier *Proxier) syncEndpoint(svcPortName proxy.ServicePortName, onlyNodeLocalEndpoints bool, vs *utilipvs.VirtualServer) error { 1811 appliedVirtualServer, err := proxier.ipvs.GetVirtualServer(vs) 1812 if err != nil { 1813 klog.ErrorS(err, "Failed to get IPVS service") 1814 return err 1815 } 1816 if appliedVirtualServer == nil { 1817 return errors.New("IPVS virtual service does not exist") 1818 } 1819 1820 // curEndpoints represents IPVS destinations listed from current system. 1821 curEndpoints := sets.New[string]() 1822 curDests, err := proxier.ipvs.GetRealServers(appliedVirtualServer) 1823 if err != nil { 1824 klog.ErrorS(err, "Failed to list IPVS destinations") 1825 return err 1826 } 1827 for _, des := range curDests { 1828 curEndpoints.Insert(des.String()) 1829 } 1830 1831 endpoints := proxier.endpointsMap[svcPortName] 1832 1833 // Filtering for topology aware endpoints. This function will only 1834 // filter endpoints if appropriate feature gates are enabled and the 1835 // Service does not have conflicting configuration such as 1836 // externalTrafficPolicy=Local. 
1837 svcInfo, ok := proxier.svcPortMap[svcPortName] 1838 if !ok { 1839 klog.InfoS("Unable to filter endpoints due to missing service info", "servicePortName", svcPortName) 1840 } else { 1841 clusterEndpoints, localEndpoints, _, hasAnyEndpoints := proxy.CategorizeEndpoints(endpoints, svcInfo, proxier.nodeLabels) 1842 if onlyNodeLocalEndpoints { 1843 if len(localEndpoints) > 0 { 1844 endpoints = localEndpoints 1845 } else { 1846 // https://github.com/kubernetes/kubernetes/pull/97081 1847 // Allow access from local PODs even if no local endpoints exist. 1848 // Traffic from an external source will be routed but the reply 1849 // will have the POD address and will be discarded. 1850 endpoints = clusterEndpoints 1851 1852 if hasAnyEndpoints && svcInfo.InternalPolicyLocal() { 1853 proxier.serviceNoLocalEndpointsInternal.Insert(svcPortName.NamespacedName.String()) 1854 } 1855 1856 if hasAnyEndpoints && svcInfo.ExternalPolicyLocal() { 1857 proxier.serviceNoLocalEndpointsExternal.Insert(svcPortName.NamespacedName.String()) 1858 } 1859 } 1860 } else { 1861 endpoints = clusterEndpoints 1862 } 1863 } 1864 1865 newEndpoints := sets.New[string]() 1866 for _, epInfo := range endpoints { 1867 newEndpoints.Insert(epInfo.String()) 1868 } 1869 1870 // Create new endpoints 1871 for _, ep := range sets.List(newEndpoints) { 1872 ip, port, err := net.SplitHostPort(ep) 1873 if err != nil { 1874 klog.ErrorS(err, "Failed to parse endpoint", "endpoint", ep) 1875 continue 1876 } 1877 portNum, err := strconv.Atoi(port) 1878 if err != nil { 1879 klog.ErrorS(err, "Failed to parse endpoint port", "port", port) 1880 continue 1881 } 1882 1883 newDest := &utilipvs.RealServer{ 1884 Address: netutils.ParseIPSloppy(ip), 1885 Port: uint16(portNum), 1886 Weight: 1, 1887 } 1888 1889 if curEndpoints.Has(ep) { 1890 // if we are syncing for the first time, loop through all current destinations and 1891 // reset their weight. 
1892 if proxier.initialSync { 1893 for _, dest := range curDests { 1894 if dest.Weight != newDest.Weight { 1895 err = proxier.ipvs.UpdateRealServer(appliedVirtualServer, newDest) 1896 if err != nil { 1897 klog.ErrorS(err, "Failed to update destination", "newDest", newDest) 1898 continue 1899 } 1900 } 1901 } 1902 } 1903 // check if newEndpoint is in gracefulDelete list, if true, delete this ep immediately 1904 uniqueRS := GetUniqueRSName(vs, newDest) 1905 if !proxier.gracefuldeleteManager.InTerminationList(uniqueRS) { 1906 continue 1907 } 1908 klog.V(5).InfoS("new ep is in graceful delete list", "uniqueRealServer", uniqueRS) 1909 err := proxier.gracefuldeleteManager.MoveRSOutofGracefulDeleteList(uniqueRS) 1910 if err != nil { 1911 klog.ErrorS(err, "Failed to delete endpoint in gracefulDeleteQueue", "endpoint", ep) 1912 continue 1913 } 1914 } 1915 err = proxier.ipvs.AddRealServer(appliedVirtualServer, newDest) 1916 if err != nil { 1917 klog.ErrorS(err, "Failed to add destination", "newDest", newDest) 1918 continue 1919 } 1920 } 1921 1922 // Delete old endpoints 1923 for _, ep := range curEndpoints.Difference(newEndpoints).UnsortedList() { 1924 // if curEndpoint is in gracefulDelete, skip 1925 uniqueRS := vs.String() + "/" + ep 1926 if proxier.gracefuldeleteManager.InTerminationList(uniqueRS) { 1927 continue 1928 } 1929 ip, port, err := net.SplitHostPort(ep) 1930 if err != nil { 1931 klog.ErrorS(err, "Failed to parse endpoint", "endpoint", ep) 1932 continue 1933 } 1934 portNum, err := strconv.Atoi(port) 1935 if err != nil { 1936 klog.ErrorS(err, "Failed to parse endpoint port", "port", port) 1937 continue 1938 } 1939 1940 delDest := &utilipvs.RealServer{ 1941 Address: netutils.ParseIPSloppy(ip), 1942 Port: uint16(portNum), 1943 } 1944 1945 klog.V(5).InfoS("Using graceful delete", "uniqueRealServer", uniqueRS) 1946 err = proxier.gracefuldeleteManager.GracefulDeleteRS(appliedVirtualServer, delDest) 1947 if err != nil { 1948 klog.ErrorS(err, "Failed to delete 
destination", "uniqueRealServer", uniqueRS) 1949 continue 1950 } 1951 } 1952 return nil 1953 } 1954 1955 func (proxier *Proxier) cleanLegacyService(activeServices sets.Set[string], currentServices map[string]*utilipvs.VirtualServer) { 1956 for cs, svc := range currentServices { 1957 if proxier.isIPInExcludeCIDRs(svc.Address) { 1958 continue 1959 } 1960 if getIPFamily(svc.Address) != proxier.ipFamily { 1961 // Not our family 1962 continue 1963 } 1964 if !activeServices.Has(cs) { 1965 klog.V(4).InfoS("Delete service", "virtualServer", svc) 1966 if err := proxier.ipvs.DeleteVirtualServer(svc); err != nil { 1967 klog.ErrorS(err, "Failed to delete service", "virtualServer", svc) 1968 } 1969 } 1970 } 1971 } 1972 1973 func (proxier *Proxier) isIPInExcludeCIDRs(ip net.IP) bool { 1974 // make sure it does not fall within an excluded CIDR range. 1975 for _, excludedCIDR := range proxier.excludeCIDRs { 1976 if excludedCIDR.Contains(ip) { 1977 return true 1978 } 1979 } 1980 return false 1981 } 1982 1983 func getIPFamily(ip net.IP) v1.IPFamily { 1984 if netutils.IsIPv4(ip) { 1985 return v1.IPv4Protocol 1986 } 1987 return v1.IPv6Protocol 1988 } 1989 1990 // ipvs Proxier fall back on iptables when it needs to do SNAT for engress packets 1991 // It will only operate iptables *nat table. 1992 // Create and link the kube postrouting chain for SNAT packets. 
1993 // Chain POSTROUTING (policy ACCEPT) 1994 // target prot opt source destination 1995 // KUBE-POSTROUTING all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules * 1996 // Maintain by kubelet network sync loop 1997 1998 // *nat 1999 // :KUBE-POSTROUTING - [0:0] 2000 // Chain KUBE-POSTROUTING (1 references) 2001 // target prot opt source destination 2002 // MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service traffic requiring SNAT */ mark match 0x4000/0x4000 2003 2004 // :KUBE-MARK-MASQ - [0:0] 2005 // Chain KUBE-MARK-MASQ (0 references) 2006 // target prot opt source destination 2007 // MARK all -- 0.0.0.0/0 0.0.0.0/0 MARK or 0x4000