k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/proxy/ipvs/proxier.go (about) 1 //go:build linux 2 // +build linux 3 4 /* 5 Copyright 2017 The Kubernetes Authors. 6 7 Licensed under the Apache License, Version 2.0 (the "License"); 8 you may not use this file except in compliance with the License. 9 You may obtain a copy of the License at 10 11 http://www.apache.org/licenses/LICENSE-2.0 12 13 Unless required by applicable law or agreed to in writing, software 14 distributed under the License is distributed on an "AS IS" BASIS, 15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 See the License for the specific language governing permissions and 17 limitations under the License. 18 */ 19 20 package ipvs 21 22 import ( 23 "bytes" 24 "context" 25 "errors" 26 "fmt" 27 "io" 28 "net" 29 "reflect" 30 "strconv" 31 "strings" 32 "sync" 33 "sync/atomic" 34 "time" 35 36 "k8s.io/klog/v2" 37 utilexec "k8s.io/utils/exec" 38 netutils "k8s.io/utils/net" 39 40 v1 "k8s.io/api/core/v1" 41 discovery "k8s.io/api/discovery/v1" 42 "k8s.io/apimachinery/pkg/types" 43 "k8s.io/apimachinery/pkg/util/sets" 44 "k8s.io/apimachinery/pkg/util/version" 45 "k8s.io/apimachinery/pkg/util/wait" 46 "k8s.io/client-go/tools/events" 47 utilsysctl "k8s.io/component-helpers/node/util/sysctl" 48 "k8s.io/kubernetes/pkg/proxy" 49 "k8s.io/kubernetes/pkg/proxy/conntrack" 50 "k8s.io/kubernetes/pkg/proxy/healthcheck" 51 utilipset "k8s.io/kubernetes/pkg/proxy/ipvs/ipset" 52 utilipvs "k8s.io/kubernetes/pkg/proxy/ipvs/util" 53 "k8s.io/kubernetes/pkg/proxy/metaproxier" 54 "k8s.io/kubernetes/pkg/proxy/metrics" 55 proxyutil "k8s.io/kubernetes/pkg/proxy/util" 56 "k8s.io/kubernetes/pkg/util/async" 57 utiliptables "k8s.io/kubernetes/pkg/util/iptables" 58 utilkernel "k8s.io/kubernetes/pkg/util/kernel" 59 ) 60 61 const ( 62 // kubeServicesChain is the services portal chain 63 kubeServicesChain utiliptables.Chain = "KUBE-SERVICES" 64 65 // kubeProxyFirewallChain is the kube-proxy firewall chain. 66 kubeProxyFirewallChain utiliptables.Chain = "KUBE-PROXY-FIREWALL" 67 68 // kubeSourceRangesFirewallChain is the firewall subchain for LoadBalancerSourceRanges. 69 kubeSourceRangesFirewallChain utiliptables.Chain = "KUBE-SOURCE-RANGES-FIREWALL" 70 71 // kubePostroutingChain is the kubernetes postrouting chain 72 kubePostroutingChain utiliptables.Chain = "KUBE-POSTROUTING" 73 74 // kubeMarkMasqChain is the mark-for-masquerade chain 75 kubeMarkMasqChain utiliptables.Chain = "KUBE-MARK-MASQ" 76 77 // kubeNodePortChain is the kubernetes node port chain 78 kubeNodePortChain utiliptables.Chain = "KUBE-NODE-PORT" 79 80 // kubeForwardChain is the kubernetes forward chain 81 kubeForwardChain utiliptables.Chain = "KUBE-FORWARD" 82 83 // kubeLoadBalancerChain is the kubernetes chain for loadbalancer type service 84 kubeLoadBalancerChain utiliptables.Chain = "KUBE-LOAD-BALANCER" 85 86 // kubeIPVSFilterChain filters external access to main netns 87 // https://github.com/kubernetes/kubernetes/issues/72236 88 kubeIPVSFilterChain utiliptables.Chain = "KUBE-IPVS-FILTER" 89 90 // kubeIPVSOutFilterChain filters access to load balancer services from node. 91 // https://github.com/kubernetes/kubernetes/issues/119656 92 kubeIPVSOutFilterChain utiliptables.Chain = "KUBE-IPVS-OUT-FILTER" 93 94 // defaultScheduler is the default ipvs scheduler algorithm - round robin. 95 defaultScheduler = "rr" 96 97 // defaultDummyDevice is the default dummy interface which ipvs service address will bind to it. 
98 defaultDummyDevice = "kube-ipvs0" 99 ) 100 101 // In IPVS proxy mode, the following flags need to be set 102 const ( 103 sysctlVSConnTrack = "net/ipv4/vs/conntrack" 104 sysctlConnReuse = "net/ipv4/vs/conn_reuse_mode" 105 sysctlExpireNoDestConn = "net/ipv4/vs/expire_nodest_conn" 106 sysctlExpireQuiescentTemplate = "net/ipv4/vs/expire_quiescent_template" 107 sysctlForward = "net/ipv4/ip_forward" 108 sysctlArpIgnore = "net/ipv4/conf/all/arp_ignore" 109 sysctlArpAnnounce = "net/ipv4/conf/all/arp_announce" 110 ) 111 112 // NewDualStackProxier returns a new Proxier for dual-stack operation 113 func NewDualStackProxier( 114 ctx context.Context, 115 ipt [2]utiliptables.Interface, 116 ipvs utilipvs.Interface, 117 ipset utilipset.Interface, 118 sysctl utilsysctl.Interface, 119 exec utilexec.Interface, 120 syncPeriod time.Duration, 121 minSyncPeriod time.Duration, 122 excludeCIDRs []string, 123 strictARP bool, 124 tcpTimeout time.Duration, 125 tcpFinTimeout time.Duration, 126 udpTimeout time.Duration, 127 masqueradeAll bool, 128 masqueradeBit int, 129 localDetectors map[v1.IPFamily]proxyutil.LocalTrafficDetector, 130 hostname string, 131 nodeIPs map[v1.IPFamily]net.IP, 132 recorder events.EventRecorder, 133 healthzServer *healthcheck.ProxierHealthServer, 134 scheduler string, 135 nodePortAddresses []string, 136 initOnly bool, 137 ) (proxy.Provider, error) { 138 // Create an ipv4 instance of the single-stack proxier 139 ipv4Proxier, err := NewProxier(ctx, v1.IPv4Protocol, ipt[0], ipvs, ipset, sysctl, 140 exec, syncPeriod, minSyncPeriod, filterCIDRs(false, excludeCIDRs), strictARP, 141 tcpTimeout, tcpFinTimeout, udpTimeout, masqueradeAll, masqueradeBit, 142 localDetectors[v1.IPv4Protocol], hostname, nodeIPs[v1.IPv4Protocol], recorder, 143 healthzServer, scheduler, nodePortAddresses, initOnly) 144 if err != nil { 145 return nil, fmt.Errorf("unable to create ipv4 proxier: %v", err) 146 } 147 148 ipv6Proxier, err := NewProxier(ctx, v1.IPv6Protocol, ipt[1], ipvs, ipset, sysctl, 149 exec, syncPeriod, minSyncPeriod, filterCIDRs(true, excludeCIDRs), strictARP, 150 tcpTimeout, tcpFinTimeout, udpTimeout, masqueradeAll, masqueradeBit, 151 localDetectors[v1.IPv6Protocol], hostname, nodeIPs[v1.IPv6Protocol], recorder, 152 healthzServer, scheduler, nodePortAddresses, initOnly) 153 if err != nil { 154 return nil, fmt.Errorf("unable to create ipv6 proxier: %v", err) 155 } 156 if initOnly { 157 return nil, nil 158 } 159 160 // Return a meta-proxier that dispatch calls between the two 161 // single-stack proxier instances 162 return metaproxier.NewMetaProxier(ipv4Proxier, ipv6Proxier), nil 163 } 164 165 // Proxier is an ipvs based proxy for connections between a localhost:lport 166 // and services that provide the actual backends. 167 type Proxier struct { 168 // the ipfamily on which this proxy is operating on. 169 ipFamily v1.IPFamily 170 // endpointsChanges and serviceChanges contains all changes to endpoints and 171 // services that happened since last syncProxyRules call. For a single object, 172 // changes are accumulated, i.e. previous is state from before all of them, 173 // current is state after applying all of those. 174 endpointsChanges *proxy.EndpointsChangeTracker 175 serviceChanges *proxy.ServiceChangeTracker 176 177 mu sync.Mutex // protects the following fields 178 svcPortMap proxy.ServicePortMap 179 endpointsMap proxy.EndpointsMap 180 nodeLabels map[string]string 181 // initialSync is a bool indicating if the proxier is syncing for the first time. 
182 // It is set to true when a new proxier is initialized and then set to false on all 183 // future syncs. 184 // This lets us run specific logic that's required only during proxy startup. 185 // For eg: it enables us to update weights of existing destinations only on startup 186 // saving us the cost of querying and updating real servers during every sync. 187 initialSync bool 188 // endpointSlicesSynced, and servicesSynced are set to true when 189 // corresponding objects are synced after startup. This is used to avoid updating 190 // ipvs rules with some partial data after kube-proxy restart. 191 endpointSlicesSynced bool 192 servicesSynced bool 193 initialized int32 194 syncRunner *async.BoundedFrequencyRunner // governs calls to syncProxyRules 195 196 // These are effectively const and do not need the mutex to be held. 197 syncPeriod time.Duration 198 minSyncPeriod time.Duration 199 // Values are CIDR's to exclude when cleaning up IPVS rules. 200 excludeCIDRs []*net.IPNet 201 // Set to true to set sysctls arp_ignore and arp_announce 202 strictARP bool 203 iptables utiliptables.Interface 204 ipvs utilipvs.Interface 205 ipset utilipset.Interface 206 conntrack conntrack.Interface 207 masqueradeAll bool 208 masqueradeMark string 209 localDetector proxyutil.LocalTrafficDetector 210 hostname string 211 nodeIP net.IP 212 recorder events.EventRecorder 213 214 serviceHealthServer healthcheck.ServiceHealthServer 215 healthzServer *healthcheck.ProxierHealthServer 216 217 ipvsScheduler string 218 // The following buffers are used to reuse memory and avoid allocations 219 // that are significantly impacting performance. 220 iptablesData *bytes.Buffer 221 filterChainsData *bytes.Buffer 222 natChains proxyutil.LineBuffer 223 filterChains proxyutil.LineBuffer 224 natRules proxyutil.LineBuffer 225 filterRules proxyutil.LineBuffer 226 // Added as a member to the struct to allow injection for testing. 227 netlinkHandle NetLinkHandle 228 // ipsetList is the list of ipsets that ipvs proxier used. 229 ipsetList map[string]*IPSet 230 // nodePortAddresses selects the interfaces where nodePort works. 231 nodePortAddresses *proxyutil.NodePortAddresses 232 // networkInterfacer defines an interface for several net library functions. 233 // Inject for test purpose. 234 networkInterfacer proxyutil.NetworkInterfacer 235 gracefuldeleteManager *GracefulTerminationManager 236 // serviceNoLocalEndpointsInternal represents the set of services that couldn't be applied 237 // due to the absence of local endpoints when the internal traffic policy is "Local". 238 // It is used to publish the sync_proxy_rules_no_endpoints_total 239 // metric with the traffic_policy label set to "internal". 240 // A Set is used here since we end up calculating endpoint topology multiple times for the same Service 241 // if it has multiple ports but each Service should only be counted once. 242 serviceNoLocalEndpointsInternal sets.Set[string] 243 // serviceNoLocalEndpointsExternal represents the set of services that couldn't be applied 244 // due to the absence of any endpoints when the external traffic policy is "Local". 245 // It is used to publish the sync_proxy_rules_no_endpoints_total 246 // metric with the traffic_policy label set to "external". 247 // A Set is used here since we end up calculating endpoint topology multiple times for the same Service 248 // if it has multiple ports but each Service should only be counted once. 
249 serviceNoLocalEndpointsExternal sets.Set[string] 250 // lbNoNodeAccessIPPortProtocolEntries represents the set of loadBalancers IP + Port + Protocol that should not be accessible from K8s nodes 251 // We cannot directly restrict LB access from node using LoadBalancerSourceRanges, we need to install 252 // additional iptables rules. 253 // (ref: https://github.com/kubernetes/kubernetes/issues/119656) 254 lbNoNodeAccessIPPortProtocolEntries []*utilipset.Entry 255 256 logger klog.Logger 257 } 258 259 // Proxier implements proxy.Provider 260 var _ proxy.Provider = &Proxier{} 261 262 // NewProxier returns a new Proxier given an iptables and ipvs Interface instance. 263 // Because of the iptables and ipvs logic, it is assumed that there is only a single Proxier active on a machine. 264 // An error will be returned if it fails to update or acquire the initial lock. 265 // Once a proxier is created, it will keep iptables and ipvs rules up to date in the background and 266 // will not terminate if a particular iptables or ipvs call fails. 267 func NewProxier( 268 ctx context.Context, 269 ipFamily v1.IPFamily, 270 ipt utiliptables.Interface, 271 ipvs utilipvs.Interface, 272 ipset utilipset.Interface, 273 sysctl utilsysctl.Interface, 274 exec utilexec.Interface, 275 syncPeriod time.Duration, 276 minSyncPeriod time.Duration, 277 excludeCIDRs []string, 278 strictARP bool, 279 tcpTimeout time.Duration, 280 tcpFinTimeout time.Duration, 281 udpTimeout time.Duration, 282 masqueradeAll bool, 283 masqueradeBit int, 284 localDetector proxyutil.LocalTrafficDetector, 285 hostname string, 286 nodeIP net.IP, 287 recorder events.EventRecorder, 288 healthzServer *healthcheck.ProxierHealthServer, 289 scheduler string, 290 nodePortAddressStrings []string, 291 initOnly bool, 292 ) (*Proxier, error) { 293 logger := klog.LoggerWithValues(klog.FromContext(ctx), "ipFamily", ipFamily) 294 // Set the conntrack sysctl we need for 295 if err := proxyutil.EnsureSysctl(sysctl, sysctlVSConnTrack, 1); err != nil { 296 return nil, err 297 } 298 299 kernelVersion, err := utilkernel.GetVersion() 300 if err != nil { 301 return nil, fmt.Errorf("failed to get kernel version: %w", err) 302 } 303 304 if kernelVersion.LessThan(version.MustParseGeneric(utilkernel.IPVSConnReuseModeMinSupportedKernelVersion)) { 305 logger.Error(nil, "Can't set sysctl, kernel version doesn't satisfy minimum version requirements", "sysctl", sysctlConnReuse, "minimumKernelVersion", utilkernel.IPVSConnReuseModeMinSupportedKernelVersion) 306 } else if kernelVersion.AtLeast(version.MustParseGeneric(utilkernel.IPVSConnReuseModeFixedKernelVersion)) { 307 // https://github.com/kubernetes/kubernetes/issues/93297 308 logger.V(2).Info("Left as-is", "sysctl", sysctlConnReuse) 309 } else { 310 // Set the connection reuse mode 311 if err := proxyutil.EnsureSysctl(sysctl, sysctlConnReuse, 0); err != nil { 312 return nil, err 313 } 314 } 315 316 // Set the expire_nodest_conn sysctl we need for 317 if err := proxyutil.EnsureSysctl(sysctl, sysctlExpireNoDestConn, 1); err != nil { 318 return nil, err 319 } 320 321 // Set the expire_quiescent_template sysctl we need for 322 if err := proxyutil.EnsureSysctl(sysctl, sysctlExpireQuiescentTemplate, 1); err != nil { 323 return nil, err 324 } 325 326 // Set the ip_forward sysctl we need for 327 if err := proxyutil.EnsureSysctl(sysctl, sysctlForward, 1); err != nil { 328 return nil, err 329 } 330 331 if strictARP { 332 // Set the arp_ignore sysctl we need for 333 if err := proxyutil.EnsureSysctl(sysctl, sysctlArpIgnore, 1); err != 
nil { 334 return nil, err 335 } 336 337 // Set the arp_announce sysctl we need for 338 if err := proxyutil.EnsureSysctl(sysctl, sysctlArpAnnounce, 2); err != nil { 339 return nil, err 340 } 341 } 342 343 // Configure IPVS timeouts if any one of the timeout parameters have been set. 344 // This is the equivalent to running ipvsadm --set, a value of 0 indicates the 345 // current system timeout should be preserved 346 if tcpTimeout > 0 || tcpFinTimeout > 0 || udpTimeout > 0 { 347 if err := ipvs.ConfigureTimeouts(tcpTimeout, tcpFinTimeout, udpTimeout); err != nil { 348 logger.Error(err, "Failed to configure IPVS timeouts") 349 } 350 } 351 352 if initOnly { 353 logger.Info("System initialized and --init-only specified") 354 return nil, nil 355 } 356 357 // Generate the masquerade mark to use for SNAT rules. 358 masqueradeValue := 1 << uint(masqueradeBit) 359 masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue) 360 361 logger.V(2).Info("Record nodeIP and family", "nodeIP", nodeIP, "family", ipFamily) 362 363 if len(scheduler) == 0 { 364 logger.Info("IPVS scheduler not specified, use rr by default") 365 scheduler = defaultScheduler 366 } 367 368 nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings) 369 370 serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer) 371 372 // excludeCIDRs has been validated before, here we just parse it to IPNet list 373 parsedExcludeCIDRs, _ := netutils.ParseCIDRs(excludeCIDRs) 374 375 proxier := &Proxier{ 376 ipFamily: ipFamily, 377 svcPortMap: make(proxy.ServicePortMap), 378 serviceChanges: proxy.NewServiceChangeTracker(newServiceInfo, ipFamily, recorder, nil), 379 endpointsMap: make(proxy.EndpointsMap), 380 endpointsChanges: proxy.NewEndpointsChangeTracker(hostname, nil, ipFamily, recorder, nil), 381 initialSync: true, 382 syncPeriod: syncPeriod, 383 minSyncPeriod: minSyncPeriod, 384 excludeCIDRs: parsedExcludeCIDRs, 385 iptables: ipt, 386 masqueradeAll: masqueradeAll, 387 masqueradeMark: masqueradeMark, 388 conntrack: conntrack.NewExec(exec), 389 localDetector: localDetector, 390 hostname: hostname, 391 nodeIP: nodeIP, 392 recorder: recorder, 393 serviceHealthServer: serviceHealthServer, 394 healthzServer: healthzServer, 395 ipvs: ipvs, 396 ipvsScheduler: scheduler, 397 iptablesData: bytes.NewBuffer(nil), 398 filterChainsData: bytes.NewBuffer(nil), 399 natChains: proxyutil.NewLineBuffer(), 400 natRules: proxyutil.NewLineBuffer(), 401 filterChains: proxyutil.NewLineBuffer(), 402 filterRules: proxyutil.NewLineBuffer(), 403 netlinkHandle: NewNetLinkHandle(ipFamily == v1.IPv6Protocol), 404 ipset: ipset, 405 nodePortAddresses: nodePortAddresses, 406 networkInterfacer: proxyutil.RealNetwork{}, 407 gracefuldeleteManager: NewGracefulTerminationManager(ipvs), 408 logger: logger, 409 } 410 // initialize ipsetList with all sets we needed 411 proxier.ipsetList = make(map[string]*IPSet) 412 for _, is := range ipsetInfo { 413 proxier.ipsetList[is.name] = NewIPSet(ipset, is.name, is.setType, (ipFamily == v1.IPv6Protocol), is.comment) 414 } 415 burstSyncs := 2 416 logger.V(2).Info("ipvs sync params", "minSyncPeriod", minSyncPeriod, "syncPeriod", syncPeriod, "burstSyncs", burstSyncs) 417 proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs) 418 proxier.gracefuldeleteManager.Run() 419 return proxier, nil 420 } 421 422 func filterCIDRs(wantIPv6 bool, cidrs []string) []string { 423 var filteredCIDRs []string 424 for 
_, cidr := range cidrs { 425 if netutils.IsIPv6CIDRString(cidr) == wantIPv6 { 426 filteredCIDRs = append(filteredCIDRs, cidr) 427 } 428 } 429 return filteredCIDRs 430 } 431 432 // iptablesJumpChain is tables of iptables chains that ipvs proxier used to install iptables or cleanup iptables. 433 // `to` is the iptables chain we want to operate. 434 // `from` is the source iptables chain 435 var iptablesJumpChain = []struct { 436 table utiliptables.Table 437 from utiliptables.Chain 438 to utiliptables.Chain 439 comment string 440 }{ 441 {utiliptables.TableNAT, utiliptables.ChainOutput, kubeServicesChain, "kubernetes service portals"}, 442 {utiliptables.TableNAT, utiliptables.ChainPrerouting, kubeServicesChain, "kubernetes service portals"}, 443 {utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, "kubernetes postrouting rules"}, 444 {utiliptables.TableFilter, utiliptables.ChainForward, kubeForwardChain, "kubernetes forwarding rules"}, 445 {utiliptables.TableFilter, utiliptables.ChainInput, kubeNodePortChain, "kubernetes health check rules"}, 446 {utiliptables.TableFilter, utiliptables.ChainInput, kubeProxyFirewallChain, "kube-proxy firewall rules"}, 447 {utiliptables.TableFilter, utiliptables.ChainForward, kubeProxyFirewallChain, "kube-proxy firewall rules"}, 448 {utiliptables.TableFilter, utiliptables.ChainInput, kubeIPVSFilterChain, "kubernetes ipvs access filter"}, 449 {utiliptables.TableFilter, utiliptables.ChainOutput, kubeIPVSOutFilterChain, "kubernetes ipvs access filter"}, 450 } 451 452 var iptablesChains = []struct { 453 table utiliptables.Table 454 chain utiliptables.Chain 455 }{ 456 {utiliptables.TableNAT, kubeServicesChain}, 457 {utiliptables.TableNAT, kubePostroutingChain}, 458 {utiliptables.TableNAT, kubeNodePortChain}, 459 {utiliptables.TableNAT, kubeLoadBalancerChain}, 460 {utiliptables.TableNAT, kubeMarkMasqChain}, 461 {utiliptables.TableFilter, kubeForwardChain}, 462 {utiliptables.TableFilter, kubeNodePortChain}, 463 {utiliptables.TableFilter, kubeProxyFirewallChain}, 464 {utiliptables.TableFilter, kubeSourceRangesFirewallChain}, 465 {utiliptables.TableFilter, kubeIPVSFilterChain}, 466 {utiliptables.TableFilter, kubeIPVSOutFilterChain}, 467 } 468 469 var iptablesCleanupChains = []struct { 470 table utiliptables.Table 471 chain utiliptables.Chain 472 }{ 473 {utiliptables.TableNAT, kubeServicesChain}, 474 {utiliptables.TableNAT, kubePostroutingChain}, 475 {utiliptables.TableNAT, kubeNodePortChain}, 476 {utiliptables.TableNAT, kubeLoadBalancerChain}, 477 {utiliptables.TableFilter, kubeForwardChain}, 478 {utiliptables.TableFilter, kubeNodePortChain}, 479 {utiliptables.TableFilter, kubeProxyFirewallChain}, 480 {utiliptables.TableFilter, kubeSourceRangesFirewallChain}, 481 {utiliptables.TableFilter, kubeIPVSFilterChain}, 482 {utiliptables.TableFilter, kubeIPVSOutFilterChain}, 483 } 484 485 // ipsetInfo is all ipset we needed in ipvs proxier 486 var ipsetInfo = []struct { 487 name string 488 setType utilipset.Type 489 comment string 490 }{ 491 {kubeLoopBackIPSet, utilipset.HashIPPortIP, kubeLoopBackIPSetComment}, 492 {kubeClusterIPSet, utilipset.HashIPPort, kubeClusterIPSetComment}, 493 {kubeExternalIPSet, utilipset.HashIPPort, kubeExternalIPSetComment}, 494 {kubeExternalIPLocalSet, utilipset.HashIPPort, kubeExternalIPLocalSetComment}, 495 {kubeLoadBalancerSet, utilipset.HashIPPort, kubeLoadBalancerSetComment}, 496 {kubeLoadBalancerFWSet, utilipset.HashIPPort, kubeLoadBalancerFWSetComment}, 497 {kubeLoadBalancerLocalSet, utilipset.HashIPPort, 
kubeLoadBalancerLocalSetComment}, 498 {kubeLoadBalancerSourceIPSet, utilipset.HashIPPortIP, kubeLoadBalancerSourceIPSetComment}, 499 {kubeLoadBalancerSourceCIDRSet, utilipset.HashIPPortNet, kubeLoadBalancerSourceCIDRSetComment}, 500 {kubeNodePortSetTCP, utilipset.BitmapPort, kubeNodePortSetTCPComment}, 501 {kubeNodePortLocalSetTCP, utilipset.BitmapPort, kubeNodePortLocalSetTCPComment}, 502 {kubeNodePortSetUDP, utilipset.BitmapPort, kubeNodePortSetUDPComment}, 503 {kubeNodePortLocalSetUDP, utilipset.BitmapPort, kubeNodePortLocalSetUDPComment}, 504 {kubeNodePortSetSCTP, utilipset.HashIPPort, kubeNodePortSetSCTPComment}, 505 {kubeNodePortLocalSetSCTP, utilipset.HashIPPort, kubeNodePortLocalSetSCTPComment}, 506 {kubeHealthCheckNodePortSet, utilipset.BitmapPort, kubeHealthCheckNodePortSetComment}, 507 {kubeIPVSSet, utilipset.HashIP, kubeIPVSSetComment}, 508 } 509 510 // ipsetWithIptablesChain is the ipsets list with iptables source chain and the chain jump to 511 // `iptables -t nat -A <from> -m set --match-set <name> <matchType> -j <to>` 512 // example: iptables -t nat -A KUBE-SERVICES -m set --match-set KUBE-NODE-PORT-TCP dst -j KUBE-NODE-PORT 513 // ipsets with other match rules will be created Individually. 514 // Note: kubeNodePortLocalSetTCP must be prior to kubeNodePortSetTCP, the same for UDP. 515 var ipsetWithIptablesChain = []struct { 516 name string 517 table utiliptables.Table 518 from string 519 to string 520 matchType string 521 protocolMatch string 522 }{ 523 {kubeLoopBackIPSet, utiliptables.TableNAT, string(kubePostroutingChain), "MASQUERADE", "dst,dst,src", ""}, 524 {kubeLoadBalancerSet, utiliptables.TableNAT, string(kubeServicesChain), string(kubeLoadBalancerChain), "dst,dst", ""}, 525 {kubeLoadBalancerLocalSet, utiliptables.TableNAT, string(kubeLoadBalancerChain), "RETURN", "dst,dst", ""}, 526 {kubeNodePortLocalSetTCP, utiliptables.TableNAT, string(kubeNodePortChain), "RETURN", "dst", utilipset.ProtocolTCP}, 527 {kubeNodePortSetTCP, utiliptables.TableNAT, string(kubeNodePortChain), string(kubeMarkMasqChain), "dst", utilipset.ProtocolTCP}, 528 {kubeNodePortLocalSetUDP, utiliptables.TableNAT, string(kubeNodePortChain), "RETURN", "dst", utilipset.ProtocolUDP}, 529 {kubeNodePortSetUDP, utiliptables.TableNAT, string(kubeNodePortChain), string(kubeMarkMasqChain), "dst", utilipset.ProtocolUDP}, 530 {kubeNodePortLocalSetSCTP, utiliptables.TableNAT, string(kubeNodePortChain), "RETURN", "dst,dst", utilipset.ProtocolSCTP}, 531 {kubeNodePortSetSCTP, utiliptables.TableNAT, string(kubeNodePortChain), string(kubeMarkMasqChain), "dst,dst", utilipset.ProtocolSCTP}, 532 533 {kubeLoadBalancerFWSet, utiliptables.TableFilter, string(kubeProxyFirewallChain), string(kubeSourceRangesFirewallChain), "dst,dst", ""}, 534 {kubeLoadBalancerSourceCIDRSet, utiliptables.TableFilter, string(kubeSourceRangesFirewallChain), "RETURN", "dst,dst,src", ""}, 535 {kubeLoadBalancerSourceIPSet, utiliptables.TableFilter, string(kubeSourceRangesFirewallChain), "RETURN", "dst,dst,src", ""}, 536 } 537 538 // internal struct for string service information 539 type servicePortInfo struct { 540 *proxy.BaseServicePortInfo 541 // The following fields are computed and stored for performance reasons. 
542 nameString string 543 } 544 545 // returns a new proxy.ServicePort which abstracts a serviceInfo 546 func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *proxy.BaseServicePortInfo) proxy.ServicePort { 547 svcPort := &servicePortInfo{BaseServicePortInfo: bsvcPortInfo} 548 549 // Store the following for performance reasons. 550 svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name} 551 svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name} 552 svcPort.nameString = svcPortName.String() 553 554 return svcPort 555 } 556 557 // getFirstColumn reads all the content from r into memory and return a 558 // slice which consists of the first word from each line. 559 func getFirstColumn(r io.Reader) ([]string, error) { 560 b, err := io.ReadAll(r) 561 if err != nil { 562 return nil, err 563 } 564 565 lines := strings.Split(string(b), "\n") 566 words := make([]string, 0, len(lines)) 567 for i := range lines { 568 fields := strings.Fields(lines[i]) 569 if len(fields) > 0 { 570 words = append(words, fields[0]) 571 } 572 } 573 return words, nil 574 } 575 576 // CanUseIPVSProxier checks if we can use the ipvs Proxier. 577 // The ipset version and the scheduler are checked. If any virtual servers (VS) 578 // already exist with the configured scheduler, we just return. Otherwise 579 // we check if a dummy VS can be configured with the configured scheduler. 580 // Kernel modules will be loaded automatically if necessary. 581 func CanUseIPVSProxier(ctx context.Context, ipvs utilipvs.Interface, ipsetver IPSetVersioner, scheduler string) error { 582 logger := klog.FromContext(ctx) 583 // BUG: https://github.com/moby/ipvs/issues/27 584 // If ipvs is not compiled into the kernel no error is returned and handle==nil. 585 // This in turn causes ipvs.GetVirtualServers and ipvs.AddVirtualServer 586 // to return ok (err==nil). If/when this bug is fixed parameter "ipvs" will be nil 587 // if ipvs is not supported by the kernel. Until then a re-read work-around is used. 588 if ipvs == nil { 589 return fmt.Errorf("Ipvs not supported by the kernel") 590 } 591 592 // Check ipset version 593 versionString, err := ipsetver.GetVersion() 594 if err != nil { 595 return fmt.Errorf("error getting ipset version, error: %v", err) 596 } 597 if !checkMinVersion(versionString) { 598 return fmt.Errorf("ipset version: %s is less than min required version: %s", versionString, MinIPSetCheckVersion) 599 } 600 601 if scheduler == "" { 602 scheduler = defaultScheduler 603 } 604 605 // If any virtual server (VS) using the scheduler exist we skip the checks. 606 vservers, err := ipvs.GetVirtualServers() 607 if err != nil { 608 logger.Error(err, "Can't read the ipvs") 609 return err 610 } 611 logger.V(5).Info("Virtual Servers", "count", len(vservers)) 612 if len(vservers) > 0 { 613 // This is most likely a kube-proxy re-start. We know that ipvs works 614 // and if any VS uses the configured scheduler, we are done. 615 for _, vs := range vservers { 616 if vs.Scheduler == scheduler { 617 logger.V(5).Info("VS exist, Skipping checks") 618 return nil 619 } 620 } 621 logger.V(5).Info("No existing VS uses the configured scheduler", "scheduler", scheduler) 622 } 623 624 // Try to insert a dummy VS with the passed scheduler. 625 // We should use a VIP address that is not used on the node. 626 // An address "198.51.100.0" from the TEST-NET-2 rage in https://datatracker.ietf.org/doc/html/rfc5737 627 // is used. These addresses are reserved for documentation. 
If the user is using 628 // this address for a VS anyway we *will* mess up, but that would be an invalid configuration. 629 // If the user have configured the address to an interface on the node (but not a VS) 630 // then traffic will temporary be routed to ipvs during the probe and dropped. 631 // The later case is also and invalid configuration, but the traffic impact will be minor. 632 // This should not be a problem if users honors reserved addresses, but cut/paste 633 // from documentation is not unheard of, so the restriction to not use the TEST-NET-2 range 634 // must be documented. 635 vs := utilipvs.VirtualServer{ 636 Address: netutils.ParseIPSloppy("198.51.100.0"), 637 Protocol: "TCP", 638 Port: 20000, 639 Scheduler: scheduler, 640 } 641 if err := ipvs.AddVirtualServer(&vs); err != nil { 642 logger.Error(err, "Could not create dummy VS", "scheduler", scheduler) 643 return err 644 } 645 646 // To overcome the BUG described above we check that the VS is *really* added. 647 vservers, err = ipvs.GetVirtualServers() 648 if err != nil { 649 logger.Error(err, "ipvs.GetVirtualServers") 650 return err 651 } 652 logger.V(5).Info("Virtual Servers after adding dummy", "count", len(vservers)) 653 if len(vservers) == 0 { 654 logger.Info("Dummy VS not created", "scheduler", scheduler) 655 return fmt.Errorf("Ipvs not supported") // This is a BUG work-around 656 } 657 logger.V(5).Info("Dummy VS created", "vs", vs) 658 659 if err := ipvs.DeleteVirtualServer(&vs); err != nil { 660 logger.Error(err, "Could not delete dummy VS") 661 return err 662 } 663 664 return nil 665 } 666 667 // CleanupIptablesLeftovers removes all iptables rules and chains created by the Proxier 668 // It returns true if an error was encountered. Errors are logged. 669 func cleanupIptablesLeftovers(ctx context.Context, ipt utiliptables.Interface) (encounteredError bool) { 670 logger := klog.FromContext(ctx) 671 // Unlink the iptables chains created by ipvs Proxier 672 for _, jc := range iptablesJumpChain { 673 args := []string{ 674 "-m", "comment", "--comment", jc.comment, 675 "-j", string(jc.to), 676 } 677 if err := ipt.DeleteRule(jc.table, jc.from, args...); err != nil { 678 if !utiliptables.IsNotFoundError(err) { 679 logger.Error(err, "Error removing iptables rules in ipvs proxier") 680 encounteredError = true 681 } 682 } 683 } 684 685 // Flush and remove all of our chains. Flushing all chains before removing them also removes all links between chains first. 686 for _, ch := range iptablesCleanupChains { 687 if err := ipt.FlushChain(ch.table, ch.chain); err != nil { 688 if !utiliptables.IsNotFoundError(err) { 689 logger.Error(err, "Error removing iptables rules in ipvs proxier") 690 encounteredError = true 691 } 692 } 693 } 694 695 // Remove all of our chains. 696 for _, ch := range iptablesCleanupChains { 697 if err := ipt.DeleteChain(ch.table, ch.chain); err != nil { 698 if !utiliptables.IsNotFoundError(err) { 699 logger.Error(err, "Error removing iptables rules in ipvs proxier") 700 encounteredError = true 701 } 702 } 703 } 704 705 return encounteredError 706 } 707 708 // CleanupLeftovers clean up all ipvs and iptables rules created by ipvs Proxier. 
709 func CleanupLeftovers(ctx context.Context, ipvs utilipvs.Interface, ipt utiliptables.Interface, ipset utilipset.Interface) (encounteredError bool) { 710 logger := klog.FromContext(ctx) 711 // Clear all ipvs rules 712 if ipvs != nil { 713 err := ipvs.Flush() 714 if err != nil { 715 logger.Error(err, "Error flushing ipvs rules") 716 encounteredError = true 717 } 718 } 719 // Delete dummy interface created by ipvs Proxier. 720 nl := NewNetLinkHandle(false) 721 err := nl.DeleteDummyDevice(defaultDummyDevice) 722 if err != nil { 723 logger.Error(err, "Error deleting dummy device created by ipvs proxier", "device", defaultDummyDevice) 724 encounteredError = true 725 } 726 // Clear iptables created by ipvs Proxier. 727 encounteredError = cleanupIptablesLeftovers(ctx, ipt) || encounteredError 728 // Destroy ip sets created by ipvs Proxier. We should call it after cleaning up 729 // iptables since we can NOT delete ip set which is still referenced by iptables. 730 for _, set := range ipsetInfo { 731 err = ipset.DestroySet(set.name) 732 if err != nil { 733 if !utilipset.IsNotFoundError(err) { 734 logger.Error(err, "Error removing ipset", "ipset", set.name) 735 encounteredError = true 736 } 737 } 738 } 739 return encounteredError 740 } 741 742 // Sync is called to synchronize the proxier state to iptables and ipvs as soon as possible. 743 func (proxier *Proxier) Sync() { 744 if proxier.healthzServer != nil { 745 proxier.healthzServer.QueuedUpdate(proxier.ipFamily) 746 } 747 metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime() 748 proxier.syncRunner.Run() 749 } 750 751 // SyncLoop runs periodic work. This is expected to run as a goroutine or as the main loop of the app. It does not return. 752 func (proxier *Proxier) SyncLoop() { 753 // Update healthz timestamp at beginning in case Sync() never succeeds. 754 if proxier.healthzServer != nil { 755 proxier.healthzServer.Updated(proxier.ipFamily) 756 } 757 // synthesize "last change queued" time as the informers are syncing. 758 metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime() 759 proxier.syncRunner.Loop(wait.NeverStop) 760 } 761 762 func (proxier *Proxier) setInitialized(value bool) { 763 var initialized int32 764 if value { 765 initialized = 1 766 } 767 atomic.StoreInt32(&proxier.initialized, initialized) 768 } 769 770 func (proxier *Proxier) isInitialized() bool { 771 return atomic.LoadInt32(&proxier.initialized) > 0 772 } 773 774 // OnServiceAdd is called whenever creation of new service object is observed. 775 func (proxier *Proxier) OnServiceAdd(service *v1.Service) { 776 proxier.OnServiceUpdate(nil, service) 777 } 778 779 // OnServiceUpdate is called whenever modification of an existing service object is observed. 780 func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) { 781 if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() { 782 proxier.Sync() 783 } 784 } 785 786 // OnServiceDelete is called whenever deletion of an existing service object is observed. 787 func (proxier *Proxier) OnServiceDelete(service *v1.Service) { 788 proxier.OnServiceUpdate(service, nil) 789 } 790 791 // OnServiceSynced is called once all the initial event handlers were called and the state is fully propagated to local cache. 792 func (proxier *Proxier) OnServiceSynced() { 793 proxier.mu.Lock() 794 proxier.servicesSynced = true 795 proxier.setInitialized(proxier.endpointSlicesSynced) 796 proxier.mu.Unlock() 797 798 // Sync unconditionally - this is called once per lifetime. 
799 proxier.syncProxyRules() 800 } 801 802 // OnEndpointSliceAdd is called whenever creation of a new endpoint slice object 803 // is observed. 804 func (proxier *Proxier) OnEndpointSliceAdd(endpointSlice *discovery.EndpointSlice) { 805 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() { 806 proxier.Sync() 807 } 808 } 809 810 // OnEndpointSliceUpdate is called whenever modification of an existing endpoint 811 // slice object is observed. 812 func (proxier *Proxier) OnEndpointSliceUpdate(_, endpointSlice *discovery.EndpointSlice) { 813 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() { 814 proxier.Sync() 815 } 816 } 817 818 // OnEndpointSliceDelete is called whenever deletion of an existing endpoint slice 819 // object is observed. 820 func (proxier *Proxier) OnEndpointSliceDelete(endpointSlice *discovery.EndpointSlice) { 821 if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, true) && proxier.isInitialized() { 822 proxier.Sync() 823 } 824 } 825 826 // OnEndpointSlicesSynced is called once all the initial event handlers were 827 // called and the state is fully propagated to local cache. 828 func (proxier *Proxier) OnEndpointSlicesSynced() { 829 proxier.mu.Lock() 830 proxier.endpointSlicesSynced = true 831 proxier.setInitialized(proxier.servicesSynced) 832 proxier.mu.Unlock() 833 834 // Sync unconditionally - this is called once per lifetime. 835 proxier.syncProxyRules() 836 } 837 838 // OnNodeAdd is called whenever creation of new node object 839 // is observed. 840 func (proxier *Proxier) OnNodeAdd(node *v1.Node) { 841 if node.Name != proxier.hostname { 842 proxier.logger.Error(nil, "Received a watch event for a node that doesn't match the current node", "eventNode", node.Name, "currentNode", proxier.hostname) 843 return 844 } 845 846 if reflect.DeepEqual(proxier.nodeLabels, node.Labels) { 847 return 848 } 849 850 proxier.mu.Lock() 851 proxier.nodeLabels = map[string]string{} 852 for k, v := range node.Labels { 853 proxier.nodeLabels[k] = v 854 } 855 proxier.mu.Unlock() 856 proxier.logger.V(4).Info("Updated proxier node labels", "labels", node.Labels) 857 858 proxier.Sync() 859 } 860 861 // OnNodeUpdate is called whenever modification of an existing 862 // node object is observed. 863 func (proxier *Proxier) OnNodeUpdate(oldNode, node *v1.Node) { 864 if node.Name != proxier.hostname { 865 proxier.logger.Error(nil, "Received a watch event for a node that doesn't match the current node", "eventNode", node.Name, "currentNode", proxier.hostname) 866 return 867 } 868 869 if reflect.DeepEqual(proxier.nodeLabels, node.Labels) { 870 return 871 } 872 873 proxier.mu.Lock() 874 proxier.nodeLabels = map[string]string{} 875 for k, v := range node.Labels { 876 proxier.nodeLabels[k] = v 877 } 878 proxier.mu.Unlock() 879 proxier.logger.V(4).Info("Updated proxier node labels", "labels", node.Labels) 880 881 proxier.Sync() 882 } 883 884 // OnNodeDelete is called whenever deletion of an existing node 885 // object is observed. 
886 func (proxier *Proxier) OnNodeDelete(node *v1.Node) { 887 if node.Name != proxier.hostname { 888 proxier.logger.Error(nil, "Received a watch event for a node that doesn't match the current node", "eventNode", node.Name, "currentNode", proxier.hostname) 889 return 890 } 891 892 proxier.mu.Lock() 893 proxier.nodeLabels = nil 894 proxier.mu.Unlock() 895 896 proxier.Sync() 897 } 898 899 // OnNodeSynced is called once all the initial event handlers were 900 // called and the state is fully propagated to local cache. 901 func (proxier *Proxier) OnNodeSynced() { 902 } 903 904 // OnServiceCIDRsChanged is called whenever a change is observed 905 // in any of the ServiceCIDRs, and provides complete list of service cidrs. 906 func (proxier *Proxier) OnServiceCIDRsChanged(_ []string) {} 907 908 // This is where all of the ipvs calls happen. 909 func (proxier *Proxier) syncProxyRules() { 910 proxier.mu.Lock() 911 defer proxier.mu.Unlock() 912 913 // don't sync rules till we've received services and endpoints 914 if !proxier.isInitialized() { 915 proxier.logger.V(2).Info("Not syncing ipvs rules until Services and Endpoints have been received from master") 916 return 917 } 918 919 // its safe to set initialSync to false as it acts as a flag for startup actions 920 // and the mutex is held. 921 defer func() { 922 proxier.initialSync = false 923 }() 924 925 // Keep track of how long syncs take. 926 start := time.Now() 927 defer func() { 928 metrics.SyncProxyRulesLatency.Observe(metrics.SinceInSeconds(start)) 929 proxier.logger.V(4).Info("syncProxyRules complete", "elapsed", time.Since(start)) 930 }() 931 932 // We assume that if this was called, we really want to sync them, 933 // even if nothing changed in the meantime. In other words, callers are 934 // responsible for detecting no-op changes and not calling this function. 935 serviceUpdateResult := proxier.svcPortMap.Update(proxier.serviceChanges) 936 endpointUpdateResult := proxier.endpointsMap.Update(proxier.endpointsChanges) 937 938 proxier.logger.V(3).Info("Syncing ipvs proxier rules") 939 940 proxier.serviceNoLocalEndpointsInternal = sets.New[string]() 941 proxier.serviceNoLocalEndpointsExternal = sets.New[string]() 942 943 proxier.lbNoNodeAccessIPPortProtocolEntries = make([]*utilipset.Entry, 0) 944 945 // Begin install iptables 946 947 // Reset all buffers used later. 948 // This is to avoid memory reallocations and thus improve performance. 949 proxier.natChains.Reset() 950 proxier.natRules.Reset() 951 proxier.filterChains.Reset() 952 proxier.filterRules.Reset() 953 954 // Write table headers. 955 proxier.filterChains.Write("*filter") 956 proxier.natChains.Write("*nat") 957 958 proxier.createAndLinkKubeChain() 959 960 // make sure dummy interface exists in the system where ipvs Proxier will bind service address on it 961 _, err := proxier.netlinkHandle.EnsureDummyDevice(defaultDummyDevice) 962 if err != nil { 963 proxier.logger.Error(err, "Failed to create dummy interface", "interface", defaultDummyDevice) 964 return 965 } 966 967 // make sure ip sets exists in the system. 
968 for _, set := range proxier.ipsetList { 969 if err := ensureIPSet(set); err != nil { 970 return 971 } 972 set.resetEntries() 973 } 974 975 // activeIPVSServices represents IPVS service successfully created in this round of sync 976 activeIPVSServices := sets.New[string]() 977 // activeBindAddrs Represents addresses we want on the defaultDummyDevice after this round of sync 978 activeBindAddrs := sets.New[string]() 979 // alreadyBoundAddrs Represents addresses currently assigned to the dummy interface 980 alreadyBoundAddrs, err := proxier.netlinkHandle.GetLocalAddresses(defaultDummyDevice) 981 if err != nil { 982 proxier.logger.Error(err, "Error listing addresses binded to dummy interface") 983 } 984 // nodeAddressSet All addresses *except* those on the dummy interface 985 nodeAddressSet, err := proxier.netlinkHandle.GetAllLocalAddressesExcept(defaultDummyDevice) 986 if err != nil { 987 proxier.logger.Error(err, "Error listing node addresses") 988 } 989 990 hasNodePort := false 991 for _, svc := range proxier.svcPortMap { 992 svcInfo, ok := svc.(*servicePortInfo) 993 if ok && svcInfo.NodePort() != 0 { 994 hasNodePort = true 995 break 996 } 997 } 998 999 // List of node IP addresses to be used as IPVS services if nodePort is set. This 1000 // can be reused for all nodePort services. 1001 var nodeIPs []net.IP 1002 if hasNodePort { 1003 if proxier.nodePortAddresses.MatchAll() { 1004 for _, ipStr := range nodeAddressSet.UnsortedList() { 1005 nodeIPs = append(nodeIPs, netutils.ParseIPSloppy(ipStr)) 1006 } 1007 } else { 1008 allNodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer) 1009 if err != nil { 1010 proxier.logger.Error(err, "Failed to get node IP address matching nodeport cidr") 1011 } else { 1012 for _, ip := range allNodeIPs { 1013 if !ip.IsLoopback() { 1014 nodeIPs = append(nodeIPs, ip) 1015 } 1016 } 1017 } 1018 } 1019 } 1020 1021 // Build IPVS rules for each service. 1022 for svcPortName, svcPort := range proxier.svcPortMap { 1023 svcInfo, ok := svcPort.(*servicePortInfo) 1024 if !ok { 1025 proxier.logger.Error(nil, "Failed to cast serviceInfo", "servicePortName", svcPortName) 1026 continue 1027 } 1028 1029 protocol := strings.ToLower(string(svcInfo.Protocol())) 1030 // Precompute svcNameString; with many services the many calls 1031 // to ServicePortName.String() show up in CPU profiles. 1032 svcPortNameString := svcPortName.String() 1033 1034 // Handle traffic that loops back to the originator with SNAT. 1035 for _, e := range proxier.endpointsMap[svcPortName] { 1036 ep, ok := e.(*proxy.BaseEndpointInfo) 1037 if !ok { 1038 proxier.logger.Error(nil, "Failed to cast BaseEndpointInfo", "endpoint", e) 1039 continue 1040 } 1041 if !ep.IsLocal() { 1042 continue 1043 } 1044 epIP := ep.IP() 1045 epPort := ep.Port() 1046 // Error parsing this endpoint has been logged. Skip to next endpoint. 1047 if epIP == "" || epPort == 0 { 1048 continue 1049 } 1050 entry := &utilipset.Entry{ 1051 IP: epIP, 1052 Port: epPort, 1053 Protocol: protocol, 1054 IP2: epIP, 1055 SetType: utilipset.HashIPPortIP, 1056 } 1057 if valid := proxier.ipsetList[kubeLoopBackIPSet].validateEntry(entry); !valid { 1058 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoopBackIPSet].Name) 1059 continue 1060 } 1061 proxier.ipsetList[kubeLoopBackIPSet].activeEntries.Insert(entry.String()) 1062 } 1063 1064 // Capture the clusterIP. 
1065 // ipset call 1066 entry := &utilipset.Entry{ 1067 IP: svcInfo.ClusterIP().String(), 1068 Port: svcInfo.Port(), 1069 Protocol: protocol, 1070 SetType: utilipset.HashIPPort, 1071 } 1072 // add service Cluster IP:Port to kubeServiceAccess ip set for the purpose of solving hairpin. 1073 // proxier.kubeServiceAccessSet.activeEntries.Insert(entry.String()) 1074 if valid := proxier.ipsetList[kubeClusterIPSet].validateEntry(entry); !valid { 1075 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeClusterIPSet].Name) 1076 continue 1077 } 1078 proxier.ipsetList[kubeClusterIPSet].activeEntries.Insert(entry.String()) 1079 // ipvs call 1080 serv := &utilipvs.VirtualServer{ 1081 Address: svcInfo.ClusterIP(), 1082 Port: uint16(svcInfo.Port()), 1083 Protocol: string(svcInfo.Protocol()), 1084 Scheduler: proxier.ipvsScheduler, 1085 } 1086 // Set session affinity flag and timeout for IPVS service 1087 if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP { 1088 serv.Flags |= utilipvs.FlagPersistent 1089 serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds()) 1090 } 1091 // Set the source hash flag needed for the distribution method "mh" 1092 if proxier.ipvsScheduler == "mh" { 1093 serv.Flags |= utilipvs.FlagSourceHash 1094 } 1095 // We need to bind ClusterIP to dummy interface, so set `bindAddr` parameter to `true` in syncService() 1096 if err := proxier.syncService(svcPortNameString, serv, true, alreadyBoundAddrs); err == nil { 1097 activeIPVSServices.Insert(serv.String()) 1098 activeBindAddrs.Insert(serv.Address.String()) 1099 // ExternalTrafficPolicy only works for NodePort and external LB traffic, does not affect ClusterIP 1100 // So we still need clusterIP rules in onlyNodeLocalEndpoints mode. 1101 internalNodeLocal := false 1102 if svcInfo.InternalPolicyLocal() { 1103 internalNodeLocal = true 1104 } 1105 if err := proxier.syncEndpoint(svcPortName, internalNodeLocal, serv); err != nil { 1106 proxier.logger.Error(err, "Failed to sync endpoint for service", "servicePortName", svcPortName, "virtualServer", serv) 1107 } 1108 } else { 1109 proxier.logger.Error(err, "Failed to sync service", "servicePortName", svcPortName, "virtualServer", serv) 1110 } 1111 1112 // Capture externalIPs. 1113 for _, externalIP := range svcInfo.ExternalIPs() { 1114 // ipset call 1115 entry := &utilipset.Entry{ 1116 IP: externalIP.String(), 1117 Port: svcInfo.Port(), 1118 Protocol: protocol, 1119 SetType: utilipset.HashIPPort, 1120 } 1121 1122 if svcInfo.ExternalPolicyLocal() { 1123 if valid := proxier.ipsetList[kubeExternalIPLocalSet].validateEntry(entry); !valid { 1124 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeExternalIPLocalSet].Name) 1125 continue 1126 } 1127 proxier.ipsetList[kubeExternalIPLocalSet].activeEntries.Insert(entry.String()) 1128 } else { 1129 // We have to SNAT packets to external IPs. 
1130 if valid := proxier.ipsetList[kubeExternalIPSet].validateEntry(entry); !valid { 1131 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeExternalIPSet].Name) 1132 continue 1133 } 1134 proxier.ipsetList[kubeExternalIPSet].activeEntries.Insert(entry.String()) 1135 } 1136 1137 // ipvs call 1138 serv := &utilipvs.VirtualServer{ 1139 Address: externalIP, 1140 Port: uint16(svcInfo.Port()), 1141 Protocol: string(svcInfo.Protocol()), 1142 Scheduler: proxier.ipvsScheduler, 1143 } 1144 if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP { 1145 serv.Flags |= utilipvs.FlagPersistent 1146 serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds()) 1147 } 1148 // Set the source hash flag needed for the distribution method "mh" 1149 if proxier.ipvsScheduler == "mh" { 1150 serv.Flags |= utilipvs.FlagSourceHash 1151 } 1152 // We must not add the address to the dummy device if it exist on another interface 1153 shouldBind := !nodeAddressSet.Has(serv.Address.String()) 1154 if err := proxier.syncService(svcPortNameString, serv, shouldBind, alreadyBoundAddrs); err == nil { 1155 activeIPVSServices.Insert(serv.String()) 1156 if shouldBind { 1157 activeBindAddrs.Insert(serv.Address.String()) 1158 } 1159 if err := proxier.syncEndpoint(svcPortName, svcInfo.ExternalPolicyLocal(), serv); err != nil { 1160 proxier.logger.Error(err, "Failed to sync endpoint for service", "servicePortName", svcPortName, "virtualServer", serv) 1161 } 1162 } else { 1163 proxier.logger.Error(err, "Failed to sync service", "servicePortName", svcPortName, "virtualServer", serv) 1164 } 1165 } 1166 1167 // Capture load-balancer ingress. 1168 for _, ingress := range svcInfo.LoadBalancerVIPs() { 1169 // ipset call 1170 entry = &utilipset.Entry{ 1171 IP: ingress.String(), 1172 Port: svcInfo.Port(), 1173 Protocol: protocol, 1174 SetType: utilipset.HashIPPort, 1175 } 1176 // add service load balancer ingressIP:Port to kubeServiceAccess ip set for the purpose of solving hairpin. 1177 // proxier.kubeServiceAccessSet.activeEntries.Insert(entry.String()) 1178 // If we are proxying globally, we need to masquerade in case we cross nodes. 1179 // If we are proxying only locally, we can retain the source IP. 1180 if valid := proxier.ipsetList[kubeLoadBalancerSet].validateEntry(entry); !valid { 1181 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerSet].Name) 1182 continue 1183 } 1184 proxier.ipsetList[kubeLoadBalancerSet].activeEntries.Insert(entry.String()) 1185 // insert loadbalancer entry to lbIngressLocalSet if service externaltrafficpolicy=local 1186 if svcInfo.ExternalPolicyLocal() { 1187 if valid := proxier.ipsetList[kubeLoadBalancerLocalSet].validateEntry(entry); !valid { 1188 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerLocalSet].Name) 1189 continue 1190 } 1191 proxier.ipsetList[kubeLoadBalancerLocalSet].activeEntries.Insert(entry.String()) 1192 } 1193 if len(svcInfo.LoadBalancerSourceRanges()) != 0 { 1194 // The service firewall rules are created based on ServiceSpec.loadBalancerSourceRanges field. 1195 // This currently works for loadbalancers that preserves source ips. 1196 // For loadbalancers which direct traffic to service NodePort, the firewall rules will not apply. 
1197 if valid := proxier.ipsetList[kubeLoadBalancerFWSet].validateEntry(entry); !valid { 1198 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerFWSet].Name) 1199 continue 1200 } 1201 proxier.ipsetList[kubeLoadBalancerFWSet].activeEntries.Insert(entry.String()) 1202 allowFromNode := false 1203 for _, cidr := range svcInfo.LoadBalancerSourceRanges() { 1204 // ipset call 1205 entry = &utilipset.Entry{ 1206 IP: ingress.String(), 1207 Port: svcInfo.Port(), 1208 Protocol: protocol, 1209 Net: cidr.String(), 1210 SetType: utilipset.HashIPPortNet, 1211 } 1212 // enumerate all white list source cidr 1213 if valid := proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].validateEntry(entry); !valid { 1214 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].Name) 1215 continue 1216 } 1217 proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].activeEntries.Insert(entry.String()) 1218 1219 if cidr.Contains(proxier.nodeIP) { 1220 allowFromNode = true 1221 } 1222 } 1223 // generally, ip route rule was added to intercept request to loadbalancer vip from the 1224 // loadbalancer's backend hosts. In this case, request will not hit the loadbalancer but loop back directly. 1225 // Need to add the following rule to allow request on host. 1226 if allowFromNode { 1227 entry = &utilipset.Entry{ 1228 IP: ingress.String(), 1229 Port: svcInfo.Port(), 1230 Protocol: protocol, 1231 IP2: ingress.String(), 1232 SetType: utilipset.HashIPPortIP, 1233 } 1234 // enumerate all white list source ip 1235 if valid := proxier.ipsetList[kubeLoadBalancerSourceIPSet].validateEntry(entry); !valid { 1236 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", proxier.ipsetList[kubeLoadBalancerSourceIPSet].Name) 1237 continue 1238 } 1239 proxier.ipsetList[kubeLoadBalancerSourceIPSet].activeEntries.Insert(entry.String()) 1240 } else { 1241 // since nodeIP is not covered in any of SourceRange we need to explicitly block the lbIP access from k8s nodes. 
1242 proxier.lbNoNodeAccessIPPortProtocolEntries = append(proxier.lbNoNodeAccessIPPortProtocolEntries, entry) 1243 1244 } 1245 } 1246 // ipvs call 1247 serv := &utilipvs.VirtualServer{ 1248 Address: ingress, 1249 Port: uint16(svcInfo.Port()), 1250 Protocol: string(svcInfo.Protocol()), 1251 Scheduler: proxier.ipvsScheduler, 1252 } 1253 if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP { 1254 serv.Flags |= utilipvs.FlagPersistent 1255 serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds()) 1256 } 1257 // Set the source hash flag needed for the distribution method "mh" 1258 if proxier.ipvsScheduler == "mh" { 1259 serv.Flags |= utilipvs.FlagSourceHash 1260 } 1261 // We must not add the address to the dummy device if it exist on another interface 1262 shouldBind := !nodeAddressSet.Has(serv.Address.String()) 1263 if err := proxier.syncService(svcPortNameString, serv, shouldBind, alreadyBoundAddrs); err == nil { 1264 activeIPVSServices.Insert(serv.String()) 1265 if shouldBind { 1266 activeBindAddrs.Insert(serv.Address.String()) 1267 } 1268 if err := proxier.syncEndpoint(svcPortName, svcInfo.ExternalPolicyLocal(), serv); err != nil { 1269 proxier.logger.Error(err, "Failed to sync endpoint for service", "servicePortName", svcPortName, "virtualServer", serv) 1270 } 1271 } else { 1272 proxier.logger.Error(err, "Failed to sync service", "servicePortName", svcPortName, "virtualServer", serv) 1273 } 1274 } 1275 1276 if svcInfo.NodePort() != 0 { 1277 if len(nodeIPs) == 0 { 1278 // Skip nodePort configuration since an error occurred when 1279 // computing nodeAddresses or nodeIPs. 1280 continue 1281 } 1282 1283 // Nodeports need SNAT, unless they're local. 1284 // ipset call 1285 1286 var ( 1287 nodePortSet *IPSet 1288 entries []*utilipset.Entry 1289 ) 1290 1291 switch protocol { 1292 case utilipset.ProtocolTCP: 1293 nodePortSet = proxier.ipsetList[kubeNodePortSetTCP] 1294 entries = []*utilipset.Entry{{ 1295 // No need to provide ip info 1296 Port: svcInfo.NodePort(), 1297 Protocol: protocol, 1298 SetType: utilipset.BitmapPort, 1299 }} 1300 case utilipset.ProtocolUDP: 1301 nodePortSet = proxier.ipsetList[kubeNodePortSetUDP] 1302 entries = []*utilipset.Entry{{ 1303 // No need to provide ip info 1304 Port: svcInfo.NodePort(), 1305 Protocol: protocol, 1306 SetType: utilipset.BitmapPort, 1307 }} 1308 case utilipset.ProtocolSCTP: 1309 nodePortSet = proxier.ipsetList[kubeNodePortSetSCTP] 1310 // Since hash ip:port is used for SCTP, all the nodeIPs to be used in the SCTP ipset entries. 
1311 entries = []*utilipset.Entry{} 1312 for _, nodeIP := range nodeIPs { 1313 entries = append(entries, &utilipset.Entry{ 1314 IP: nodeIP.String(), 1315 Port: svcInfo.NodePort(), 1316 Protocol: protocol, 1317 SetType: utilipset.HashIPPort, 1318 }) 1319 } 1320 default: 1321 // It should never hit 1322 proxier.logger.Error(nil, "Unsupported protocol type", "protocol", protocol) 1323 } 1324 if nodePortSet != nil { 1325 entryInvalidErr := false 1326 for _, entry := range entries { 1327 if valid := nodePortSet.validateEntry(entry); !valid { 1328 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", nodePortSet.Name) 1329 entryInvalidErr = true 1330 break 1331 } 1332 nodePortSet.activeEntries.Insert(entry.String()) 1333 } 1334 if entryInvalidErr { 1335 continue 1336 } 1337 } 1338 1339 // Add externaltrafficpolicy=local type nodeport entry 1340 if svcInfo.ExternalPolicyLocal() { 1341 var nodePortLocalSet *IPSet 1342 switch protocol { 1343 case utilipset.ProtocolTCP: 1344 nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetTCP] 1345 case utilipset.ProtocolUDP: 1346 nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetUDP] 1347 case utilipset.ProtocolSCTP: 1348 nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetSCTP] 1349 default: 1350 // It should never hit 1351 proxier.logger.Error(nil, "Unsupported protocol type", "protocol", protocol) 1352 } 1353 if nodePortLocalSet != nil { 1354 entryInvalidErr := false 1355 for _, entry := range entries { 1356 if valid := nodePortLocalSet.validateEntry(entry); !valid { 1357 proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", nodePortLocalSet.Name) 1358 entryInvalidErr = true 1359 break 1360 } 1361 nodePortLocalSet.activeEntries.Insert(entry.String()) 1362 } 1363 if entryInvalidErr { 1364 continue 1365 } 1366 } 1367 } 1368 1369 // Build ipvs kernel routes for each node ip address 1370 for _, nodeIP := range nodeIPs { 1371 // ipvs call 1372 serv := &utilipvs.VirtualServer{ 1373 Address: nodeIP, 1374 Port: uint16(svcInfo.NodePort()), 1375 Protocol: string(svcInfo.Protocol()), 1376 Scheduler: proxier.ipvsScheduler, 1377 } 1378 if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP { 1379 serv.Flags |= utilipvs.FlagPersistent 1380 serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds()) 1381 } 1382 // Set the source hash flag needed for the distribution method "mh" 1383 if proxier.ipvsScheduler == "mh" { 1384 serv.Flags |= utilipvs.FlagSourceHash 1385 } 1386 // There is no need to bind Node IP to dummy interface, so set parameter `bindAddr` to `false`. 
				if err := proxier.syncService(svcPortNameString, serv, false, alreadyBoundAddrs); err == nil {
					activeIPVSServices.Insert(serv.String())
					if err := proxier.syncEndpoint(svcPortName, svcInfo.ExternalPolicyLocal(), serv); err != nil {
						proxier.logger.Error(err, "Failed to sync endpoint for service", "servicePortName", svcPortName, "virtualServer", serv)
					}
				} else {
					proxier.logger.Error(err, "Failed to sync service", "servicePortName", svcPortName, "virtualServer", serv)
				}
			}
		}

		if svcInfo.HealthCheckNodePort() != 0 {
			nodePortSet := proxier.ipsetList[kubeHealthCheckNodePortSet]
			entry := &utilipset.Entry{
				// No need to provide ip info
				Port:     svcInfo.HealthCheckNodePort(),
				Protocol: "tcp",
				SetType:  utilipset.BitmapPort,
			}

			if valid := nodePortSet.validateEntry(entry); !valid {
				proxier.logger.Error(nil, "Error adding entry to ipset", "entry", entry, "ipset", nodePortSet.Name)
				continue
			}
			nodePortSet.activeEntries.Insert(entry.String())
		}
	}

	// Set the KUBE-IPVS-IPS set to the "activeBindAddrs"
	proxier.ipsetList[kubeIPVSSet].activeEntries = activeBindAddrs

	// sync ipset entries
	for _, set := range proxier.ipsetList {
		set.syncIPSetEntries()
	}

	// Write the tail-call iptables rules for the ipsets, making sure iptables is
	// only called once per ip set in a single sync loop.
	proxier.writeIptablesRules()

	// Sync iptables rules.
	// NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table.
	proxier.iptablesData.Reset()
	proxier.iptablesData.Write(proxier.natChains.Bytes())
	proxier.iptablesData.Write(proxier.natRules.Bytes())
	proxier.iptablesData.Write(proxier.filterChains.Bytes())
	proxier.iptablesData.Write(proxier.filterRules.Bytes())

	proxier.logger.V(5).Info(
		"Restoring iptables", "natChains", proxier.natChains,
		"natRules", proxier.natRules, "filterChains", proxier.filterChains,
		"filterRules", proxier.filterRules)
	err = proxier.iptables.RestoreAll(proxier.iptablesData.Bytes(), utiliptables.NoFlushTables, utiliptables.RestoreCounters)
	if err != nil {
		if pErr, ok := err.(utiliptables.ParseError); ok {
			lines := utiliptables.ExtractLines(proxier.iptablesData.Bytes(), pErr.Line(), 3)
			proxier.logger.Error(pErr, "Failed to execute iptables-restore", "rules", lines)
		} else {
			proxier.logger.Error(err, "Failed to execute iptables-restore", "rules", proxier.iptablesData.Bytes())
		}
		metrics.IPTablesRestoreFailuresTotal.Inc()
		return
	}
	for name, lastChangeTriggerTimes := range endpointUpdateResult.LastChangeTriggerTimes {
		for _, lastChangeTriggerTime := range lastChangeTriggerTimes {
			latency := metrics.SinceInSeconds(lastChangeTriggerTime)
			metrics.NetworkProgrammingLatency.Observe(latency)
			proxier.logger.V(4).Info("Network programming", "endpoint", klog.KRef(name.Namespace, name.Name), "elapsed", latency)
		}
	}

	// Remove superfluous addresses from the dummy device
	superfluousAddresses := alreadyBoundAddrs.Difference(activeBindAddrs)
	if superfluousAddresses.Len() > 0 {
		proxier.logger.V(2).Info("Removing addresses", "interface", defaultDummyDevice, "addresses", superfluousAddresses)
		for adr := range superfluousAddresses {
			if err := proxier.netlinkHandle.UnbindAddress(adr, defaultDummyDevice); err != nil {
				proxier.logger.Error(err, "UnbindAddress", "interface", defaultDummyDevice, "address", adr)
			}
		}
	}

	// currentIPVSServices represents the IPVS services listed from the system
	// (including any we have created in this sync)
	currentIPVSServices := make(map[string]*utilipvs.VirtualServer)
	appliedSvcs, err := proxier.ipvs.GetVirtualServers()
	if err == nil {
		for _, appliedSvc := range appliedSvcs {
			currentIPVSServices[appliedSvc.String()] = appliedSvc
		}
	} else {
		proxier.logger.Error(err, "Failed to get ipvs service")
	}
	proxier.cleanLegacyService(activeIPVSServices, currentIPVSServices)

	if proxier.healthzServer != nil {
		proxier.healthzServer.Updated(proxier.ipFamily)
	}
	metrics.SyncProxyRulesLastTimestamp.SetToCurrentTime()

	// Update service healthchecks. The endpoints list might include services that are
	// not "OnlyLocal", but the services list will not, and the serviceHealthServer
	// will just drop those endpoints.
	if err := proxier.serviceHealthServer.SyncServices(proxier.svcPortMap.HealthCheckNodePorts()); err != nil {
		proxier.logger.Error(err, "Error syncing healthcheck services")
	}
	if err := proxier.serviceHealthServer.SyncEndpoints(proxier.endpointsMap.LocalReadyEndpoints()); err != nil {
		proxier.logger.Error(err, "Error syncing healthcheck endpoints")
	}

	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("internal").Set(float64(proxier.serviceNoLocalEndpointsInternal.Len()))
	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("external").Set(float64(proxier.serviceNoLocalEndpointsExternal.Len()))

	// Finish housekeeping, clear stale conntrack entries for UDP Services
	conntrack.CleanStaleEntries(proxier.conntrack, proxier.svcPortMap, serviceUpdateResult, endpointUpdateResult)
}

// writeIptablesRules writes all the iptables rules that the ipvs proxier needs to
// proxier.natRules or proxier.filterRules, according to the proxier.ipsetList
// information and the ipset match relationships specified by `ipsetWithIptablesChain`.
// Some ipsets (kubeClusterIPSet, for example) have particular match rules, and their
// iptables jump relations are synced separately.
func (proxier *Proxier) writeIptablesRules() {

	// Dismiss connections from localhost early in the service chain
	loAddr := "127.0.0.0/8"
	if proxier.ipFamily == v1.IPv6Protocol {
		loAddr = "::1/128"
	}
	proxier.natRules.Write("-A", string(kubeServicesChain), "-s", loAddr, "-j", "RETURN")

	// We create this slice once here to avoid memory reallocations
	// in every loop. Note that to reuse the memory, instead of doing:
	//   slice = <some new slice>
	// you should always do one of the below:
	//   slice = slice[:0] // and then append to it
	//   slice = append(slice[:0], ...)
	// To avoid growing this slice, we arbitrarily set its size to 64, since
	// there are never more than that many arguments for a single line.
	// Note that even if we go over 64, it will still be correct - it
	// is just for efficiency, not correctness.
	args := make([]string, 64)

	for _, set := range ipsetWithIptablesChain {
		if _, find := proxier.ipsetList[set.name]; find && !proxier.ipsetList[set.name].isEmpty() {
			args = append(args[:0], "-A", set.from)
			if set.protocolMatch != "" {
				args = append(args, "-p", set.protocolMatch)
			}
			args = append(args,
				"-m", "comment", "--comment", proxier.ipsetList[set.name].getComment(),
				"-m", "set", "--match-set", proxier.ipsetList[set.name].Name,
				set.matchType,
			)
			if set.table == utiliptables.TableFilter {
				proxier.filterRules.Write(args, "-j", set.to)
			} else {
				proxier.natRules.Write(args, "-j", set.to)
			}
		}
	}

	if !proxier.ipsetList[kubeClusterIPSet].isEmpty() {
		args = append(args[:0],
			"-A", string(kubeServicesChain),
			"-m", "comment", "--comment", proxier.ipsetList[kubeClusterIPSet].getComment(),
			"-m", "set", "--match-set", proxier.ipsetList[kubeClusterIPSet].Name,
		)
		if proxier.masqueradeAll {
			proxier.natRules.Write(
				args, "dst,dst",
				"-j", string(kubeMarkMasqChain))
		} else if proxier.localDetector.IsImplemented() {
			// This masquerades off-cluster traffic to a service VIP. The idea
			// is that you can establish a static route for your Service range,
			// routing to any node, and that node will bridge into the Service
			// for you. Since that might bounce off-node, we masquerade here.
			// If/when we support "Local" policy for VIPs, we should update this.
			proxier.natRules.Write(
				args, "dst,dst",
				proxier.localDetector.IfNotLocal(),
				"-j", string(kubeMarkMasqChain))
		} else {
			// Masquerade all OUTPUT traffic coming from a service ip.
			// The kube dummy interface has all service VIPs assigned which
			// results in the service VIP being picked as the source IP to reach
			// a VIP. This leads to a connection from VIP:<random port> to
			// VIP:<service port>.
			// Always masquerading OUTPUT (node-originating) traffic with a VIP
			// source ip and service port destination fixes the outgoing connections.
			proxier.natRules.Write(
				args, "src,dst",
				"-j", string(kubeMarkMasqChain))
		}
	}

	// externalIPRules adds the iptables rules that apply to Service ExternalIPs
	externalIPRules := func(args []string) {
		// Allow traffic for external IPs that does not come from a bridge (i.e. not from a container)
		// nor from a local process to be forwarded to the service.
		// This rule roughly translates to "all traffic from off-machine".
		// This is imperfect in the face of network plugins that might not use a bridge, but we can revisit that later.
		externalTrafficOnlyArgs := append(args,
			"-m", "physdev", "!", "--physdev-is-in",
			"-m", "addrtype", "!", "--src-type", "LOCAL")
		proxier.natRules.Write(externalTrafficOnlyArgs, "-j", "ACCEPT")
		dstLocalOnlyArgs := append(args, "-m", "addrtype", "--dst-type", "LOCAL")
		// Allow traffic bound for external IPs that happen to be recognized as local IPs to stay local.
		// This covers cases like GCE load-balancers which get added to the local routing table.
		proxier.natRules.Write(dstLocalOnlyArgs, "-j", "ACCEPT")
	}

	if !proxier.ipsetList[kubeExternalIPSet].isEmpty() {
		// Build masquerade rules for packets to external IPs.
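		// The generated nat rule has roughly this shape (the set name
		// KUBE-EXTERNAL-IP and the comment text are shown illustratively; the
		// authoritative values come from proxier.ipsetList):
		//
		//	-A KUBE-SERVICES -m comment --comment "..." -m set --match-set KUBE-EXTERNAL-IP dst,dst -j KUBE-MARK-MASQ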
		args = append(args[:0],
			"-A", string(kubeServicesChain),
			"-m", "comment", "--comment", proxier.ipsetList[kubeExternalIPSet].getComment(),
			"-m", "set", "--match-set", proxier.ipsetList[kubeExternalIPSet].Name,
			"dst,dst",
		)
		proxier.natRules.Write(args, "-j", string(kubeMarkMasqChain))
		externalIPRules(args)
	}

	if !proxier.ipsetList[kubeExternalIPLocalSet].isEmpty() {
		args = append(args[:0],
			"-A", string(kubeServicesChain),
			"-m", "comment", "--comment", proxier.ipsetList[kubeExternalIPLocalSet].getComment(),
			"-m", "set", "--match-set", proxier.ipsetList[kubeExternalIPLocalSet].Name,
			"dst,dst",
		)
		externalIPRules(args)
	}

	// -A KUBE-SERVICES -m addrtype --dst-type LOCAL -j KUBE-NODE-PORT
	args = append(args[:0],
		"-A", string(kubeServicesChain),
		"-m", "addrtype", "--dst-type", "LOCAL",
	)
	proxier.natRules.Write(args, "-j", string(kubeNodePortChain))

	// mark for masquerading for KUBE-LOAD-BALANCER
	proxier.natRules.Write(
		"-A", string(kubeLoadBalancerChain),
		"-j", string(kubeMarkMasqChain),
	)

	// drop packets filtered by KUBE-SOURCE-RANGES-FIREWALL
	proxier.filterRules.Write(
		"-A", string(kubeSourceRangesFirewallChain),
		"-j", "DROP",
	)

	// disable LB access from node
	// for IPVS, src and dst would both be the lbIP
	for _, entry := range proxier.lbNoNodeAccessIPPortProtocolEntries {
		proxier.filterRules.Write(
			"-A", string(kubeIPVSOutFilterChain),
			"-s", entry.IP,
			"-m", "ipvs", "--vaddr", entry.IP, "--vproto", entry.Protocol, "--vport", strconv.Itoa(entry.Port),
			"-j", "DROP",
		)
	}

	// Accept all traffic destined for an ipvs virtual service, in case other iptables rules
	// block the traffic, which would render the ipvs rules ineffective.
	// These rules must be at the end of the KUBE-SERVICES chain.
	proxier.acceptIPVSTraffic()

	// If the masqueradeMark has been added then we want to forward that same
	// traffic; this allows NodePort traffic to be forwarded even if the default
	// FORWARD policy is not ACCEPT.
	proxier.filterRules.Write(
		"-A", string(kubeForwardChain),
		"-m", "comment", "--comment", `"kubernetes forwarding rules"`,
		"-m", "mark", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
		"-j", "ACCEPT",
	)

	// The following rule ensures that, once the initial packet has been accepted
	// by the "kubernetes forwarding rules" rule above, the rest of the connection's
	// traffic is accepted as well.
	proxier.filterRules.Write(
		"-A", string(kubeForwardChain),
		"-m", "comment", "--comment", `"kubernetes forwarding conntrack rule"`,
		"-m", "conntrack",
		"--ctstate", "RELATED,ESTABLISHED",
		"-j", "ACCEPT",
	)

	// Add rule to accept traffic towards health check node port
	proxier.filterRules.Write(
		"-A", string(kubeNodePortChain),
		"-m", "comment", "--comment", proxier.ipsetList[kubeHealthCheckNodePortSet].getComment(),
		"-m", "set", "--match-set", proxier.ipsetList[kubeHealthCheckNodePortSet].Name, "dst",
		"-j", "ACCEPT",
	)

	// Add rules to the filter/KUBE-IPVS-FILTER chain to prevent access to ports on the host through VIP addresses.
	// https://github.com/kubernetes/kubernetes/issues/72236
	proxier.filterRules.Write(
		"-A", string(kubeIPVSFilterChain),
		"-m", "set", "--match-set", proxier.ipsetList[kubeLoadBalancerSet].Name, "dst,dst", "-j", "RETURN")
	proxier.filterRules.Write(
		"-A", string(kubeIPVSFilterChain),
		"-m", "set", "--match-set", proxier.ipsetList[kubeClusterIPSet].Name, "dst,dst", "-j", "RETURN")
	proxier.filterRules.Write(
		"-A", string(kubeIPVSFilterChain),
		"-m", "set", "--match-set", proxier.ipsetList[kubeExternalIPSet].Name, "dst,dst", "-j", "RETURN")
	proxier.filterRules.Write(
		"-A", string(kubeIPVSFilterChain),
		"-m", "set", "--match-set", proxier.ipsetList[kubeExternalIPLocalSet].Name, "dst,dst", "-j", "RETURN")
	proxier.filterRules.Write(
		"-A", string(kubeIPVSFilterChain),
		"-m", "set", "--match-set", proxier.ipsetList[kubeHealthCheckNodePortSet].Name, "dst", "-j", "RETURN")
	proxier.filterRules.Write(
		"-A", string(kubeIPVSFilterChain),
		"-m", "conntrack", "--ctstate", "NEW",
		"-m", "set", "--match-set", proxier.ipsetList[kubeIPVSSet].Name, "dst", "-j", "REJECT")

	// Install the kubernetes-specific postrouting rules. We use a whole chain for
	// this so that it is easier to flush and change, for example if the mark
	// value should ever change.

	proxier.natRules.Write(
		"-A", string(kubePostroutingChain),
		"-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
		"-j", "RETURN",
	)
	// Clear the mark to avoid re-masquerading if the packet re-traverses the network stack.
	proxier.natRules.Write(
		"-A", string(kubePostroutingChain),
		// XOR proxier.masqueradeMark to unset it
		"-j", "MARK", "--xor-mark", proxier.masqueradeMark,
	)
	masqRule := []string{
		"-A", string(kubePostroutingChain),
		"-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`,
		"-j", "MASQUERADE",
	}
	if proxier.iptables.HasRandomFully() {
		masqRule = append(masqRule, "--random-fully")
	}
	proxier.natRules.Write(masqRule)

	// Install the kubernetes-specific masquerade mark rule. We use a whole chain for
	// this so that it is easier to flush and change, for example if the mark
	// value should ever change.
	proxier.natRules.Write(
		"-A", string(kubeMarkMasqChain),
		"-j", "MARK", "--or-mark", proxier.masqueradeMark,
	)

	// Write the end-of-table markers.
	proxier.filterRules.Write("COMMIT")
	proxier.natRules.Write("COMMIT")
}

func (proxier *Proxier) acceptIPVSTraffic() {
	sets := []string{kubeClusterIPSet, kubeLoadBalancerSet}
	for _, set := range sets {
		var matchType string
		if !proxier.ipsetList[set].isEmpty() {
			switch proxier.ipsetList[set].SetType {
			case utilipset.BitmapPort:
				matchType = "dst"
			default:
				matchType = "dst,dst"
			}
			proxier.natRules.Write(
				"-A", string(kubeServicesChain),
				"-m", "set", "--match-set", proxier.ipsetList[set].Name, matchType,
				"-j", "ACCEPT",
			)
		}
	}
}

// createAndLinkKubeChain creates all the kube chains that the ipvs proxier needs and writes the basic links.
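// As a rough illustration (the authoritative lists are iptablesChains and
// iptablesJumpChain in this package), the effect is similar to running
//
//	iptables -t nat -N KUBE-SERVICES
//	iptables -t nat -I PREROUTING -m comment --comment "..." -j KUBE-SERVICES
//
// for each chain and each jump, via EnsureChain and EnsureRule.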
func (proxier *Proxier) createAndLinkKubeChain() {
	for _, ch := range iptablesChains {
		if _, err := proxier.iptables.EnsureChain(ch.table, ch.chain); err != nil {
			proxier.logger.Error(err, "Failed to ensure chain exists", "table", ch.table, "chain", ch.chain)
			return
		}
		if ch.table == utiliptables.TableNAT {
			proxier.natChains.Write(utiliptables.MakeChainLine(ch.chain))
		} else {
			proxier.filterChains.Write(utiliptables.MakeChainLine(ch.chain))
		}
	}

	for _, jc := range iptablesJumpChain {
		args := []string{"-m", "comment", "--comment", jc.comment, "-j", string(jc.to)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, jc.table, jc.from, args...); err != nil {
			proxier.logger.Error(err, "Failed to ensure chain jumps", "table", jc.table, "srcChain", jc.from, "dstChain", jc.to)
		}
	}

}

func (proxier *Proxier) syncService(svcName string, vs *utilipvs.VirtualServer, bindAddr bool, alreadyBoundAddrs sets.Set[string]) error {
	appliedVirtualServer, _ := proxier.ipvs.GetVirtualServer(vs)
	if appliedVirtualServer == nil || !appliedVirtualServer.Equal(vs) {
		if appliedVirtualServer == nil {
			// IPVS service is not found, create a new service
			proxier.logger.V(3).Info("Adding new service", "serviceName", svcName, "virtualServer", vs)
			if err := proxier.ipvs.AddVirtualServer(vs); err != nil {
				proxier.logger.Error(err, "Failed to add IPVS service", "serviceName", svcName)
				return err
			}
		} else {
			// IPVS service was changed, update the existing one
			// During updates, the service VIP will not go down
			proxier.logger.V(3).Info("IPVS service was changed", "serviceName", svcName)
			if err := proxier.ipvs.UpdateVirtualServer(vs); err != nil {
				proxier.logger.Error(err, "Failed to update IPVS service")
				return err
			}
		}
	}

	// bind service address to dummy interface
	if bindAddr {
		// always attempt to bind if alreadyBoundAddrs is nil,
		// otherwise check if it's already bound and return early
		if alreadyBoundAddrs != nil && alreadyBoundAddrs.Has(vs.Address.String()) {
			return nil
		}

		proxier.logger.V(4).Info("Bind address", "address", vs.Address)
		_, err := proxier.netlinkHandle.EnsureAddressBind(vs.Address.String(), defaultDummyDevice)
		if err != nil {
			proxier.logger.Error(err, "Failed to bind service address to dummy device", "serviceName", svcName)
			return err
		}
	}

	return nil
}

func (proxier *Proxier) syncEndpoint(svcPortName proxy.ServicePortName, onlyNodeLocalEndpoints bool, vs *utilipvs.VirtualServer) error {
	appliedVirtualServer, err := proxier.ipvs.GetVirtualServer(vs)
	if err != nil {
		proxier.logger.Error(err, "Failed to get IPVS service")
		return err
	}
	if appliedVirtualServer == nil {
		return errors.New("IPVS virtual service does not exist")
	}

	// curEndpoints represents the IPVS destinations listed from the current system.
	curEndpoints := sets.New[string]()
	curDests, err := proxier.ipvs.GetRealServers(appliedVirtualServer)
	if err != nil {
		proxier.logger.Error(err, "Failed to list IPVS destinations")
		return err
	}
	for _, des := range curDests {
		curEndpoints.Insert(des.String())
	}

	endpoints := proxier.endpointsMap[svcPortName]

	// Filtering for topology aware endpoints.
	// This function will only filter endpoints if appropriate feature gates
	// are enabled and the Service does not have conflicting configuration
	// such as externalTrafficPolicy=Local.
	svcInfo, ok := proxier.svcPortMap[svcPortName]
	if !ok {
		proxier.logger.Info("Unable to filter endpoints due to missing service info", "servicePortName", svcPortName)
	} else {
		clusterEndpoints, localEndpoints, _, hasAnyEndpoints := proxy.CategorizeEndpoints(endpoints, svcInfo, proxier.nodeLabels)
		if onlyNodeLocalEndpoints {
			if len(localEndpoints) > 0 {
				endpoints = localEndpoints
			} else {
				// https://github.com/kubernetes/kubernetes/pull/97081
				// Allow access from local PODs even if no local endpoints exist.
				// Traffic from an external source will be routed but the reply
				// will have the POD address and will be discarded.
				endpoints = clusterEndpoints

				if hasAnyEndpoints && svcInfo.InternalPolicyLocal() {
					proxier.serviceNoLocalEndpointsInternal.Insert(svcPortName.NamespacedName.String())
				}

				if hasAnyEndpoints && svcInfo.ExternalPolicyLocal() {
					proxier.serviceNoLocalEndpointsExternal.Insert(svcPortName.NamespacedName.String())
				}
			}
		} else {
			endpoints = clusterEndpoints
		}
	}

	newEndpoints := sets.New[string]()
	for _, epInfo := range endpoints {
		newEndpoints.Insert(epInfo.String())
	}

	// Create new endpoints
	for _, ep := range newEndpoints.UnsortedList() {
		ip, port, err := net.SplitHostPort(ep)
		if err != nil {
			proxier.logger.Error(err, "Failed to parse endpoint", "endpoint", ep)
			continue
		}
		portNum, err := strconv.Atoi(port)
		if err != nil {
			proxier.logger.Error(err, "Failed to parse endpoint port", "port", port)
			continue
		}

		newDest := &utilipvs.RealServer{
			Address: netutils.ParseIPSloppy(ip),
			Port:    uint16(portNum),
			Weight:  1,
		}

		if curEndpoints.Has(ep) {
			// if we are syncing for the first time, loop through all current destinations and
			// reset their weight.
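			// Presumably this targets destinations that a previous kube-proxy
			// instance had weighted down to 0 for graceful termination: on the
			// first sync after a restart they are restored to newDest.Weight
			// (1, set above) so they can receive traffic again.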
			if proxier.initialSync {
				for _, dest := range curDests {
					if dest.Weight != newDest.Weight {
						err = proxier.ipvs.UpdateRealServer(appliedVirtualServer, newDest)
						if err != nil {
							proxier.logger.Error(err, "Failed to update destination", "newDest", newDest)
							continue
						}
					}
				}
			}
			// check if the new endpoint is in the graceful-delete list; if so, delete this ep from IPVS immediately
			uniqueRS := GetUniqueRSName(vs, newDest)
			if !proxier.gracefuldeleteManager.InTerminationList(uniqueRS) {
				continue
			}
			proxier.logger.V(5).Info("new ep is in graceful delete list", "uniqueRealServer", uniqueRS)
			err := proxier.gracefuldeleteManager.MoveRSOutofGracefulDeleteList(uniqueRS)
			if err != nil {
				proxier.logger.Error(err, "Failed to delete endpoint in gracefulDeleteQueue", "endpoint", ep)
				continue
			}
		}
		err = proxier.ipvs.AddRealServer(appliedVirtualServer, newDest)
		if err != nil {
			proxier.logger.Error(err, "Failed to add destination", "newDest", newDest)
			continue
		}
	}

	// Delete old endpoints
	for _, ep := range curEndpoints.Difference(newEndpoints).UnsortedList() {
		// if curEndpoint is in gracefulDelete, skip
		uniqueRS := vs.String() + "/" + ep
		if proxier.gracefuldeleteManager.InTerminationList(uniqueRS) {
			continue
		}
		ip, port, err := net.SplitHostPort(ep)
		if err != nil {
			proxier.logger.Error(err, "Failed to parse endpoint", "endpoint", ep)
			continue
		}
		portNum, err := strconv.Atoi(port)
		if err != nil {
			proxier.logger.Error(err, "Failed to parse endpoint port", "port", port)
			continue
		}

		delDest := &utilipvs.RealServer{
			Address: netutils.ParseIPSloppy(ip),
			Port:    uint16(portNum),
		}

		proxier.logger.V(5).Info("Using graceful delete", "uniqueRealServer", uniqueRS)
		err = proxier.gracefuldeleteManager.GracefulDeleteRS(appliedVirtualServer, delDest)
		if err != nil {
			proxier.logger.Error(err, "Failed to delete destination", "uniqueRealServer", uniqueRS)
			continue
		}
	}
	return nil
}

func (proxier *Proxier) cleanLegacyService(activeServices sets.Set[string], currentServices map[string]*utilipvs.VirtualServer) {
	for cs, svc := range currentServices {
		if proxier.isIPInExcludeCIDRs(svc.Address) {
			continue
		}
		if getIPFamily(svc.Address) != proxier.ipFamily {
			// Not our family
			continue
		}
		if !activeServices.Has(cs) {
			proxier.logger.V(4).Info("Delete service", "virtualServer", svc)
			if err := proxier.ipvs.DeleteVirtualServer(svc); err != nil {
				proxier.logger.Error(err, "Failed to delete service", "virtualServer", svc)
			}
		}
	}
}

func (proxier *Proxier) isIPInExcludeCIDRs(ip net.IP) bool {
	// make sure it does not fall within an excluded CIDR range.
	for _, excludedCIDR := range proxier.excludeCIDRs {
		if excludedCIDR.Contains(ip) {
			return true
		}
	}
	return false
}

func getIPFamily(ip net.IP) v1.IPFamily {
	if netutils.IsIPv4(ip) {
		return v1.IPv4Protocol
	}
	return v1.IPv6Protocol
}

// The ipvs Proxier falls back on iptables when it needs to do SNAT for egress packets.
// It will only operate on the iptables *nat table.
// Create and link the kube postrouting chain for SNAT packets.
// Chain POSTROUTING (policy ACCEPT)
// target            prot opt source       destination
// KUBE-POSTROUTING  all  --  0.0.0.0/0    0.0.0.0/0    /* kubernetes postrouting rules */
// Maintained by the kubelet network sync loop

// *nat
// :KUBE-POSTROUTING - [0:0]
// Chain KUBE-POSTROUTING (1 references)
// target      prot opt source       destination
// MASQUERADE  all  --  0.0.0.0/0    0.0.0.0/0    /* kubernetes service traffic requiring SNAT */ mark match 0x4000/0x4000

// :KUBE-MARK-MASQ - [0:0]
// Chain KUBE-MARK-MASQ (0 references)
// target  prot opt source       destination
// MARK    all  --  0.0.0.0/0    0.0.0.0/0    MARK or 0x4000
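// Putting the pieces above together, a single ClusterIP Service is expected to
// surface on a node in three places (illustrative values, e.g. a Service on
// 10.96.0.10:443/TCP with one endpoint 10.244.1.5:8443; set names are the ones
// typically created by this proxier):
//
//	- 10.96.0.10 bound to the kube-ipvs0 dummy interface,
//	- an entry of the form "10.96.0.10,tcp:443" in the KUBE-CLUSTER-IP ipset, and
//	- an IPVS virtual server 10.96.0.10:443 (rr) with real server 10.244.1.5:8443.
//
// The iptables rules written by writeIptablesRules only mark, masquerade and
// filter this traffic; the actual load balancing is done by IPVS.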