k8s.io/kubernetes@v1.29.3/pkg/proxy/nftables/proxier.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nftables

//
// NOTE: this needs to be tested in e2e since it uses nftables for everything.
//

import (
	"context"
	"crypto/sha256"
	"encoding/base32"
	"fmt"
	"net"
	"reflect"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/danwinship/knftables"

	v1 "k8s.io/api/core/v1"
	discovery "k8s.io/api/discovery/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/events"
	utilsysctl "k8s.io/component-helpers/node/util/sysctl"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/proxy"
	"k8s.io/kubernetes/pkg/proxy/conntrack"
	"k8s.io/kubernetes/pkg/proxy/healthcheck"
	"k8s.io/kubernetes/pkg/proxy/metaproxier"
	"k8s.io/kubernetes/pkg/proxy/metrics"
	proxyutil "k8s.io/kubernetes/pkg/proxy/util"
	proxyutiliptables "k8s.io/kubernetes/pkg/proxy/util/iptables"
	"k8s.io/kubernetes/pkg/util/async"
	utilexec "k8s.io/utils/exec"
	netutils "k8s.io/utils/net"
	"k8s.io/utils/ptr"
)

const (
	// Our nftables table. All of our chains/sets/maps are created inside this table,
	// so they don't need any "kube-" or "kube-proxy-" prefix of their own.
	kubeProxyTable = "kube-proxy"

	// service dispatch
	kubeServicesChain       = "services"
	kubeServiceIPsMap       = "service-ips"
	kubeServiceNodePortsMap = "service-nodeports"

	// set of IPs that accept NodePort traffic
	kubeNodePortIPsSet = "nodeport-ips"

	// handling for services with no endpoints
	kubeEndpointsCheckChain    = "endpoints-check"
	kubeNoEndpointServicesMap  = "no-endpoint-services"
	kubeNoEndpointNodePortsMap = "no-endpoint-nodeports"
	kubeRejectChain            = "reject-chain"

	// LoadBalancerSourceRanges handling
	kubeFirewallSet             = "firewall"
	kubeFirewallCheckChain      = "firewall-check"
	kubeFirewallAllowSet        = "firewall-allow"
	kubeFirewallAllowCheckChain = "firewall-allow-check"

	// masquerading
	kubeMarkMasqChain     = "mark-for-masquerade"
	kubeMasqueradingChain = "masquerading"

	// chain for special filtering rules
	kubeForwardChain = "forward"
)

const sysctlNFConntrackTCPBeLiberal = "net/netfilter/nf_conntrack_tcp_be_liberal"

// internal struct for storing service information
type servicePortInfo struct {
	*proxy.BaseServicePortInfo
	// The following fields are computed and stored for performance reasons.
	nameString             string
	clusterPolicyChainName string
	localPolicyChainName   string
	externalChainName      string
}

// returns a new proxy.ServicePort which abstracts a serviceInfo
func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *proxy.BaseServicePortInfo) proxy.ServicePort {
	svcPort := &servicePortInfo{BaseServicePortInfo: bsvcPortInfo}

	// Store the following for performance reasons.
	svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name}
	svcPort.nameString = svcPortName.String()

	chainNameBase := servicePortChainNameBase(&svcPortName, strings.ToLower(string(svcPort.Protocol())))
	svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
	svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
	svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase

	return svcPort
}

// internal struct for endpoints information
type endpointInfo struct {
	*proxy.BaseEndpointInfo

	chainName       string
	affinitySetName string
}

// returns a new proxy.Endpoint which abstracts an endpointInfo
func newEndpointInfo(baseInfo *proxy.BaseEndpointInfo, svcPortName *proxy.ServicePortName) proxy.Endpoint {
	chainNameBase := servicePortEndpointChainNameBase(svcPortName, strings.ToLower(string(svcPortName.Protocol)), baseInfo.String())
	return &endpointInfo{
		BaseEndpointInfo: baseInfo,
		chainName:        servicePortEndpointChainNamePrefix + chainNameBase,
		affinitySetName:  servicePortEndpointAffinityNamePrefix + chainNameBase,
	}
}

// Proxier is an nftables based proxy
type Proxier struct {
	// ipFamily defines the IP family which this proxier is tracking.
	ipFamily v1.IPFamily

	// endpointsChanges and serviceChanges contain all changes to endpoints and
	// services that happened since nftables was synced. For a single object,
	// changes are accumulated, i.e. previous is state from before all of them,
	// current is state after applying all of those.
	endpointsChanges *proxy.EndpointsChangeTracker
	serviceChanges   *proxy.ServiceChangeTracker

	mu           sync.Mutex // protects the following fields
	svcPortMap   proxy.ServicePortMap
	endpointsMap proxy.EndpointsMap
	nodeLabels   map[string]string
	// endpointSlicesSynced and servicesSynced are set to true
	// when corresponding objects are synced after startup. This is used to avoid
	// updating nftables with some partial data after kube-proxy restart.
	endpointSlicesSynced bool
	servicesSynced       bool
	initialized          int32
	syncRunner           *async.BoundedFrequencyRunner // governs calls to syncProxyRules
	syncPeriod           time.Duration

	// These are effectively const and do not need the mutex to be held.
	nftables       knftables.Interface
	masqueradeAll  bool
	masqueradeMark string
	exec           utilexec.Interface
	localDetector  proxyutiliptables.LocalTrafficDetector
	hostname       string
	nodeIP         net.IP
	recorder       events.EventRecorder

	serviceHealthServer healthcheck.ServiceHealthServer
	healthzServer       *healthcheck.ProxierHealthServer

	// conntrackTCPLiberal indicates whether the system sets the kernel nf_conntrack_tcp_be_liberal
	conntrackTCPLiberal bool

	// nodePortAddresses selects the interfaces where nodePort works.
	nodePortAddresses *proxyutil.NodePortAddresses
	// networkInterfacer defines an interface for several net library functions.
	// Inject for test purpose.
	networkInterfacer proxyutil.NetworkInterfacer

	// staleChains contains information about chains to be deleted later
	staleChains map[string]time.Time
}

// Proxier implements proxy.Provider
var _ proxy.Provider = &Proxier{}

// NewProxier returns a new nftables Proxier. Once a proxier is created, it will keep
// nftables up to date in the background and will not terminate if a particular nftables
// call fails.
func NewProxier(ipFamily v1.IPFamily,
	sysctl utilsysctl.Interface,
	syncPeriod time.Duration,
	minSyncPeriod time.Duration,
	masqueradeAll bool,
	masqueradeBit int,
	localDetector proxyutiliptables.LocalTrafficDetector,
	hostname string,
	nodeIP net.IP,
	recorder events.EventRecorder,
	healthzServer *healthcheck.ProxierHealthServer,
	nodePortAddressStrings []string,
	initOnly bool,
) (*Proxier, error) {
	nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings)

	// Be conservative in what you do, be liberal in what you accept from others.
	// If it's non-zero, we mark only out-of-window RST segments as INVALID.
	// Ref: https://docs.kernel.org/networking/nf_conntrack-sysctl.html
	conntrackTCPLiberal := false
	if val, err := sysctl.GetSysctl(sysctlNFConntrackTCPBeLiberal); err == nil && val != 0 {
		conntrackTCPLiberal = true
		klog.InfoS("nf_conntrack_tcp_be_liberal set, not installing DROP rules for INVALID packets")
	}

	if initOnly {
		klog.InfoS("System initialized and --init-only specified")
		return nil, nil
	}

	// Generate the masquerade mark to use for SNAT rules.
	masqueradeValue := 1 << uint(masqueradeBit)
	masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue)
	klog.V(2).InfoS("Using nftables mark for masquerade", "ipFamily", ipFamily, "mark", masqueradeMark)

	serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer)

	var nftablesFamily knftables.Family
	if ipFamily == v1.IPv4Protocol {
		nftablesFamily = knftables.IPv4Family
	} else {
		nftablesFamily = knftables.IPv6Family
	}
	nft, err := knftables.New(nftablesFamily, kubeProxyTable)
	if err != nil {
		return nil, err
	}

	proxier := &Proxier{
		ipFamily:            ipFamily,
		svcPortMap:          make(proxy.ServicePortMap),
		serviceChanges:      proxy.NewServiceChangeTracker(newServiceInfo, ipFamily, recorder, nil),
		endpointsMap:        make(proxy.EndpointsMap),
		endpointsChanges:    proxy.NewEndpointsChangeTracker(hostname, newEndpointInfo, ipFamily, recorder, nil),
		syncPeriod:          syncPeriod,
		nftables:            nft,
		masqueradeAll:       masqueradeAll,
		masqueradeMark:      masqueradeMark,
		exec:                utilexec.New(),
		localDetector:       localDetector,
		hostname:            hostname,
		nodeIP:              nodeIP,
		recorder:            recorder,
		serviceHealthServer: serviceHealthServer,
		healthzServer:       healthzServer,
		nodePortAddresses:   nodePortAddresses,
		networkInterfacer:   proxyutil.RealNetwork{},
		conntrackTCPLiberal: conntrackTCPLiberal,
		staleChains:         make(map[string]time.Time),
	}

	burstSyncs := 2
	klog.V(2).InfoS("NFTables sync params", "ipFamily", ipFamily, "minSyncPeriod", minSyncPeriod, "syncPeriod", syncPeriod, "burstSyncs", burstSyncs)
	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)

	return proxier, nil
}

// NewDualStackProxier creates a MetaProxier instance, with IPv4 and IPv6 proxies.
func NewDualStackProxier(
	sysctl utilsysctl.Interface,
	syncPeriod time.Duration,
	minSyncPeriod time.Duration,
	masqueradeAll bool,
	masqueradeBit int,
	localDetectors [2]proxyutiliptables.LocalTrafficDetector,
	hostname string,
	nodeIPs map[v1.IPFamily]net.IP,
	recorder events.EventRecorder,
	healthzServer *healthcheck.ProxierHealthServer,
	nodePortAddresses []string,
	initOnly bool,
) (proxy.Provider, error) {
	// Create an ipv4 instance of the single-stack proxier
	ipv4Proxier, err := NewProxier(v1.IPv4Protocol, sysctl,
		syncPeriod, minSyncPeriod, masqueradeAll, masqueradeBit, localDetectors[0], hostname,
		nodeIPs[v1.IPv4Protocol], recorder, healthzServer, nodePortAddresses, initOnly)
	if err != nil {
		return nil, fmt.Errorf("unable to create ipv4 proxier: %v", err)
	}

	ipv6Proxier, err := NewProxier(v1.IPv6Protocol, sysctl,
		syncPeriod, minSyncPeriod, masqueradeAll, masqueradeBit, localDetectors[1], hostname,
		nodeIPs[v1.IPv6Protocol], recorder, healthzServer, nodePortAddresses, initOnly)
	if err != nil {
		return nil, fmt.Errorf("unable to create ipv6 proxier: %v", err)
	}
	if initOnly {
		return nil, nil
	}
	return metaproxier.NewMetaProxier(ipv4Proxier, ipv6Proxier), nil
}

// nftablesBaseChains lists our "base chains"; those that are directly connected to the
// netfilter hooks (e.g., "postrouting", "input", etc.), as opposed to "regular" chains,
// which are only run when a rule jumps to them. See
// https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains.
//
// These are set up from setupNFTables() and then not directly referenced by
// syncProxyRules().
//
// All of our base chains have names that are just "${type}-${hook}", e.g., "nat-prerouting".
type nftablesBaseChain struct {
	name      string
	chainType knftables.BaseChainType
	hook      knftables.BaseChainHook
	priority  knftables.BaseChainPriority
}

var nftablesBaseChains = []nftablesBaseChain{
	// We want our filtering rules to operate on pre-DNAT dest IPs, so our filter
	// chains have to run before DNAT.
	{"filter-input", knftables.FilterType, knftables.InputHook, knftables.DNATPriority + "-1"},
	{"filter-forward", knftables.FilterType, knftables.ForwardHook, knftables.DNATPriority + "-1"},
	{"filter-output", knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "-1"},
	{"nat-prerouting", knftables.NATType, knftables.PreroutingHook, knftables.DNATPriority},
	{"nat-output", knftables.NATType, knftables.OutputHook, knftables.DNATPriority},
	{"nat-postrouting", knftables.NATType, knftables.PostroutingHook, knftables.SNATPriority},
}

// nftablesJumpChains lists our top-level "regular chains" that are jumped to directly
// from one of the base chains. These are set up from setupNFTables(), and some of them
// are also referenced in syncProxyRules().
type nftablesJumpChain struct {
	dstChain  string
	srcChain  string
	extraArgs string
}

var nftablesJumpChains = []nftablesJumpChain{
	{kubeEndpointsCheckChain, "filter-input", "ct state new"},
	{kubeEndpointsCheckChain, "filter-forward", "ct state new"},
	{kubeEndpointsCheckChain, "filter-output", "ct state new"},

	{kubeForwardChain, "filter-forward", ""},

	{kubeFirewallCheckChain, "filter-input", "ct state new"},
	{kubeFirewallCheckChain, "filter-output", "ct state new"},
	{kubeFirewallCheckChain, "filter-forward", "ct state new"},

	{kubeServicesChain, "nat-output", ""},
	{kubeServicesChain, "nat-prerouting", ""},
	{kubeMasqueradingChain, "nat-postrouting", ""},
}

// ensureChain adds commands to tx to ensure that chain exists and doesn't contain
// anything from before this transaction (using createdChains to ensure that we don't
// Flush a chain more than once and lose *new* rules as well.)
func ensureChain(chain string, tx *knftables.Transaction, createdChains sets.Set[string]) {
	if createdChains.Has(chain) {
		return
	}
	tx.Add(&knftables.Chain{
		Name: chain,
	})
	tx.Flush(&knftables.Chain{
		Name: chain,
	})
	createdChains.Insert(chain)
}

func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
	ipX := "ip"
	ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value
	noLocalhost := "ip daddr != 127.0.0.0/8"
	if proxier.ipFamily == v1.IPv6Protocol {
		ipX = "ip6"
		ipvX_addr = "ipv6_addr"
		noLocalhost = "ip6 daddr != ::1"
	}

	tx.Add(&knftables.Table{
		Comment: ptr.To("rules for kube-proxy"),
	})

	// Create and flush base chains
	for _, bc := range nftablesBaseChains {
		chain := &knftables.Chain{
			Name:     bc.name,
			Type:     ptr.To(bc.chainType),
			Hook:     ptr.To(bc.hook),
			Priority: ptr.To(bc.priority),
		}
		tx.Add(chain)
		tx.Flush(chain)
	}

	// Create and flush ordinary chains and add rules jumping to them
	createdChains := sets.New[string]()
	for _, c := range nftablesJumpChains {
		ensureChain(c.dstChain, tx, createdChains)
		tx.Add(&knftables.Rule{
			Chain: c.srcChain,
			Rule: knftables.Concat(
				c.extraArgs,
				"jump", c.dstChain,
			),
		})
	}

	// Ensure all of our other "top-level" chains exist
	for _, chain := range []string{kubeServicesChain, kubeForwardChain, kubeMasqueradingChain, kubeMarkMasqChain} {
		ensureChain(chain, tx, createdChains)
	}

	// Add the rules in the mark-for-masquerade and masquerading chains
	tx.Add(&knftables.Rule{
		Chain: kubeMarkMasqChain,
		Rule: knftables.Concat(
			"mark", "set", "mark", "or", proxier.masqueradeMark,
		),
	})

	tx.Add(&knftables.Rule{
		Chain: kubeMasqueradingChain,
		Rule: knftables.Concat(
			"mark", "and", proxier.masqueradeMark, "==", "0",
			"return",
		),
	})
	tx.Add(&knftables.Rule{
		Chain: kubeMasqueradingChain,
		Rule: knftables.Concat(
			"mark", "set", "mark", "xor", proxier.masqueradeMark,
		),
	})
	tx.Add(&knftables.Rule{
		Chain: kubeMasqueradingChain,
		Rule:  "masquerade fully-random",
	})

	// Drop packets in the INVALID state, which could otherwise cause
	// unexpected connection resets if nf_conntrack_tcp_be_liberal is not set.
	// Ref: https://github.com/kubernetes/kubernetes/issues/74839
	// Ref: https://github.com/kubernetes/kubernetes/issues/117924
	if !proxier.conntrackTCPLiberal {
		tx.Add(&knftables.Rule{
			Chain: kubeForwardChain,
			Rule:  "ct state invalid drop",
		})
	}

	// Fill in nodeport-ips set if needed (or delete it if not). (We do "add+delete"
	// rather than just "delete" when we want to ensure the set doesn't exist, because
	// doing just "delete" would return an error if the set didn't exist.)
	tx.Add(&knftables.Set{
		Name:    kubeNodePortIPsSet,
		Type:    ipvX_addr,
		Comment: ptr.To("IPs that accept NodePort traffic"),
	})
	if proxier.nodePortAddresses.MatchAll() {
		tx.Delete(&knftables.Set{
			Name: kubeNodePortIPsSet,
		})
	} else {
		tx.Flush(&knftables.Set{
			Name: kubeNodePortIPsSet,
		})
		nodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer)
		if err != nil {
			klog.ErrorS(err, "Failed to get node ip address matching nodeport cidrs, services with nodeport may not work as intended", "CIDRs", proxier.nodePortAddresses)
		}
		for _, ip := range nodeIPs {
			if ip.IsLoopback() {
				klog.ErrorS(nil, "--nodeport-addresses includes localhost but localhost NodePorts are not supported", "address", ip.String())
				continue
			}
			tx.Add(&knftables.Element{
				Set: kubeNodePortIPsSet,
				Key: []string{
					ip.String(),
				},
			})
		}
	}

	// Set up "no endpoints" drop/reject handling
	tx.Add(&knftables.Map{
		Name:    kubeNoEndpointServicesMap,
		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
		Comment: ptr.To("vmap to drop or reject packets to services with no endpoints"),
	})
	tx.Add(&knftables.Map{
		Name:    kubeNoEndpointNodePortsMap,
		Type:    "inet_proto . inet_service : verdict",
		Comment: ptr.To("vmap to drop or reject packets to service nodeports with no endpoints"),
	})

	tx.Add(&knftables.Chain{
		Name:    kubeRejectChain,
		Comment: ptr.To("helper for @no-endpoint-services / @no-endpoint-nodeports"),
	})
	tx.Flush(&knftables.Chain{
		Name: kubeRejectChain,
	})
	tx.Add(&knftables.Rule{
		Chain: kubeRejectChain,
		Rule:  "reject",
	})

	tx.Add(&knftables.Rule{
		Chain: kubeEndpointsCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", kubeNoEndpointServicesMap,
		),
	})

	if proxier.nodePortAddresses.MatchAll() {
		tx.Add(&knftables.Rule{
			Chain: kubeEndpointsCheckChain,
			Rule: knftables.Concat(
				"fib daddr type local",
				noLocalhost,
				"meta l4proto . th dport",
				"vmap", "@", kubeNoEndpointNodePortsMap,
			),
		})
	} else {
		tx.Add(&knftables.Rule{
			Chain: kubeEndpointsCheckChain,
			Rule: knftables.Concat(
				ipX, "daddr", "@", kubeNodePortIPsSet,
				"meta l4proto . th dport",
				"vmap", "@", kubeNoEndpointNodePortsMap,
			),
		})
	}

	// Set up LoadBalancerSourceRanges firewalling
	tx.Add(&knftables.Set{
		Name:    kubeFirewallSet,
		Type:    ipvX_addr + " . inet_proto . inet_service",
		Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
	})
	tx.Add(&knftables.Set{
		Name:    kubeFirewallAllowSet,
		Type:    ipvX_addr + " . inet_proto . inet_service . " + ipvX_addr,
		Flags:   []knftables.SetFlag{knftables.IntervalFlag},
		Comment: ptr.To("destinations+sources that are allowed by LoadBalancerSourceRanges"),
	})

	ensureChain(kubeFirewallCheckChain, tx, createdChains)
	ensureChain(kubeFirewallAllowCheckChain, tx, createdChains)
	tx.Add(&knftables.Rule{
		Chain: kubeFirewallCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport", "@", kubeFirewallSet,
			"jump", kubeFirewallAllowCheckChain,
		),
	})
	tx.Add(&knftables.Rule{
		Chain: kubeFirewallAllowCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport", ".", ipX, "saddr", "@", kubeFirewallAllowSet,
			"return",
		),
	})
	tx.Add(&knftables.Rule{
		Chain: kubeFirewallAllowCheckChain,
		Rule:  "drop",
	})

	// Set up service dispatch
	tx.Add(&knftables.Map{
		Name:    kubeServiceIPsMap,
		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
		Comment: ptr.To("ClusterIP, ExternalIP and LoadBalancer IP traffic"),
	})
	tx.Add(&knftables.Map{
		Name:    kubeServiceNodePortsMap,
		Type:    "inet_proto . inet_service : verdict",
		Comment: ptr.To("NodePort traffic"),
	})
	tx.Add(&knftables.Rule{
		Chain: kubeServicesChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", kubeServiceIPsMap,
		),
	})
	if proxier.nodePortAddresses.MatchAll() {
		tx.Add(&knftables.Rule{
			Chain: kubeServicesChain,
			Rule: knftables.Concat(
				"fib daddr type local",
				noLocalhost,
				"meta l4proto . th dport",
				"vmap", "@", kubeServiceNodePortsMap,
			),
		})
	} else {
		tx.Add(&knftables.Rule{
			Chain: kubeServicesChain,
			Rule: knftables.Concat(
				ipX, "daddr @nodeport-ips",
				"meta l4proto . th dport",
				"vmap", "@", kubeServiceNodePortsMap,
			),
		})
	}
}

// CleanupLeftovers removes all nftables rules and chains created by the Proxier.
// It returns true if an error was encountered. Errors are logged.
func CleanupLeftovers() bool {
	var encounteredError bool

	for _, family := range []knftables.Family{knftables.IPv4Family, knftables.IPv6Family} {
		nft, err := knftables.New(family, kubeProxyTable)
		if err == nil {
			tx := nft.NewTransaction()
			tx.Delete(&knftables.Table{})
			err = nft.Run(context.TODO(), tx)
		}
		if err != nil && !knftables.IsNotFound(err) {
			klog.ErrorS(err, "Error cleaning up nftables rules")
			encounteredError = true
		}
	}

	return encounteredError
}

// Sync is called to synchronize the proxier state to nftables as soon as possible.
func (proxier *Proxier) Sync() {
	if proxier.healthzServer != nil {
		proxier.healthzServer.QueuedUpdate(proxier.ipFamily)
	}
	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
	proxier.syncRunner.Run()
}

// SyncLoop runs periodic work. This is expected to run as a goroutine or as the main loop of the app. It does not return.
func (proxier *Proxier) SyncLoop() {
	// Update healthz timestamp at beginning in case Sync() never succeeds.
	if proxier.healthzServer != nil {
		proxier.healthzServer.Updated(proxier.ipFamily)
	}

	// synthesize "last change queued" time as the informers are syncing.
	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
	proxier.syncRunner.Loop(wait.NeverStop)
}

func (proxier *Proxier) setInitialized(value bool) {
	var initialized int32
	if value {
		initialized = 1
	}
	atomic.StoreInt32(&proxier.initialized, initialized)
}

func (proxier *Proxier) isInitialized() bool {
	return atomic.LoadInt32(&proxier.initialized) > 0
}

// OnServiceAdd is called whenever creation of a new service object
// is observed.
func (proxier *Proxier) OnServiceAdd(service *v1.Service) {
	proxier.OnServiceUpdate(nil, service)
}

// OnServiceUpdate is called whenever modification of an existing
// service object is observed.
func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) {
	if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnServiceDelete is called whenever deletion of an existing service
// object is observed.
func (proxier *Proxier) OnServiceDelete(service *v1.Service) {
	proxier.OnServiceUpdate(service, nil)
}

// OnServiceSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnServiceSynced() {
	proxier.mu.Lock()
	proxier.servicesSynced = true
	proxier.setInitialized(proxier.endpointSlicesSynced)
	proxier.mu.Unlock()

	// Sync unconditionally - this is called once per lifetime.
	proxier.syncProxyRules()
}

// OnEndpointSliceAdd is called whenever creation of a new endpoint slice object
// is observed.
func (proxier *Proxier) OnEndpointSliceAdd(endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSliceUpdate is called whenever modification of an existing endpoint
// slice object is observed.
func (proxier *Proxier) OnEndpointSliceUpdate(_, endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSliceDelete is called whenever deletion of an existing endpoint slice
// object is observed.
func (proxier *Proxier) OnEndpointSliceDelete(endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, true) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSlicesSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnEndpointSlicesSynced() {
	proxier.mu.Lock()
	proxier.endpointSlicesSynced = true
	proxier.setInitialized(proxier.servicesSynced)
	proxier.mu.Unlock()

	// Sync unconditionally - this is called once per lifetime.
	proxier.syncProxyRules()
}

// OnNodeAdd is called whenever creation of a new node object
// is observed.
func (proxier *Proxier) OnNodeAdd(node *v1.Node) {
	if node.Name != proxier.hostname {
		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = map[string]string{}
	for k, v := range node.Labels {
		proxier.nodeLabels[k] = v
	}
	proxier.mu.Unlock()
	klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels)

	proxier.Sync()
}

// OnNodeUpdate is called whenever modification of an existing
// node object is observed.
func (proxier *Proxier) OnNodeUpdate(oldNode, node *v1.Node) {
	if node.Name != proxier.hostname {
		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = map[string]string{}
	for k, v := range node.Labels {
		proxier.nodeLabels[k] = v
	}
	proxier.mu.Unlock()
	klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels)

	proxier.Sync()
}

// OnNodeDelete is called whenever deletion of an existing node
// object is observed.
func (proxier *Proxier) OnNodeDelete(node *v1.Node) {
	if node.Name != proxier.hostname {
		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = nil
	proxier.mu.Unlock()

	proxier.Sync()
}

// OnNodeSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnNodeSynced() {
}

const (
	// Maximum length for one of our chain name prefixes, including the trailing
	// hyphen.
	chainNamePrefixLengthMax = 16

	// Maximum length of the string returned from servicePortChainNameBase or
	// servicePortEndpointChainNameBase.
	chainNameBaseLengthMax = knftables.NameLengthMax - chainNamePrefixLengthMax
)

const (
	servicePortPolicyClusterChainNamePrefix = "service-"
	servicePortPolicyLocalChainNamePrefix   = "local-"
	serviceExternalChainNamePrefix          = "external-"
	servicePortEndpointChainNamePrefix      = "endpoint-"
	servicePortEndpointAffinityNamePrefix   = "affinity-"
)

// hashAndTruncate prefixes name with a hash of itself and then truncates to
// chainNameBaseLengthMax. The hash ensures that (a) the name is still unique if we have
// to truncate the end, and (b) it's visually distinguishable from other chains that would
// otherwise have nearly identical names (e.g., different endpoint chains for a given
// service that differ in only a single digit).
func hashAndTruncate(name string) string {
	hash := sha256.Sum256([]byte(name))
	encoded := base32.StdEncoding.EncodeToString(hash[:])
	name = encoded[:8] + "-" + name
	if len(name) > chainNameBaseLengthMax {
		name = name[:chainNameBaseLengthMax-3] + "..."
	}
	return name
}

// servicePortChainNameBase returns the base name for a chain for the given ServicePort.
// This is something like "HASH-namespace/serviceName/protocol/portName", e.g.,
// "ULMVA6XW-ns1/svc1/tcp/p80".
func servicePortChainNameBase(servicePortName *proxy.ServicePortName, protocol string) string {
	// nftables chains can contain the characters [A-Za-z0-9_./-] (but must start with
	// a letter, underscore, or dot).
	//
	// Namespace, Service, and Port names can contain [a-z0-9-] (with some additional
	// restrictions that aren't relevant here).
	//
	// Protocol is /(tcp|udp|sctp)/.
	//
	// Thus, we can safely use all Namespace names, Service names, protocol values,
	// and Port names directly in nftables chain names (though note that this assumes
	// that the chain name won't *start* with any of those strings, since that might
	// be illegal). We use "/" to separate the parts of the name, which is one of the
	// two characters allowed in a chain name that isn't allowed in our input strings.

	name := fmt.Sprintf("%s/%s/%s/%s",
		servicePortName.NamespacedName.Namespace,
		servicePortName.NamespacedName.Name,
		protocol,
		servicePortName.Port,
	)

	// The namespace, service, and port name can each be up to 63 characters, protocol
	// can be up to 4, plus 8 for the hash and 4 additional punctuation characters.
	// That's a total of 205, which is less than chainNameBaseLengthMax (240). So this
	// will never actually return a truncated name.
	return hashAndTruncate(name)
}

// servicePortEndpointChainNameBase returns the suffix for chain names for the given
// endpoint. This is something like
// "HASH-namespace/serviceName/protocol/portName__endpointIP/endpointPort", e.g.,
// "5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80".
func servicePortEndpointChainNameBase(servicePortName *proxy.ServicePortName, protocol, endpoint string) string {
	// As above in servicePortChainNameBase: Namespace, Service, Port, Protocol, and
	// EndpointPort are all safe to copy into the chain name directly. But if
	// EndpointIP is IPv6 then it will contain colons, which aren't allowed in a chain
	// name. IPv6 IPs are also quite long, but we can't safely truncate them (e.g. to
	// only the final segment) because (especially for manually-created external
	// endpoints), we can't know for sure that any part of them is redundant.

	endpointIP, endpointPort, _ := net.SplitHostPort(endpoint)
	if strings.Contains(endpointIP, ":") {
		endpointIP = strings.ReplaceAll(endpointIP, ":", ".")
	}

	// As above, we use "/" to separate parts of the name, and "__" to separate the
	// "service" part from the "endpoint" part.
	name := fmt.Sprintf("%s/%s/%s/%s__%s/%s",
		servicePortName.NamespacedName.Namespace,
		servicePortName.NamespacedName.Name,
		protocol,
		servicePortName.Port,
		endpointIP,
		endpointPort,
	)

	// The part of name before the "__" can be up to 205 characters (as with
	// servicePortChainNameBase above). An IPv6 address can be up to 39 characters, and
	// a port can be up to 5 digits, plus 3 punctuation characters gives a max total
	// length of 252, well over chainNameBaseLengthMax (240), so truncation is
	// theoretically possible (though incredibly unlikely).
	return hashAndTruncate(name)
}

func isServiceChainName(chainString string) bool {
	// The chains returned from servicePortChainNameBase and
	// servicePortEndpointChainNameBase will always have at least one "/" in them.
	// Since none of our "stock" chain names use slashes, we can distinguish them this
	// way.
	return strings.Contains(chainString, "/")
}

func isAffinitySetName(set string) bool {
	return strings.HasPrefix(set, servicePortEndpointAffinityNamePrefix)
}

// This is where all of the nftables calls happen.
// This assumes proxier.mu is NOT held
func (proxier *Proxier) syncProxyRules() {
	proxier.mu.Lock()
	defer proxier.mu.Unlock()

	// don't sync rules till we've received services and endpoints
	if !proxier.isInitialized() {
		klog.V(2).InfoS("Not syncing nftables until Services and Endpoints have been received from master")
		return
	}

	//
	// Below this point we will not return until we try to write the nftables rules.
	//

	// Keep track of how long syncs take.
	start := time.Now()
	defer func() {
		metrics.SyncProxyRulesLatency.Observe(metrics.SinceInSeconds(start))
		klog.V(2).InfoS("SyncProxyRules complete", "elapsed", time.Since(start))
	}()

	serviceUpdateResult := proxier.svcPortMap.Update(proxier.serviceChanges)
	endpointUpdateResult := proxier.endpointsMap.Update(proxier.endpointsChanges)

	klog.V(2).InfoS("Syncing nftables rules")

	success := false
	defer func() {
		if !success {
			klog.InfoS("Sync failed", "retryingTime", proxier.syncPeriod)
			proxier.syncRunner.RetryAfter(proxier.syncPeriod)
		}
	}()

	// If there are sufficiently-stale chains left over from previous transactions,
	// try to delete them now.
	if len(proxier.staleChains) > 0 {
		oneSecondAgo := start.Add(-time.Second)
		tx := proxier.nftables.NewTransaction()
		deleted := 0
		for chain, modtime := range proxier.staleChains {
			if modtime.Before(oneSecondAgo) {
				tx.Delete(&knftables.Chain{
					Name: chain,
				})
				delete(proxier.staleChains, chain)
				deleted++
			}
		}
		if deleted > 0 {
			klog.InfoS("Deleting stale nftables chains", "numChains", deleted)
			err := proxier.nftables.Run(context.TODO(), tx)
			if err != nil {
				// We already deleted the entries from staleChains, but if
				// the chains still exist, they'll just get added back
				// (with a later timestamp) at the end of the sync.
				klog.ErrorS(err, "Unable to delete stale chains; will retry later")
				// FIXME: metric
			}
		}
	}

	// Now start the actual syncing transaction
	tx := proxier.nftables.NewTransaction()
	proxier.setupNFTables(tx)

	// We need to use, e.g., "ip daddr" for IPv4 but "ip6 daddr" for IPv6
	ipX := "ip"
	ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value
	if proxier.ipFamily == v1.IPv6Protocol {
		ipX = "ip6"
		ipvX_addr = "ipv6_addr"
	}

	// We currently fully rebuild our sets and maps on each resync
	tx.Flush(&knftables.Set{
		Name: kubeFirewallSet,
	})
	tx.Flush(&knftables.Set{
		Name: kubeFirewallAllowSet,
	})
	tx.Flush(&knftables.Map{
		Name: kubeNoEndpointServicesMap,
	})
	tx.Flush(&knftables.Map{
		Name: kubeNoEndpointNodePortsMap,
	})
	tx.Flush(&knftables.Map{
		Name: kubeServiceIPsMap,
	})
	tx.Flush(&knftables.Map{
		Name: kubeServiceNodePortsMap,
	})

	// Accumulate service/endpoint chains and affinity sets to keep.
	activeChains := sets.New[string]()
	activeAffinitySets := sets.New[string]()

	// Compute total number of endpoint chains across all services
	// to get a sense of how big the cluster is.
	totalEndpoints := 0
	for svcName := range proxier.svcPortMap {
		totalEndpoints += len(proxier.endpointsMap[svcName])
	}

	// These two variables are used to publish the sync_proxy_rules_no_endpoints_total
	// metric.
	serviceNoLocalEndpointsTotalInternal := 0
	serviceNoLocalEndpointsTotalExternal := 0

	// Build rules for each service-port.
	for svcName, svc := range proxier.svcPortMap {
		svcInfo, ok := svc.(*servicePortInfo)
		if !ok {
			klog.ErrorS(nil, "Failed to cast serviceInfo", "serviceName", svcName)
			continue
		}
		protocol := strings.ToLower(string(svcInfo.Protocol()))
		svcPortNameString := svcInfo.nameString

		// Figure out the endpoints for Cluster and Local traffic policy.
		// allLocallyReachableEndpoints is the set of all endpoints that can be routed to
		// from this node, given the service's traffic policies. hasEndpoints is true
		// if the service has any usable endpoints on any node, not just this one.
		allEndpoints := proxier.endpointsMap[svcName]
		clusterEndpoints, localEndpoints, allLocallyReachableEndpoints, hasEndpoints := proxy.CategorizeEndpoints(allEndpoints, svcInfo, proxier.nodeLabels)

		// Note the endpoint chains that will be used
		for _, ep := range allLocallyReachableEndpoints {
			if epInfo, ok := ep.(*endpointInfo); ok {
				ensureChain(epInfo.chainName, tx, activeChains)
			}
		}

		// clusterPolicyChain contains the endpoints used with "Cluster" traffic policy
		clusterPolicyChain := svcInfo.clusterPolicyChainName
		usesClusterPolicyChain := len(clusterEndpoints) > 0 && svcInfo.UsesClusterEndpoints()
		if usesClusterPolicyChain {
			ensureChain(clusterPolicyChain, tx, activeChains)
		}

		// localPolicyChain contains the endpoints used with "Local" traffic policy
		localPolicyChain := svcInfo.localPolicyChainName
		usesLocalPolicyChain := len(localEndpoints) > 0 && svcInfo.UsesLocalEndpoints()
		if usesLocalPolicyChain {
			ensureChain(localPolicyChain, tx, activeChains)
		}

		// internalPolicyChain is the chain containing the endpoints for
		// "internal" (ClusterIP) traffic. internalTrafficChain is the chain that
		// internal traffic is routed to (which is always the same as
		// internalPolicyChain). hasInternalEndpoints is true if we should
		// generate rules pointing to internalTrafficChain, or false if there are
		// no available internal endpoints.
		internalPolicyChain := clusterPolicyChain
		hasInternalEndpoints := hasEndpoints
		if svcInfo.InternalPolicyLocal() {
			internalPolicyChain = localPolicyChain
			if len(localEndpoints) == 0 {
				hasInternalEndpoints = false
			}
		}
		internalTrafficChain := internalPolicyChain

		// Similarly, externalPolicyChain is the chain containing the endpoints
		// for "external" (NodePort, LoadBalancer, and ExternalIP) traffic.
		// externalTrafficChain is the chain that external traffic is routed to
		// (which is always the service's "EXT" chain). hasExternalEndpoints is
		// true if there are endpoints that will be reached by external traffic.
		// (But we may still have to generate externalTrafficChain even if there
		// are no external endpoints, to ensure that the short-circuit rules for
		// local traffic are set up.)
		externalPolicyChain := clusterPolicyChain
		hasExternalEndpoints := hasEndpoints
		if svcInfo.ExternalPolicyLocal() {
			externalPolicyChain = localPolicyChain
			if len(localEndpoints) == 0 {
				hasExternalEndpoints = false
			}
		}
		externalTrafficChain := svcInfo.externalChainName // eventually jumps to externalPolicyChain

		// usesExternalTrafficChain is based on hasEndpoints, not hasExternalEndpoints,
		// because we need the local-traffic-short-circuiting rules even when there
		// are no externally-usable endpoints.
		usesExternalTrafficChain := hasEndpoints && svcInfo.ExternallyAccessible()
		if usesExternalTrafficChain {
			ensureChain(externalTrafficChain, tx, activeChains)
		}

		var internalTrafficFilterVerdict, externalTrafficFilterVerdict string
		if !hasEndpoints {
			// The service has no endpoints at all; hasInternalEndpoints and
			// hasExternalEndpoints will also be false, and we will not
			// generate any chains in the "nat" table for the service; only
			// rules in the "filter" table rejecting incoming packets for
			// the service's IPs.
			internalTrafficFilterVerdict = fmt.Sprintf("goto %s", kubeRejectChain)
			externalTrafficFilterVerdict = fmt.Sprintf("goto %s", kubeRejectChain)
		} else {
			if !hasInternalEndpoints {
				// The internalTrafficPolicy is "Local" but there are no local
				// endpoints. Traffic to the clusterIP will be dropped, but
				// external traffic may still be accepted.
				internalTrafficFilterVerdict = "drop"
				serviceNoLocalEndpointsTotalInternal++
			}
			if !hasExternalEndpoints {
				// The externalTrafficPolicy is "Local" but there are no
				// local endpoints. Traffic to "external" IPs from outside
				// the cluster will be dropped, but traffic from inside
				// the cluster may still be accepted.
				externalTrafficFilterVerdict = "drop"
				serviceNoLocalEndpointsTotalExternal++
			}
		}

		// Capture the clusterIP.
		if hasInternalEndpoints {
			tx.Add(&knftables.Element{
				Map: kubeServiceIPsMap,
				Key: []string{
					svcInfo.ClusterIP().String(),
					protocol,
					strconv.Itoa(svcInfo.Port()),
				},
				Value: []string{
					fmt.Sprintf("goto %s", internalTrafficChain),
				},
			})
		} else {
			// No endpoints.
			tx.Add(&knftables.Element{
				Map: kubeNoEndpointServicesMap,
				Key: []string{
					svcInfo.ClusterIP().String(),
					protocol,
					strconv.Itoa(svcInfo.Port()),
				},
				Value: []string{
					internalTrafficFilterVerdict,
				},
				Comment: &svcPortNameString,
			})
		}

		// Capture externalIPs.
		for _, externalIP := range svcInfo.ExternalIPStrings() {
			if hasEndpoints {
				// Send traffic bound for external IPs to the "external
				// destinations" chain.
				tx.Add(&knftables.Element{
					Map: kubeServiceIPsMap,
					Key: []string{
						externalIP,
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", externalTrafficChain),
					},
				})
			}
			if !hasExternalEndpoints {
				// Either no endpoints at all (REJECT) or no endpoints for
				// external traffic (DROP anything that didn't get
				// short-circuited by the EXT chain.)
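				// Illustrative example (the address and port here are hypothetical,
				// not from the original source): for a service with external IP
				// 192.0.2.10, TCP port 80, externalTrafficPolicy=Local, and no local
				// endpoints, the element added below is roughly
				//   192.0.2.10 . tcp . 80 : drop
				// in @no-endpoint-services, which the endpoints-check chain consults
				// via a verdict-map lookup.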
				tx.Add(&knftables.Element{
					Map: kubeNoEndpointServicesMap,
					Key: []string{
						externalIP,
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						externalTrafficFilterVerdict,
					},
					Comment: &svcPortNameString,
				})
			}
		}

		// Capture load-balancer ingress.
		for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
			if hasEndpoints {
				tx.Add(&knftables.Element{
					Map: kubeServiceIPsMap,
					Key: []string{
						lbip,
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", externalTrafficChain),
					},
				})
			}

			if len(svcInfo.LoadBalancerSourceRanges()) > 0 {
				tx.Add(&knftables.Element{
					Set: kubeFirewallSet,
					Key: []string{
						lbip,
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Comment: &svcPortNameString,
				})

				allowFromNode := false
				for _, src := range svcInfo.LoadBalancerSourceRanges() {
					_, cidr, _ := netutils.ParseCIDRSloppy(src)
					if cidr == nil {
						continue
					}
					tx.Add(&knftables.Element{
						Set: kubeFirewallAllowSet,
						Key: []string{
							lbip,
							protocol,
							strconv.Itoa(svcInfo.Port()),
							src,
						},
						Comment: &svcPortNameString,
					})
					if cidr.Contains(proxier.nodeIP) {
						allowFromNode = true
					}
				}
				// For VIP-like LBs, the VIP is often added as a local
				// address (via an IP route rule). In that case, a request
				// from a node to the VIP will not hit the loadbalancer but
				// will loop back with the source IP set to the VIP. We
				// need the following rules to allow requests from this node.
				if allowFromNode {
					tx.Add(&knftables.Element{
						Set: kubeFirewallAllowSet,
						Key: []string{
							lbip,
							protocol,
							strconv.Itoa(svcInfo.Port()),
							lbip,
						},
					})
				}
			}
		}
		if !hasExternalEndpoints {
			// Either no endpoints at all (REJECT) or no endpoints for
			// external traffic (DROP anything that didn't get short-circuited
			// by the EXT chain.)
			for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
				tx.Add(&knftables.Element{
					Map: kubeNoEndpointServicesMap,
					Key: []string{
						lbip,
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						externalTrafficFilterVerdict,
					},
					Comment: &svcPortNameString,
				})
			}
		}

		// Capture nodeports.
		if svcInfo.NodePort() != 0 {
			if hasEndpoints {
				// Jump to the external destination chain. For better or for
				// worse, nodeports are not subject to loadBalancerSourceRanges,
				// and we can't change that.
				tx.Add(&knftables.Element{
					Map: kubeServiceNodePortsMap,
					Key: []string{
						protocol,
						strconv.Itoa(svcInfo.NodePort()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", externalTrafficChain),
					},
				})
			}
			if !hasExternalEndpoints {
				// Either no endpoints at all (REJECT) or no endpoints for
				// external traffic (DROP anything that didn't get
				// short-circuited by the EXT chain.)
				tx.Add(&knftables.Element{
					Map: kubeNoEndpointNodePortsMap,
					Key: []string{
						protocol,
						strconv.Itoa(svcInfo.NodePort()),
					},
					Value: []string{
						externalTrafficFilterVerdict,
					},
					Comment: &svcPortNameString,
				})
			}
		}

		// Set up internal traffic handling.
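		// Illustrative example (the ClusterIP and port are hypothetical, not from the
		// original source): with masqueradeAll set, the rule added below for a service
		// with ClusterIP 10.96.0.10 on TCP port 53 is roughly
		//   ip daddr 10.96.0.10 tcp dport 53 jump mark-for-masquerade
		// i.e. ClusterIP traffic is marked for masquerading before being DNATted to an
		// endpoint.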
		if hasInternalEndpoints {
			if proxier.masqueradeAll {
				tx.Add(&knftables.Rule{
					Chain: internalTrafficChain,
					Rule: knftables.Concat(
						ipX, "daddr", svcInfo.ClusterIP(),
						protocol, "dport", svcInfo.Port(),
						"jump", kubeMarkMasqChain,
					),
				})
			} else if proxier.localDetector.IsImplemented() {
				// This masquerades off-cluster traffic to a service VIP. The
				// idea is that you can establish a static route for your
				// Service range, routing to any node, and that node will
				// bridge into the Service for you. Since that might bounce
				// off-node, we masquerade here.
				tx.Add(&knftables.Rule{
					Chain: internalTrafficChain,
					Rule: knftables.Concat(
						ipX, "daddr", svcInfo.ClusterIP(),
						protocol, "dport", svcInfo.Port(),
						proxier.localDetector.IfNotLocalNFT(),
						"jump", kubeMarkMasqChain,
					),
				})
			}
		}

		// Set up external traffic handling (if any "external" destinations are
		// enabled). All captured traffic for all external destinations should
		// jump to externalTrafficChain, which will handle some special cases and
		// then jump to externalPolicyChain.
		if usesExternalTrafficChain {
			if !svcInfo.ExternalPolicyLocal() {
				// If we are using non-local endpoints we need to masquerade,
				// in case we cross nodes.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"jump", kubeMarkMasqChain,
					),
				})
			} else {
				// If we are only using same-node endpoints, we can retain the
				// source IP in most cases.

				if proxier.localDetector.IsImplemented() {
					// Treat all locally-originated pod -> external destination
					// traffic as a special case. It is subject to neither
					// form of traffic policy, which simulates going up-and-out
					// to an external load-balancer and coming back in.
					tx.Add(&knftables.Rule{
						Chain: externalTrafficChain,
						Rule: knftables.Concat(
							proxier.localDetector.IfLocalNFT(),
							"goto", clusterPolicyChain,
						),
						Comment: ptr.To("short-circuit pod traffic"),
					})
				}

				// Locally-originated traffic (not from a pod, but from the host
				// node) still needs masquerade because the LBIP itself is a local
				// address, so that will be the chosen source IP.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"fib", "saddr", "type", "local",
						"jump", kubeMarkMasqChain,
					),
					Comment: ptr.To("masquerade local traffic"),
				})

				// Redirect all src-type=LOCAL -> external destination traffic to
				// the policy=cluster chain. This allows traffic originating
				// from the host to be redirected to the service correctly.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"fib", "saddr", "type", "local",
						"goto", clusterPolicyChain,
					),
					Comment: ptr.To("short-circuit local traffic"),
				})
			}

			// Anything else falls through to the appropriate policy chain.
			if hasExternalEndpoints {
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"goto", externalPolicyChain,
					),
				})
			}
		}

		if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
			// Generate the per-endpoint affinity sets
			for _, ep := range allLocallyReachableEndpoints {
				epInfo, ok := ep.(*endpointInfo)
				if !ok {
					klog.ErrorS(nil, "Failed to cast endpointsInfo", "endpointsInfo", ep)
					continue
				}

				// Create a set to store current affinity mappings. As
				// with the iptables backend, endpoint affinity is
				// recorded for connections from a particular source IP
				// (without regard to source port) to a particular
				// ServicePort (without regard to which service IP was
				// used to reach the service). This may be changed in the
				// future.
				tx.Add(&knftables.Set{
					Name: epInfo.affinitySetName,
					Type: ipvX_addr,
					Flags: []knftables.SetFlag{
						// The nft docs say "dynamic" is only
						// needed for sets containing stateful
						// objects (e.g. counters), but (at least on
						// RHEL8) if we create the set without
						// "dynamic", it later gets mutated to
						// have it, and then the next attempt to
						// tx.Add() it here fails because it looks
						// like we're trying to change the flags.
						knftables.DynamicFlag,
						knftables.TimeoutFlag,
					},
					Timeout: ptr.To(time.Duration(svcInfo.StickyMaxAgeSeconds()) * time.Second),
				})
				activeAffinitySets.Insert(epInfo.affinitySetName)
			}
		}

		// If Cluster policy is in use, create the chain and create rules jumping
		// from clusterPolicyChain to the clusterEndpoints
		if usesClusterPolicyChain {
			proxier.writeServiceToEndpointRules(tx, svcPortNameString, svcInfo, clusterPolicyChain, clusterEndpoints)
		}

		// If Local policy is in use, create rules jumping from localPolicyChain
		// to the localEndpoints
		if usesLocalPolicyChain {
			proxier.writeServiceToEndpointRules(tx, svcPortNameString, svcInfo, localPolicyChain, localEndpoints)
		}

		// Generate the per-endpoint chains
		for _, ep := range allLocallyReachableEndpoints {
			epInfo, ok := ep.(*endpointInfo)
			if !ok {
				klog.ErrorS(nil, "Failed to cast endpointInfo", "endpointInfo", ep)
				continue
			}

			endpointChain := epInfo.chainName

			// Handle traffic that loops back to the originator with SNAT.
			tx.Add(&knftables.Rule{
				Chain: endpointChain,
				Rule: knftables.Concat(
					ipX, "saddr", epInfo.IP(),
					"jump", kubeMarkMasqChain,
				),
			})

			// Handle session affinity
			if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
				tx.Add(&knftables.Rule{
					Chain: endpointChain,
					Rule: knftables.Concat(
						"update", "@", epInfo.affinitySetName,
						"{", ipX, "saddr", "}",
					),
				})
			}

			// DNAT to final destination.
			tx.Add(&knftables.Rule{
				Chain: endpointChain,
				Rule: knftables.Concat(
					"meta l4proto", protocol,
					"dnat to", epInfo.String(),
				),
			})
		}
	}

	// Figure out which chains are now stale. Unfortunately, we can't delete them
	// right away, because with kernels before 6.2, if there is a map element pointing
	// to a chain, and you delete that map element, the kernel doesn't notice until a
	// short amount of time later that the chain is now unreferenced. So we flush them
	// now, and record the time that they became stale in staleChains so they can be
	// deleted later.
	existingChains, err := proxier.nftables.List(context.TODO(), "chains")
	if err == nil {
		for _, chain := range existingChains {
			if isServiceChainName(chain) && !activeChains.Has(chain) {
				tx.Flush(&knftables.Chain{
					Name: chain,
				})
				proxier.staleChains[chain] = start
			}
		}
	} else if !knftables.IsNotFound(err) {
		klog.ErrorS(err, "Failed to list nftables chains: stale chains will not be deleted")
	}

	// OTOH, we can immediately delete any stale affinity sets
	existingSets, err := proxier.nftables.List(context.TODO(), "sets")
	if err == nil {
		for _, set := range existingSets {
			if isAffinitySetName(set) && !activeAffinitySets.Has(set) {
				tx.Delete(&knftables.Set{
					Name: set,
				})
			}
		}
	} else if !knftables.IsNotFound(err) {
		klog.ErrorS(err, "Failed to list nftables sets: stale affinity sets will not be deleted")
	}

	// Sync rules.
	klog.V(2).InfoS("Reloading service nftables data",
		"numServices", len(proxier.svcPortMap),
		"numEndpoints", totalEndpoints,
	)

	// FIXME
	// klog.V(9).InfoS("Running nftables transaction", "transaction", tx.Bytes())

	err = proxier.nftables.Run(context.TODO(), tx)
	if err != nil {
		klog.ErrorS(err, "nftables sync failed")
		metrics.IptablesRestoreFailuresTotal.Inc()
		return
	}
	success = true

	for name, lastChangeTriggerTimes := range endpointUpdateResult.LastChangeTriggerTimes {
		for _, lastChangeTriggerTime := range lastChangeTriggerTimes {
			latency := metrics.SinceInSeconds(lastChangeTriggerTime)
			metrics.NetworkProgrammingLatency.Observe(latency)
			klog.V(4).InfoS("Network programming", "endpoint", klog.KRef(name.Namespace, name.Name), "elapsed", latency)
		}
	}

	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("internal").Set(float64(serviceNoLocalEndpointsTotalInternal))
	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("external").Set(float64(serviceNoLocalEndpointsTotalExternal))
	if proxier.healthzServer != nil {
		proxier.healthzServer.Updated(proxier.ipFamily)
	}
	metrics.SyncProxyRulesLastTimestamp.SetToCurrentTime()

	// Update service healthchecks. The endpoints list might include services that are
	// not "OnlyLocal", but the services list will not, and the serviceHealthServer
	// will just drop those endpoints.
	if err := proxier.serviceHealthServer.SyncServices(proxier.svcPortMap.HealthCheckNodePorts()); err != nil {
		klog.ErrorS(err, "Error syncing healthcheck services")
	}
	if err := proxier.serviceHealthServer.SyncEndpoints(proxier.endpointsMap.LocalReadyEndpoints()); err != nil {
		klog.ErrorS(err, "Error syncing healthcheck endpoints")
	}

	// Finish housekeeping: clear stale conntrack entries for UDP Services.
	conntrack.CleanStaleEntries(proxier.ipFamily == v1.IPv6Protocol, proxier.exec, proxier.svcPortMap, serviceUpdateResult, endpointUpdateResult)
}

func (proxier *Proxier) writeServiceToEndpointRules(tx *knftables.Transaction, svcPortNameString string, svcInfo *servicePortInfo, svcChain string, endpoints []proxy.Endpoint) {
	// First write session affinity rules, if applicable.
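	// Illustrative example, reusing the sample names from the doc comment on
	// servicePortEndpointChainNameBase: for an endpoint 10.180.0.1:80 of ns1/svc1:p80,
	// the affinity rule written below looks roughly like
	//   ip saddr @affinity-5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80 goto endpoint-5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80
	// so a client whose source IP is already in the endpoint's affinity set is routed
	// straight back to that endpoint's chain, bypassing the random load-balancing rule.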
	if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
		ipX := "ip"
		if proxier.ipFamily == v1.IPv6Protocol {
			ipX = "ip6"
		}

		for _, ep := range endpoints {
			epInfo, ok := ep.(*endpointInfo)
			if !ok {
				continue
			}

			tx.Add(&knftables.Rule{
				Chain: svcChain,
				Rule: knftables.Concat(
					ipX, "saddr", "@", epInfo.affinitySetName,
					"goto", epInfo.chainName,
				),
			})
		}
	}

	// Now write loadbalancing rule
	var elements []string
	for i, ep := range endpoints {
		epInfo, ok := ep.(*endpointInfo)
		if !ok {
			continue
		}

		elements = append(elements,
			strconv.Itoa(i), ":", "goto", epInfo.chainName,
		)
		if i != len(endpoints)-1 {
			elements = append(elements, ",")
		}
	}
	tx.Add(&knftables.Rule{
		Chain: svcChain,
		Rule: knftables.Concat(
			"numgen random mod", len(endpoints), "vmap",
			"{", elements, "}",
		),
	})
}
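// Illustrative example (endpoint chain names elided with "..."): for a service with
// two endpoints, the load-balancing rule written by writeServiceToEndpointRules above
// is roughly
//
//	numgen random mod 2 vmap { 0 : goto endpoint-... , 1 : goto endpoint-... }
//
// i.e. a uniformly random index selects one endpoint chain, which then marks hairpin
// traffic for masquerade, updates the affinity set (if session affinity is enabled),
// and DNATs to the endpoint's IP and port.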