k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/proxy/nftables/proxier.go

//go:build linux
// +build linux

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nftables

//
// NOTE: this needs to be tested in e2e since it uses nftables for everything.
//

import (
	"context"
	"crypto/sha256"
	"encoding/base32"
	"fmt"
	"net"
	"reflect"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	v1 "k8s.io/api/core/v1"
	discovery "k8s.io/api/discovery/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/events"
	utilsysctl "k8s.io/component-helpers/node/util/sysctl"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/proxy"
	"k8s.io/kubernetes/pkg/proxy/conntrack"
	"k8s.io/kubernetes/pkg/proxy/healthcheck"
	"k8s.io/kubernetes/pkg/proxy/metaproxier"
	"k8s.io/kubernetes/pkg/proxy/metrics"
	proxyutil "k8s.io/kubernetes/pkg/proxy/util"
	"k8s.io/kubernetes/pkg/util/async"
	utilexec "k8s.io/utils/exec"
	netutils "k8s.io/utils/net"
	"k8s.io/utils/ptr"
	"sigs.k8s.io/knftables"
)

const (
	// Our nftables table. All of our chains/sets/maps are created inside this table,
	// so they don't need any "kube-" or "kube-proxy-" prefix of their own.
	kubeProxyTable = "kube-proxy"

	// base chains
	filterPreroutingChain     = "filter-prerouting"
	filterInputChain          = "filter-input"
	filterForwardChain        = "filter-forward"
	filterOutputChain         = "filter-output"
	filterOutputPostDNATChain = "filter-output-post-dnat"
	natPreroutingChain        = "nat-prerouting"
	natOutputChain            = "nat-output"
	natPostroutingChain       = "nat-postrouting"

	// service dispatch
	servicesChain       = "services"
	serviceIPsMap       = "service-ips"
	serviceNodePortsMap = "service-nodeports"

	// set of IPs that accept NodePort traffic
	nodePortIPsSet = "nodeport-ips"

	// set of active ClusterIPs.
	clusterIPsSet = "cluster-ips"

	// handling for services with no endpoints
	serviceEndpointsCheckChain  = "service-endpoints-check"
	nodePortEndpointsCheckChain = "nodeport-endpoints-check"
	noEndpointServicesMap       = "no-endpoint-services"
	noEndpointNodePortsMap      = "no-endpoint-nodeports"
	rejectChain                 = "reject-chain"

	// handling traffic to unallocated ClusterIPs and undefined ports of ClusterIPs
	clusterIPsCheckChain = "cluster-ips-check"

	// LoadBalancerSourceRanges handling
	firewallIPsMap     = "firewall-ips"
	firewallCheckChain = "firewall-check"

	// masquerading
	markMasqChain     = "mark-for-masquerade"
	masqueradingChain = "masquerading"
)
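// Illustrative sketch (not part of the original file): these names are consumed
// by knftables objects when transactions are built, as setupNFTables() does
// below. Assuming a knftables.Interface nft for the appropriate family, a
// minimal transaction using the names above might look like:
//
//	tx := nft.NewTransaction()
//	tx.Add(&knftables.Table{Comment: ptr.To("rules for kube-proxy")})
//	tx.Add(&knftables.Chain{Name: servicesChain})
//	if err := nft.Run(context.TODO(), tx); err != nil {
//		// the whole transaction is applied atomically, so on error
//		// none of the objects above were created
//	}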
// NewDualStackProxier creates a MetaProxier instance, with IPv4 and IPv6 proxies.
func NewDualStackProxier(
	ctx context.Context,
	sysctl utilsysctl.Interface,
	syncPeriod time.Duration,
	minSyncPeriod time.Duration,
	masqueradeAll bool,
	masqueradeBit int,
	localDetectors map[v1.IPFamily]proxyutil.LocalTrafficDetector,
	hostname string,
	nodeIPs map[v1.IPFamily]net.IP,
	recorder events.EventRecorder,
	healthzServer *healthcheck.ProxierHealthServer,
	nodePortAddresses []string,
	initOnly bool,
) (proxy.Provider, error) {
	// Create an ipv4 instance of the single-stack proxier
	ipv4Proxier, err := NewProxier(ctx, v1.IPv4Protocol, sysctl,
		syncPeriod, minSyncPeriod, masqueradeAll, masqueradeBit,
		localDetectors[v1.IPv4Protocol], hostname, nodeIPs[v1.IPv4Protocol],
		recorder, healthzServer, nodePortAddresses, initOnly)
	if err != nil {
		return nil, fmt.Errorf("unable to create ipv4 proxier: %v", err)
	}

	ipv6Proxier, err := NewProxier(ctx, v1.IPv6Protocol, sysctl,
		syncPeriod, minSyncPeriod, masqueradeAll, masqueradeBit,
		localDetectors[v1.IPv6Protocol], hostname, nodeIPs[v1.IPv6Protocol],
		recorder, healthzServer, nodePortAddresses, initOnly)
	if err != nil {
		return nil, fmt.Errorf("unable to create ipv6 proxier: %v", err)
	}
	if initOnly {
		return nil, nil
	}
	return metaproxier.NewMetaProxier(ipv4Proxier, ipv6Proxier), nil
}
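// Illustrative sketch (hypothetical caller, not from this file): the per-family
// arguments passed to NewDualStackProxier are maps keyed by v1.IPFamily, e.g.:
//
//	localDetectors := map[v1.IPFamily]proxyutil.LocalTrafficDetector{
//		v1.IPv4Protocol: detectorV4, // assumed to come from the proxyutil detector constructors
//		v1.IPv6Protocol: detectorV6,
//	}
//	nodeIPs := map[v1.IPFamily]net.IP{
//		v1.IPv4Protocol: netutils.ParseIPSloppy("192.0.2.10"), // documentation addresses,
//		v1.IPv6Protocol: netutils.ParseIPSloppy("2001:db8::10"), // not real cluster values
//	}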
// Proxier is an nftables based proxy
type Proxier struct {
	// ipFamily defines the IP family which this proxier is tracking.
	ipFamily v1.IPFamily

	// endpointsChanges and serviceChanges contain all changes to endpoints and
	// services that happened since nftables was synced. For a single object,
	// changes are accumulated, i.e. previous is state from before all of them,
	// current is state after applying all of those.
	endpointsChanges *proxy.EndpointsChangeTracker
	serviceChanges   *proxy.ServiceChangeTracker

	mu           sync.Mutex // protects the following fields
	svcPortMap   proxy.ServicePortMap
	endpointsMap proxy.EndpointsMap
	nodeLabels   map[string]string
	// endpointSlicesSynced and servicesSynced are set to true
	// when corresponding objects are synced after startup. This is used to avoid
	// updating nftables with some partial data after kube-proxy restart.
	endpointSlicesSynced bool
	servicesSynced       bool
	initialized          int32
	syncRunner           *async.BoundedFrequencyRunner // governs calls to syncProxyRules
	syncPeriod           time.Duration
	flushed              bool

	// These are effectively const and do not need the mutex to be held.
	nftables       knftables.Interface
	masqueradeAll  bool
	masqueradeMark string
	conntrack      conntrack.Interface
	localDetector  proxyutil.LocalTrafficDetector
	hostname       string
	nodeIP         net.IP
	recorder       events.EventRecorder

	serviceHealthServer healthcheck.ServiceHealthServer
	healthzServer       *healthcheck.ProxierHealthServer

	// nodePortAddresses selects the interfaces where nodePort works.
	nodePortAddresses *proxyutil.NodePortAddresses
	// networkInterfacer defines an interface for several net library functions.
	// Inject for test purpose.
	networkInterfacer proxyutil.NetworkInterfacer

	// staleChains contains information about chains to be deleted later
	staleChains map[string]time.Time

	// serviceCIDRs is a comma separated list of ServiceCIDRs belonging to the IPFamily
	// which proxier is operating on, can be directly consumed by knftables.
	serviceCIDRs string

	logger klog.Logger
}

// Proxier implements proxy.Provider
var _ proxy.Provider = &Proxier{}

// NewProxier returns a new nftables Proxier. Once a proxier is created, it will keep
// nftables up to date in the background and will not terminate if a particular nftables
// call fails.
func NewProxier(ctx context.Context,
	ipFamily v1.IPFamily,
	sysctl utilsysctl.Interface,
	syncPeriod time.Duration,
	minSyncPeriod time.Duration,
	masqueradeAll bool,
	masqueradeBit int,
	localDetector proxyutil.LocalTrafficDetector,
	hostname string,
	nodeIP net.IP,
	recorder events.EventRecorder,
	healthzServer *healthcheck.ProxierHealthServer,
	nodePortAddressStrings []string,
	initOnly bool,
) (*Proxier, error) {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "ipFamily", ipFamily)

	if initOnly {
		logger.Info("System initialized and --init-only specified")
		return nil, nil
	}

	// Generate the masquerade mark to use for SNAT rules.
	masqueradeValue := 1 << uint(masqueradeBit)
	masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue)
	logger.V(2).Info("Using nftables mark for masquerade", "mark", masqueradeMark)

	nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings)

	serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer)

	var nftablesFamily knftables.Family
	if ipFamily == v1.IPv4Protocol {
		nftablesFamily = knftables.IPv4Family
	} else {
		nftablesFamily = knftables.IPv6Family
	}
	nft, err := knftables.New(nftablesFamily, kubeProxyTable)
	if err != nil {
		return nil, err
	}

	proxier := &Proxier{
		ipFamily:            ipFamily,
		svcPortMap:          make(proxy.ServicePortMap),
		serviceChanges:      proxy.NewServiceChangeTracker(newServiceInfo, ipFamily, recorder, nil),
		endpointsMap:        make(proxy.EndpointsMap),
		endpointsChanges:    proxy.NewEndpointsChangeTracker(hostname, newEndpointInfo, ipFamily, recorder, nil),
		syncPeriod:          syncPeriod,
		nftables:            nft,
		masqueradeAll:       masqueradeAll,
		masqueradeMark:      masqueradeMark,
		conntrack:           conntrack.NewExec(utilexec.New()),
		localDetector:       localDetector,
		hostname:            hostname,
		nodeIP:              nodeIP,
		recorder:            recorder,
		serviceHealthServer: serviceHealthServer,
		healthzServer:       healthzServer,
		nodePortAddresses:   nodePortAddresses,
		networkInterfacer:   proxyutil.RealNetwork{},
		staleChains:         make(map[string]time.Time),
		logger:              logger,
	}

	burstSyncs := 2
	logger.V(2).Info("NFTables sync params", "minSyncPeriod", minSyncPeriod, "syncPeriod", syncPeriod, "burstSyncs", burstSyncs)
	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)

	return proxier, nil
}
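// Worked example (not from the original source): with kube-proxy's default
// masquerade bit of 14, the computation above yields
//
//	masqueradeValue := 1 << 14            // 16384
//	fmt.Sprintf("%#08x", masqueradeValue) // "0x004000" (Go counts the "0x" prefix toward the %#08x width)
//
// i.e. a mark string that the rules in setupNFTables() below can use directly,
// as in "mark set mark or 0x004000" and "mark and 0x004000 == 0".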
// internal struct for storing service information
type servicePortInfo struct {
	*proxy.BaseServicePortInfo
	// The following fields are computed and stored for performance reasons.
	nameString             string
	clusterPolicyChainName string
	localPolicyChainName   string
	externalChainName      string
	firewallChainName      string
}

// returns a new proxy.ServicePort which abstracts a serviceInfo
func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *proxy.BaseServicePortInfo) proxy.ServicePort {
	svcPort := &servicePortInfo{BaseServicePortInfo: bsvcPortInfo}

	// Store the following for performance reasons.
	svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name}
	svcPort.nameString = svcPortName.String()

	chainNameBase := servicePortChainNameBase(&svcPortName, strings.ToLower(string(svcPort.Protocol())))
	svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
	svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
	svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase
	svcPort.firewallChainName = servicePortFirewallChainNamePrefix + chainNameBase

	return svcPort
}

// internal struct for endpoints information
type endpointInfo struct {
	*proxy.BaseEndpointInfo

	chainName       string
	affinitySetName string
}

// returns a new proxy.Endpoint which abstracts an endpointInfo
func newEndpointInfo(baseInfo *proxy.BaseEndpointInfo, svcPortName *proxy.ServicePortName) proxy.Endpoint {
	chainNameBase := servicePortEndpointChainNameBase(svcPortName, strings.ToLower(string(svcPortName.Protocol)), baseInfo.String())
	return &endpointInfo{
		BaseEndpointInfo: baseInfo,
		chainName:        servicePortEndpointChainNamePrefix + chainNameBase,
		affinitySetName:  servicePortEndpointAffinityNamePrefix + chainNameBase,
	}
}
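// Illustrative example (assembled from the naming examples in the comments on
// servicePortChainNameBase and servicePortEndpointChainNameBase below): for
// service port "ns1/svc1:p80" over TCP with endpoint 10.180.0.1:80, the
// computed names combine the fixed prefixes with the hashed name bases:
//
//	svcPort.clusterPolicyChainName // "service-ULMVA6XW-ns1/svc1/tcp/p80"
//	svcPort.externalChainName      // "external-ULMVA6XW-ns1/svc1/tcp/p80"
//	epInfo.chainName               // "endpoint-5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80"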
// nftablesBaseChains lists our "base chains"; those that are directly connected to the
// netfilter hooks (e.g., "postrouting", "input", etc.), as opposed to "regular" chains,
// which are only run when a rule jumps to them. See
// https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains.
//
// These are set up from setupNFTables() and then not directly referenced by
// syncProxyRules().
//
// All of our base chains have names that are just "${type}-${hook}". e.g., "nat-prerouting".
type nftablesBaseChain struct {
	name      string
	chainType knftables.BaseChainType
	hook      knftables.BaseChainHook
	priority  knftables.BaseChainPriority
}

var nftablesBaseChains = []nftablesBaseChain{
	// We want our filtering rules to operate on pre-DNAT dest IPs, so our filter
	// chains have to run before DNAT.
	{filterPreroutingChain, knftables.FilterType, knftables.PreroutingHook, knftables.DNATPriority + "-10"},
	{filterInputChain, knftables.FilterType, knftables.InputHook, knftables.DNATPriority + "-10"},
	{filterForwardChain, knftables.FilterType, knftables.ForwardHook, knftables.DNATPriority + "-10"},
	{filterOutputChain, knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "-10"},
	{filterOutputPostDNATChain, knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "+10"},
	{natPreroutingChain, knftables.NATType, knftables.PreroutingHook, knftables.DNATPriority},
	{natOutputChain, knftables.NATType, knftables.OutputHook, knftables.DNATPriority},
	{natPostroutingChain, knftables.NATType, knftables.PostroutingHook, knftables.SNATPriority},
}

// nftablesJumpChains lists our top-level "regular chains" that are jumped to directly
// from one of the base chains. These are set up from setupNFTables(), and some of them
// are also referenced in syncProxyRules().
type nftablesJumpChain struct {
	dstChain  string
	srcChain  string
	extraArgs string
}

var nftablesJumpChains = []nftablesJumpChain{
	// We can't jump to endpointsCheckChain from filter-prerouting like
	// firewallCheckChain because reject action is only valid in chains using the
	// input, forward or output hooks with kernels before 5.9.
	{nodePortEndpointsCheckChain, filterInputChain, "ct state new"},
	{serviceEndpointsCheckChain, filterInputChain, "ct state new"},
	{serviceEndpointsCheckChain, filterForwardChain, "ct state new"},
	{serviceEndpointsCheckChain, filterOutputChain, "ct state new"},

	{firewallCheckChain, filterPreroutingChain, "ct state new"},
	{firewallCheckChain, filterOutputChain, "ct state new"},

	{servicesChain, natOutputChain, ""},
	{servicesChain, natPreroutingChain, ""},
	{masqueradingChain, natPostroutingChain, ""},

	{clusterIPsCheckChain, filterForwardChain, "ct state new"},
	{clusterIPsCheckChain, filterOutputPostDNATChain, "ct state new"},
}

// ensureChain adds commands to tx to ensure that chain exists and doesn't contain
// anything from before this transaction (using createdChains to ensure that we don't
// Flush a chain more than once and lose *new* rules as well.)
func ensureChain(chain string, tx *knftables.Transaction, createdChains sets.Set[string]) {
	if createdChains.Has(chain) {
		return
	}
	tx.Add(&knftables.Chain{
		Name: chain,
	})
	tx.Flush(&knftables.Chain{
		Name: chain,
	})
	createdChains.Insert(chain)
}
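// Usage sketch (restating the pattern used throughout this file, not new API):
// the first ensureChain call for a given name emits Add+Flush, so the chain
// exists and starts empty; later calls in the same transaction are no-ops, so
// rules added in between are not lost:
//
//	createdChains := sets.New[string]()
//	ensureChain("example-chain", tx, createdChains) // Add + Flush ("example-chain" is a hypothetical name)
//	tx.Add(&knftables.Rule{Chain: "example-chain", Rule: "return"})
//	ensureChain("example-chain", tx, createdChains) // no-op; the rule above is kept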
func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
	ipX := "ip"
	ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value
	noLocalhost := "ip daddr != 127.0.0.0/8"
	if proxier.ipFamily == v1.IPv6Protocol {
		ipX = "ip6"
		ipvX_addr = "ipv6_addr"
		noLocalhost = "ip6 daddr != ::1"
	}

	tx.Add(&knftables.Table{
		Comment: ptr.To("rules for kube-proxy"),
	})

	// Do an extra "add+delete" once to ensure all previous base chains in the table
	// will be recreated. Otherwise, altering properties (e.g. priority) of these
	// chains would fail the transaction.
	if !proxier.flushed {
		for _, bc := range nftablesBaseChains {
			chain := &knftables.Chain{
				Name: bc.name,
			}
			tx.Add(chain)
			tx.Delete(chain)
		}
		proxier.flushed = true
	}

	// Create and flush base chains
	for _, bc := range nftablesBaseChains {
		chain := &knftables.Chain{
			Name:     bc.name,
			Type:     ptr.To(bc.chainType),
			Hook:     ptr.To(bc.hook),
			Priority: ptr.To(bc.priority),
		}
		tx.Add(chain)
		tx.Flush(chain)
	}

	// Create and flush ordinary chains and add rules jumping to them
	createdChains := sets.New[string]()
	for _, c := range nftablesJumpChains {
		ensureChain(c.dstChain, tx, createdChains)
		tx.Add(&knftables.Rule{
			Chain: c.srcChain,
			Rule: knftables.Concat(
				c.extraArgs,
				"jump", c.dstChain,
			),
		})
	}

	// Ensure all of our other "top-level" chains exist
	for _, chain := range []string{servicesChain, clusterIPsCheckChain, masqueradingChain, markMasqChain} {
		ensureChain(chain, tx, createdChains)
	}

	// Add the rules in the mark-for-masquerade and masquerading chains
	tx.Add(&knftables.Rule{
		Chain: markMasqChain,
		Rule: knftables.Concat(
			"mark", "set", "mark", "or", proxier.masqueradeMark,
		),
	})

	tx.Add(&knftables.Rule{
		Chain: masqueradingChain,
		Rule: knftables.Concat(
			"mark", "and", proxier.masqueradeMark, "==", "0",
			"return",
		),
	})
	tx.Add(&knftables.Rule{
		Chain: masqueradingChain,
		Rule: knftables.Concat(
			"mark", "set", "mark", "xor", proxier.masqueradeMark,
		),
	})
	tx.Add(&knftables.Rule{
		Chain: masqueradingChain,
		Rule:  "masquerade fully-random",
	})

	// add cluster-ips set.
	tx.Add(&knftables.Set{
		Name:    clusterIPsSet,
		Type:    ipvX_addr,
		Comment: ptr.To("Active ClusterIPs"),
	})

	// reject traffic to invalid ports of ClusterIPs.
	tx.Add(&knftables.Rule{
		Chain: clusterIPsCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", "@", clusterIPsSet, "reject",
		),
		Comment: ptr.To("Reject traffic to invalid ports of ClusterIPs"),
	})

	// drop traffic to unallocated ClusterIPs.
	if len(proxier.serviceCIDRs) > 0 {
		tx.Add(&knftables.Rule{
			Chain: clusterIPsCheckChain,
			Rule: knftables.Concat(
				ipX, "daddr", "{", proxier.serviceCIDRs, "}",
				"drop",
			),
			Comment: ptr.To("Drop traffic to unallocated ClusterIPs"),
		})
	}
	// Fill in nodeport-ips set if needed (or delete it if not). (We do "add+delete"
	// rather than just "delete" when we want to ensure the set doesn't exist, because
	// doing just "delete" would return an error if the set didn't exist.)
	tx.Add(&knftables.Set{
		Name:    nodePortIPsSet,
		Type:    ipvX_addr,
		Comment: ptr.To("IPs that accept NodePort traffic"),
	})
	if proxier.nodePortAddresses.MatchAll() {
		tx.Delete(&knftables.Set{
			Name: nodePortIPsSet,
		})
	} else {
		tx.Flush(&knftables.Set{
			Name: nodePortIPsSet,
		})
		nodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer)
		if err != nil {
			proxier.logger.Error(err, "Failed to get node ip address matching nodeport cidrs, services with nodeport may not work as intended", "CIDRs", proxier.nodePortAddresses)
		}
		for _, ip := range nodeIPs {
			if ip.IsLoopback() {
				proxier.logger.Error(nil, "--nodeport-addresses includes localhost but localhost NodePorts are not supported", "address", ip.String())
				continue
			}
			tx.Add(&knftables.Element{
				Set: nodePortIPsSet,
				Key: []string{
					ip.String(),
				},
			})
		}
	}

	// Set up "no endpoints" drop/reject handling
	tx.Add(&knftables.Map{
		Name:    noEndpointServicesMap,
		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
		Comment: ptr.To("vmap to drop or reject packets to services with no endpoints"),
	})
	tx.Add(&knftables.Map{
		Name:    noEndpointNodePortsMap,
		Type:    "inet_proto . inet_service : verdict",
		Comment: ptr.To("vmap to drop or reject packets to service nodeports with no endpoints"),
	})

	tx.Add(&knftables.Chain{
		Name:    rejectChain,
		Comment: ptr.To("helper for @no-endpoint-services / @no-endpoint-nodeports"),
	})
	tx.Flush(&knftables.Chain{
		Name: rejectChain,
	})
	tx.Add(&knftables.Rule{
		Chain: rejectChain,
		Rule:  "reject",
	})

	tx.Add(&knftables.Rule{
		Chain: serviceEndpointsCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", noEndpointServicesMap,
		),
	})

	if proxier.nodePortAddresses.MatchAll() {
		tx.Add(&knftables.Rule{
			Chain: nodePortEndpointsCheckChain,
			Rule: knftables.Concat(
				noLocalhost,
				"meta l4proto . th dport",
				"vmap", "@", noEndpointNodePortsMap,
			),
		})
	} else {
		tx.Add(&knftables.Rule{
			Chain: nodePortEndpointsCheckChain,
			Rule: knftables.Concat(
				ipX, "daddr", "@", nodePortIPsSet,
				"meta l4proto . th dport",
				"vmap", "@", noEndpointNodePortsMap,
			),
		})
	}
	// Set up LoadBalancerSourceRanges firewalling
	tx.Add(&knftables.Map{
		Name:    firewallIPsMap,
		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
		Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
	})

	ensureChain(firewallCheckChain, tx, createdChains)
	tx.Add(&knftables.Rule{
		Chain: firewallCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", firewallIPsMap,
		),
	})

	// Set up service dispatch
	tx.Add(&knftables.Map{
		Name:    serviceIPsMap,
		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
		Comment: ptr.To("ClusterIP, ExternalIP and LoadBalancer IP traffic"),
	})
	tx.Add(&knftables.Map{
		Name:    serviceNodePortsMap,
		Type:    "inet_proto . inet_service : verdict",
		Comment: ptr.To("NodePort traffic"),
	})
	tx.Add(&knftables.Rule{
		Chain: servicesChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", serviceIPsMap,
		),
	})
	if proxier.nodePortAddresses.MatchAll() {
		tx.Add(&knftables.Rule{
			Chain: servicesChain,
			Rule: knftables.Concat(
				"fib daddr type local",
				noLocalhost,
				"meta l4proto . th dport",
				"vmap", "@", serviceNodePortsMap,
			),
		})
	} else {
		tx.Add(&knftables.Rule{
			Chain: servicesChain,
			Rule: knftables.Concat(
				ipX, "daddr @nodeport-ips",
				"meta l4proto . th dport",
				"vmap", "@", serviceNodePortsMap,
			),
		})
	}
}
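// Illustrative sketch (composed from the map types above and the element writes
// in syncProxyRules() below; the IP and port are example values): a ClusterIP
// service is dispatched by adding a verdict-map element such as
//
//	tx.Add(&knftables.Element{
//		Map:   serviceIPsMap,
//		Key:   []string{"10.96.0.10", "tcp", "80"},
//		Value: []string{"goto service-ULMVA6XW-ns1/svc1/tcp/p80"},
//	})
//
// which corresponds to the nftables element "10.96.0.10 . tcp . 80 : goto ...".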
// CleanupLeftovers removes all nftables rules and chains created by the Proxier.
// It returns true if an error was encountered. Errors are logged.
func CleanupLeftovers(ctx context.Context) bool {
	logger := klog.FromContext(ctx)
	var encounteredError bool

	for _, family := range []knftables.Family{knftables.IPv4Family, knftables.IPv6Family} {
		nft, err := knftables.New(family, kubeProxyTable)
		if err == nil {
			tx := nft.NewTransaction()
			tx.Delete(&knftables.Table{})
			err = nft.Run(ctx, tx)
		}
		if err != nil && !knftables.IsNotFound(err) {
			logger.Error(err, "Error cleaning up nftables rules")
			encounteredError = true
		}
	}

	return encounteredError
}
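// Usage sketch (hypothetical caller, not from this file): cleanup would
// typically be invoked when kube-proxy is switching away from the nftables
// mode:
//
//	if CleanupLeftovers(ctx) {
//		klog.FromContext(ctx).Info("Encountered errors while cleaning up nftables rules")
//	}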
// Sync is called to synchronize the proxier state to nftables as soon as possible.
func (proxier *Proxier) Sync() {
	if proxier.healthzServer != nil {
		proxier.healthzServer.QueuedUpdate(proxier.ipFamily)
	}
	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
	proxier.syncRunner.Run()
}

// SyncLoop runs periodic work. This is expected to run as a goroutine or as the main loop of the app. It does not return.
func (proxier *Proxier) SyncLoop() {
	// Update healthz timestamp at beginning in case Sync() never succeeds.
	if proxier.healthzServer != nil {
		proxier.healthzServer.Updated(proxier.ipFamily)
	}

	// synthesize "last change queued" time as the informers are syncing.
	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
	proxier.syncRunner.Loop(wait.NeverStop)
}

func (proxier *Proxier) setInitialized(value bool) {
	var initialized int32
	if value {
		initialized = 1
	}
	atomic.StoreInt32(&proxier.initialized, initialized)
}

func (proxier *Proxier) isInitialized() bool {
	return atomic.LoadInt32(&proxier.initialized) > 0
}

// OnServiceAdd is called whenever creation of new service object
// is observed.
func (proxier *Proxier) OnServiceAdd(service *v1.Service) {
	proxier.OnServiceUpdate(nil, service)
}

// OnServiceUpdate is called whenever modification of an existing
// service object is observed.
func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) {
	if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnServiceDelete is called whenever deletion of an existing service
// object is observed.
func (proxier *Proxier) OnServiceDelete(service *v1.Service) {
	proxier.OnServiceUpdate(service, nil)
}

// OnServiceSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnServiceSynced() {
	proxier.mu.Lock()
	proxier.servicesSynced = true
	proxier.setInitialized(proxier.endpointSlicesSynced)
	proxier.mu.Unlock()

	// Sync unconditionally - this is called once per lifetime.
	proxier.syncProxyRules()
}

// OnEndpointSliceAdd is called whenever creation of a new endpoint slice object
// is observed.
func (proxier *Proxier) OnEndpointSliceAdd(endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSliceUpdate is called whenever modification of an existing endpoint
// slice object is observed.
func (proxier *Proxier) OnEndpointSliceUpdate(_, endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSliceDelete is called whenever deletion of an existing endpoint slice
// object is observed.
func (proxier *Proxier) OnEndpointSliceDelete(endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, true) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSlicesSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnEndpointSlicesSynced() {
	proxier.mu.Lock()
	proxier.endpointSlicesSynced = true
	proxier.setInitialized(proxier.servicesSynced)
	proxier.mu.Unlock()

	// Sync unconditionally - this is called once per lifetime.
	proxier.syncProxyRules()
}
// OnNodeAdd is called whenever creation of new node object
// is observed.
func (proxier *Proxier) OnNodeAdd(node *v1.Node) {
	if node.Name != proxier.hostname {
		proxier.logger.Error(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = map[string]string{}
	for k, v := range node.Labels {
		proxier.nodeLabels[k] = v
	}
	proxier.mu.Unlock()
	proxier.logger.V(4).Info("Updated proxier node labels", "labels", node.Labels)

	proxier.Sync()
}

// OnNodeUpdate is called whenever modification of an existing
// node object is observed.
func (proxier *Proxier) OnNodeUpdate(oldNode, node *v1.Node) {
	if node.Name != proxier.hostname {
		proxier.logger.Error(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = map[string]string{}
	for k, v := range node.Labels {
		proxier.nodeLabels[k] = v
	}
	proxier.mu.Unlock()
	proxier.logger.V(4).Info("Updated proxier node labels", "labels", node.Labels)

	proxier.Sync()
}

// OnNodeDelete is called whenever deletion of an existing node
// object is observed.
func (proxier *Proxier) OnNodeDelete(node *v1.Node) {
	if node.Name != proxier.hostname {
		proxier.logger.Error(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = nil
	proxier.mu.Unlock()

	proxier.Sync()
}

// OnNodeSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnNodeSynced() {
}

// OnServiceCIDRsChanged is called whenever a change is observed
// in any of the ServiceCIDRs, and provides complete list of service cidrs.
func (proxier *Proxier) OnServiceCIDRsChanged(cidrs []string) {
	proxier.mu.Lock()
	defer proxier.mu.Unlock()

	cidrsForProxier := make([]string, 0)
	for _, cidr := range cidrs {
		isIPv4CIDR := netutils.IsIPv4CIDRString(cidr)
		if proxier.ipFamily == v1.IPv4Protocol && isIPv4CIDR {
			cidrsForProxier = append(cidrsForProxier, cidr)
		}

		if proxier.ipFamily == v1.IPv6Protocol && !isIPv4CIDR {
			cidrsForProxier = append(cidrsForProxier, cidr)
		}
	}
	proxier.serviceCIDRs = strings.Join(cidrsForProxier, ",")
}
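// Worked example (example CIDRs, not from the source): given the dual-stack
// list {"10.96.0.0/16", "fd00:10:96::/112"}, an IPv4 proxier keeps only the
// IPv4 entry and stores it in the comma-separated form consumed by the
// "{ ... } drop" rule in setupNFTables():
//
//	proxier.OnServiceCIDRsChanged([]string{"10.96.0.0/16", "fd00:10:96::/112"})
//	// proxier.serviceCIDRs == "10.96.0.0/16" (for the IPv4 proxier)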
const (
	// Maximum length for one of our chain name prefixes, including the trailing
	// hyphen.
	chainNamePrefixLengthMax = 16

	// Maximum length of the string returned from servicePortChainNameBase or
	// servicePortEndpointChainNameBase.
	chainNameBaseLengthMax = knftables.NameLengthMax - chainNamePrefixLengthMax
)

const (
	servicePortPolicyClusterChainNamePrefix = "service-"
	servicePortPolicyLocalChainNamePrefix   = "local-"
	serviceExternalChainNamePrefix          = "external-"
	servicePortEndpointChainNamePrefix      = "endpoint-"
	servicePortEndpointAffinityNamePrefix   = "affinity-"
	servicePortFirewallChainNamePrefix      = "firewall-"
)

// hashAndTruncate prefixes name with a hash of itself and then truncates to
// chainNameBaseLengthMax. The hash ensures that (a) the name is still unique if we have
// to truncate the end, and (b) it's visually distinguishable from other chains that would
// otherwise have nearly identical names (e.g., different endpoint chains for a given
// service that differ in only a single digit).
func hashAndTruncate(name string) string {
	hash := sha256.Sum256([]byte(name))
	encoded := base32.StdEncoding.EncodeToString(hash[:])
	name = encoded[:8] + "-" + name
	if len(name) > chainNameBaseLengthMax {
		name = name[:chainNameBaseLengthMax-3] + "..."
	}
	return name
}
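// Worked example (using the sample name from the comments below): hashing
// "ns1/svc1/tcp/p80" with sha256, base32-encoding the digest, and keeping the
// first 8 characters yields the "ULMVA6XW" prefix, so
//
//	hashAndTruncate("ns1/svc1/tcp/p80") // "ULMVA6XW-ns1/svc1/tcp/p80"
//
// Only names longer than chainNameBaseLengthMax get truncated, with a
// trailing "...".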
// servicePortChainNameBase returns the base name for a chain for the given ServicePort.
// This is something like "HASH-namespace/serviceName/protocol/portName", e.g.,
// "ULMVA6XW-ns1/svc1/tcp/p80".
func servicePortChainNameBase(servicePortName *proxy.ServicePortName, protocol string) string {
	// nftables chains can contain the characters [A-Za-z0-9_./-] (but must start with
	// a letter, underscore, or dot).
	//
	// Namespace, Service, and Port names can contain [a-z0-9-] (with some additional
	// restrictions that aren't relevant here).
	//
	// Protocol is /(tcp|udp|sctp)/.
	//
	// Thus, we can safely use all Namespace names, Service names, protocol values,
	// and Port names directly in nftables chain names (though note that this assumes
	// that the chain name won't *start* with any of those strings, since that might
	// be illegal). We use "/" to separate the parts of the name, which is one of the
	// two characters allowed in a chain name that isn't allowed in our input strings.

	name := fmt.Sprintf("%s/%s/%s/%s",
		servicePortName.NamespacedName.Namespace,
		servicePortName.NamespacedName.Name,
		protocol,
		servicePortName.Port,
	)

	// The namespace, service, and port name can each be up to 63 characters, protocol
	// can be up to 4, plus 8 for the hash and 4 additional punctuation characters.
	// That's a total of 205, which is less than chainNameBaseLengthMax (240). So this
	// will never actually return a truncated name.
	return hashAndTruncate(name)
}

// servicePortEndpointChainNameBase returns the suffix for chain names for the given
// endpoint. This is something like
// "HASH-namespace/serviceName/protocol/portName__endpointIP/endpointport", e.g.,
// "5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80".
func servicePortEndpointChainNameBase(servicePortName *proxy.ServicePortName, protocol, endpoint string) string {
	// As above in servicePortChainNameBase: Namespace, Service, Port, Protocol, and
	// EndpointPort are all safe to copy into the chain name directly. But if
	// EndpointIP is IPv6 then it will contain colons, which aren't allowed in a chain
	// name. IPv6 IPs are also quite long, but we can't safely truncate them (e.g. to
	// only the final segment) because (especially for manually-created external
	// endpoints), we can't know for sure that any part of them is redundant.

	endpointIP, endpointPort, _ := net.SplitHostPort(endpoint)
	if strings.Contains(endpointIP, ":") {
		endpointIP = strings.ReplaceAll(endpointIP, ":", ".")
	}

	// As above, we use "/" to separate parts of the name, and "__" to separate the
	// "service" part from the "endpoint" part.
	name := fmt.Sprintf("%s/%s/%s/%s__%s/%s",
		servicePortName.NamespacedName.Namespace,
		servicePortName.NamespacedName.Name,
		protocol,
		servicePortName.Port,
		endpointIP,
		endpointPort,
	)

	// The part of name before the "__" can be up to 205 characters (as with
	// servicePortChainNameBase above). An IPv6 address can be up to 39 characters, and
	// a port can be up to 5 digits, plus 3 punctuation characters gives a max total
	// length of 252, well over chainNameBaseLengthMax (240), so truncation is
	// theoretically possible (though incredibly unlikely).
	return hashAndTruncate(name)
}

func isServiceChainName(chainString string) bool {
	// The chains returned from servicePortChainNameBase and
	// servicePortEndpointChainNameBase will always have at least one "/" in them.
	// Since none of our "stock" chain names use slashes, we can distinguish them this
	// way.
	return strings.Contains(chainString, "/")
}

func isAffinitySetName(set string) bool {
	return strings.HasPrefix(set, servicePortEndpointAffinityNamePrefix)
}
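// Illustrative example (example IPv6 endpoint and a hypothetical ServicePortName
// spn; the hash placeholder is not a real value): since ":" is not a legal
// chain-name character, an IPv6 endpoint like "[fd00:10:180::1]:80" is
// rewritten with dots before hashing:
//
//	servicePortEndpointChainNameBase(&spn, "tcp", "[fd00:10:180::1]:80")
//	// "<HASH>-ns1/svc1/tcp/p80__fd00.10.180..1/80"
//
// (net.SplitHostPort strips the brackets; each ":" becomes ".", so "::"
// becomes "..".)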
// This is where all of the nftables calls happen.
// This assumes proxier.mu is NOT held
func (proxier *Proxier) syncProxyRules() {
	proxier.mu.Lock()
	defer proxier.mu.Unlock()

	// don't sync rules till we've received services and endpoints
	if !proxier.isInitialized() {
		proxier.logger.V(2).Info("Not syncing nftables until Services and Endpoints have been received from master")
		return
	}

	//
	// Below this point we will not return until we try to write the nftables rules.
	//

	// Keep track of how long syncs take.
	start := time.Now()
	defer func() {
		metrics.SyncProxyRulesLatency.Observe(metrics.SinceInSeconds(start))
		proxier.logger.V(2).Info("SyncProxyRules complete", "elapsed", time.Since(start))
	}()

	serviceUpdateResult := proxier.svcPortMap.Update(proxier.serviceChanges)
	endpointUpdateResult := proxier.endpointsMap.Update(proxier.endpointsChanges)

	proxier.logger.V(2).Info("Syncing nftables rules")

	success := false
	defer func() {
		if !success {
			proxier.logger.Info("Sync failed", "retryingTime", proxier.syncPeriod)
			proxier.syncRunner.RetryAfter(proxier.syncPeriod)
		}
	}()

	// If there are sufficiently-stale chains left over from previous transactions,
	// try to delete them now.
	if len(proxier.staleChains) > 0 {
		oneSecondAgo := start.Add(-time.Second)
		tx := proxier.nftables.NewTransaction()
		deleted := 0
		for chain, modtime := range proxier.staleChains {
			if modtime.Before(oneSecondAgo) {
				tx.Delete(&knftables.Chain{
					Name: chain,
				})
				delete(proxier.staleChains, chain)
				deleted++
			}
		}
		if deleted > 0 {
			proxier.logger.Info("Deleting stale nftables chains", "numChains", deleted)
			err := proxier.nftables.Run(context.TODO(), tx)
			if err != nil {
				// We already deleted the entries from staleChains, but if
				// the chains still exist, they'll just get added back
				// (with a later timestamp) at the end of the sync.
				proxier.logger.Error(err, "Unable to delete stale chains; will retry later")
				metrics.NFTablesCleanupFailuresTotal.Inc()
			}
		}
	}

	// Now start the actual syncing transaction
	tx := proxier.nftables.NewTransaction()
	proxier.setupNFTables(tx)

	// We need to use, eg, "ip daddr" for IPv4 but "ip6 daddr" for IPv6
	ipX := "ip"
	ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value
	if proxier.ipFamily == v1.IPv6Protocol {
		ipX = "ip6"
		ipvX_addr = "ipv6_addr"
	}

	// We currently fully-rebuild our sets and maps on each resync
	tx.Flush(&knftables.Set{
		Name: clusterIPsSet,
	})
	tx.Flush(&knftables.Map{
		Name: firewallIPsMap,
	})
	tx.Flush(&knftables.Map{
		Name: noEndpointServicesMap,
	})
	tx.Flush(&knftables.Map{
		Name: noEndpointNodePortsMap,
	})
	tx.Flush(&knftables.Map{
		Name: serviceIPsMap,
	})
	tx.Flush(&knftables.Map{
		Name: serviceNodePortsMap,
	})

	// Accumulate service/endpoint chains and affinity sets to keep.
	activeChains := sets.New[string]()
	activeAffinitySets := sets.New[string]()

	// Compute total number of endpoint chains across all services
	// to get a sense of how big the cluster is.
	totalEndpoints := 0
	for svcName := range proxier.svcPortMap {
		totalEndpoints += len(proxier.endpointsMap[svcName])
	}

	// These two variables are used to publish the sync_proxy_rules_no_endpoints_total
	// metric.
	serviceNoLocalEndpointsTotalInternal := 0
	serviceNoLocalEndpointsTotalExternal := 0
	// Build rules for each service-port.
	for svcName, svc := range proxier.svcPortMap {
		svcInfo, ok := svc.(*servicePortInfo)
		if !ok {
			proxier.logger.Error(nil, "Failed to cast serviceInfo", "serviceName", svcName)
			continue
		}
		protocol := strings.ToLower(string(svcInfo.Protocol()))
		svcPortNameString := svcInfo.nameString

		// Figure out the endpoints for Cluster and Local traffic policy.
		// allLocallyReachableEndpoints is the set of all endpoints that can be routed to
		// from this node, given the service's traffic policies. hasEndpoints is true
		// if the service has any usable endpoints on any node, not just this one.
		allEndpoints := proxier.endpointsMap[svcName]
		clusterEndpoints, localEndpoints, allLocallyReachableEndpoints, hasEndpoints := proxy.CategorizeEndpoints(allEndpoints, svcInfo, proxier.nodeLabels)

		// Note the endpoint chains that will be used
		for _, ep := range allLocallyReachableEndpoints {
			if epInfo, ok := ep.(*endpointInfo); ok {
				ensureChain(epInfo.chainName, tx, activeChains)
			}
		}

		// clusterPolicyChain contains the endpoints used with "Cluster" traffic policy
		clusterPolicyChain := svcInfo.clusterPolicyChainName
		usesClusterPolicyChain := len(clusterEndpoints) > 0 && svcInfo.UsesClusterEndpoints()
		if usesClusterPolicyChain {
			ensureChain(clusterPolicyChain, tx, activeChains)
		}

		// localPolicyChain contains the endpoints used with "Local" traffic policy
		localPolicyChain := svcInfo.localPolicyChainName
		usesLocalPolicyChain := len(localEndpoints) > 0 && svcInfo.UsesLocalEndpoints()
		if usesLocalPolicyChain {
			ensureChain(localPolicyChain, tx, activeChains)
		}

		// internalPolicyChain is the chain containing the endpoints for
		// "internal" (ClusterIP) traffic. internalTrafficChain is the chain that
		// internal traffic is routed to (which is always the same as
		// internalPolicyChain). hasInternalEndpoints is true if we should
		// generate rules pointing to internalTrafficChain, or false if there are
		// no available internal endpoints.
		internalPolicyChain := clusterPolicyChain
		hasInternalEndpoints := hasEndpoints
		if svcInfo.InternalPolicyLocal() {
			internalPolicyChain = localPolicyChain
			if len(localEndpoints) == 0 {
				hasInternalEndpoints = false
			}
		}
		internalTrafficChain := internalPolicyChain

		// Similarly, externalPolicyChain is the chain containing the endpoints
		// for "external" (NodePort, LoadBalancer, and ExternalIP) traffic.
		// externalTrafficChain is the chain that external traffic is routed to
		// (which is always the service's "EXT" chain). hasExternalEndpoints is
		// true if there are endpoints that will be reached by external traffic.
		// (But we may still have to generate externalTrafficChain even if there
		// are no external endpoints, to ensure that the short-circuit rules for
		// local traffic are set up.)
		externalPolicyChain := clusterPolicyChain
		hasExternalEndpoints := hasEndpoints
		if svcInfo.ExternalPolicyLocal() {
			externalPolicyChain = localPolicyChain
			if len(localEndpoints) == 0 {
				hasExternalEndpoints = false
			}
		}
		externalTrafficChain := svcInfo.externalChainName // eventually jumps to externalPolicyChain

		// usesExternalTrafficChain is based on hasEndpoints, not hasExternalEndpoints,
		// because we need the local-traffic-short-circuiting rules even when there
		// are no externally-usable endpoints.
		usesExternalTrafficChain := hasEndpoints && svcInfo.ExternallyAccessible()
		if usesExternalTrafficChain {
			ensureChain(externalTrafficChain, tx, activeChains)
		}

		var internalTrafficFilterVerdict, externalTrafficFilterVerdict string
		if !hasEndpoints {
			// The service has no endpoints at all; hasInternalEndpoints and
			// hasExternalEndpoints will also be false, and we will not
			// generate any chains in the "nat" table for the service; only
			// rules in the "filter" table rejecting incoming packets for
			// the service's IPs.
			internalTrafficFilterVerdict = fmt.Sprintf("goto %s", rejectChain)
			externalTrafficFilterVerdict = fmt.Sprintf("goto %s", rejectChain)
		} else {
			if !hasInternalEndpoints {
				// The internalTrafficPolicy is "Local" but there are no local
				// endpoints. Traffic to the clusterIP will be dropped, but
				// external traffic may still be accepted.
				internalTrafficFilterVerdict = "drop"
				serviceNoLocalEndpointsTotalInternal++
			}
			if !hasExternalEndpoints {
				// The externalTrafficPolicy is "Local" but there are no
				// local endpoints. Traffic to "external" IPs from outside
				// the cluster will be dropped, but traffic from inside
				// the cluster may still be accepted.
				externalTrafficFilterVerdict = "drop"
				serviceNoLocalEndpointsTotalExternal++
			}
		}

		// Capture the clusterIP.
		tx.Add(&knftables.Element{
			Set: clusterIPsSet,
			Key: []string{svcInfo.ClusterIP().String()},
		})
		if hasInternalEndpoints {
			tx.Add(&knftables.Element{
				Map: serviceIPsMap,
				Key: []string{
					svcInfo.ClusterIP().String(),
					protocol,
					strconv.Itoa(svcInfo.Port()),
				},
				Value: []string{
					fmt.Sprintf("goto %s", internalTrafficChain),
				},
			})
		} else {
			// No endpoints.
			tx.Add(&knftables.Element{
				Map: noEndpointServicesMap,
				Key: []string{
					svcInfo.ClusterIP().String(),
					protocol,
					strconv.Itoa(svcInfo.Port()),
				},
				Value: []string{
					internalTrafficFilterVerdict,
				},
				Comment: &svcPortNameString,
			})
		}
		// Capture externalIPs.
		for _, externalIP := range svcInfo.ExternalIPs() {
			if hasEndpoints {
				// Send traffic bound for external IPs to the "external
				// destinations" chain.
				tx.Add(&knftables.Element{
					Map: serviceIPsMap,
					Key: []string{
						externalIP.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", externalTrafficChain),
					},
				})
			}
			if !hasExternalEndpoints {
				// Either no endpoints at all (REJECT) or no endpoints for
				// external traffic (DROP anything that didn't get
				// short-circuited by the EXT chain.)
				tx.Add(&knftables.Element{
					Map: noEndpointServicesMap,
					Key: []string{
						externalIP.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						externalTrafficFilterVerdict,
					},
					Comment: &svcPortNameString,
				})
			}
		}

		usesFWChain := len(svcInfo.LoadBalancerVIPs()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0
		fwChain := svcInfo.firewallChainName
		if usesFWChain {
			ensureChain(fwChain, tx, activeChains)
			var sources []string
			allowFromNode := false
			for _, cidr := range svcInfo.LoadBalancerSourceRanges() {
				if len(sources) > 0 {
					sources = append(sources, ",")
				}
				sources = append(sources, cidr.String())
				if cidr.Contains(proxier.nodeIP) {
					allowFromNode = true
				}
			}
			// For VIP-like LBs, the VIP is often added as a local
			// address (via an IP route rule). In that case, a request
			// from a node to the VIP will not hit the loadbalancer but
			// will loop back with the source IP set to the VIP. We
			// need the following rules to allow requests from this node.
			if allowFromNode {
				for _, lbip := range svcInfo.LoadBalancerVIPs() {
					sources = append(sources, ",", lbip.String())
				}
			}
			tx.Add(&knftables.Rule{
				Chain: fwChain,
				Rule: knftables.Concat(
					ipX, "saddr", "!=", "{", sources, "}",
					"drop",
				),
			})
		}

		// Capture load-balancer ingress.
		for _, lbip := range svcInfo.LoadBalancerVIPs() {
			if hasEndpoints {
				tx.Add(&knftables.Element{
					Map: serviceIPsMap,
					Key: []string{
						lbip.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", externalTrafficChain),
					},
				})
			}

			if usesFWChain {
				tx.Add(&knftables.Element{
					Map: firewallIPsMap,
					Key: []string{
						lbip.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", fwChain),
					},
					Comment: &svcPortNameString,
				})
			}
		}
		if !hasExternalEndpoints {
			// Either no endpoints at all (REJECT) or no endpoints for
			// external traffic (DROP anything that didn't get short-circuited
			// by the EXT chain.)
			for _, lbip := range svcInfo.LoadBalancerVIPs() {
				tx.Add(&knftables.Element{
					Map: noEndpointServicesMap,
					Key: []string{
						lbip.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						externalTrafficFilterVerdict,
					},
					Comment: &svcPortNameString,
				})
			}
		}

		// Capture nodeports.
		if svcInfo.NodePort() != 0 {
			if hasEndpoints {
				// Jump to the external destination chain. For better or for
				// worse, nodeports are not subject to loadBalancerSourceRanges,
				// and we can't change that.
				tx.Add(&knftables.Element{
					Map: serviceNodePortsMap,
					Key: []string{
						protocol,
						strconv.Itoa(svcInfo.NodePort()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", externalTrafficChain),
					},
				})
			}
			if !hasExternalEndpoints {
				// Either no endpoints at all (REJECT) or no endpoints for
				// external traffic (DROP anything that didn't get
				// short-circuited by the EXT chain.)
				tx.Add(&knftables.Element{
					Map: noEndpointNodePortsMap,
					Key: []string{
						protocol,
						strconv.Itoa(svcInfo.NodePort()),
					},
					Value: []string{
						externalTrafficFilterVerdict,
					},
					Comment: &svcPortNameString,
				})
			}
		}
		// Set up internal traffic handling.
		if hasInternalEndpoints {
			if proxier.masqueradeAll {
				tx.Add(&knftables.Rule{
					Chain: internalTrafficChain,
					Rule: knftables.Concat(
						ipX, "daddr", svcInfo.ClusterIP(),
						protocol, "dport", svcInfo.Port(),
						"jump", markMasqChain,
					),
				})
			} else if proxier.localDetector.IsImplemented() {
				// This masquerades off-cluster traffic to a service VIP. The
				// idea is that you can establish a static route for your
				// Service range, routing to any node, and that node will
				// bridge into the Service for you. Since that might bounce
				// off-node, we masquerade here.
				tx.Add(&knftables.Rule{
					Chain: internalTrafficChain,
					Rule: knftables.Concat(
						ipX, "daddr", svcInfo.ClusterIP(),
						protocol, "dport", svcInfo.Port(),
						proxier.localDetector.IfNotLocalNFT(),
						"jump", markMasqChain,
					),
				})
			}
		}

		// Set up external traffic handling (if any "external" destinations are
		// enabled). All captured traffic for all external destinations should
		// jump to externalTrafficChain, which will handle some special cases and
		// then jump to externalPolicyChain.
		if usesExternalTrafficChain {
			if !svcInfo.ExternalPolicyLocal() {
				// If we are using non-local endpoints we need to masquerade,
				// in case we cross nodes.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"jump", markMasqChain,
					),
				})
			} else {
				// If we are only using same-node endpoints, we can retain the
				// source IP in most cases.

				if proxier.localDetector.IsImplemented() {
					// Treat all locally-originated pod -> external destination
					// traffic as a special-case. It is subject to neither
					// form of traffic policy, which simulates going up-and-out
					// to an external load-balancer and coming back in.
					tx.Add(&knftables.Rule{
						Chain: externalTrafficChain,
						Rule: knftables.Concat(
							proxier.localDetector.IfLocalNFT(),
							"goto", clusterPolicyChain,
						),
						Comment: ptr.To("short-circuit pod traffic"),
					})
				}

				// Locally originated traffic (not a pod, but the host node)
				// still needs masquerade because the LBIP itself is a local
				// address, so that will be the chosen source IP.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"fib", "saddr", "type", "local",
						"jump", markMasqChain,
					),
					Comment: ptr.To("masquerade local traffic"),
				})

				// Redirect all src-type=LOCAL -> external destination to the
				// policy=cluster chain. This allows traffic originating
				// from the host to be redirected to the service correctly.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"fib", "saddr", "type", "local",
						"goto", clusterPolicyChain,
					),
					Comment: ptr.To("short-circuit local traffic"),
				})
			}

			// Anything else falls thru to the appropriate policy chain.
			if hasExternalEndpoints {
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"goto", externalPolicyChain,
					),
				})
			}
		}

		if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
			// Generate the per-endpoint affinity sets
			for _, ep := range allLocallyReachableEndpoints {
				epInfo, ok := ep.(*endpointInfo)
				if !ok {
					proxier.logger.Error(nil, "Failed to cast endpointsInfo", "endpointsInfo", ep)
					continue
				}

				// Create a set to store current affinity mappings. As
				// with the iptables backend, endpoint affinity is
				// recorded for connections from a particular source IP
				// (without regard to source port) to a particular
				// ServicePort (without regard to which service IP was
				// used to reach the service). This may be changed in the
				// future.
				tx.Add(&knftables.Set{
					Name: epInfo.affinitySetName,
					Type: ipvX_addr,
					Flags: []knftables.SetFlag{
						// The nft docs say "dynamic" is only
						// needed for sets containing stateful
						// objects (eg counters), but (at least on
						// RHEL8) if we create the set without
						// "dynamic", it later gets mutated to
						// have it, and then the next attempt to
						// tx.Add() it here fails because it looks
						// like we're trying to change the flags.
						knftables.DynamicFlag,
						knftables.TimeoutFlag,
					},
					Timeout: ptr.To(time.Duration(svcInfo.StickyMaxAgeSeconds()) * time.Second),
				})
				activeAffinitySets.Insert(epInfo.affinitySetName)
			}
		}

		// If Cluster policy is in use, create the chain and create rules jumping
		// from clusterPolicyChain to the clusterEndpoints
		if usesClusterPolicyChain {
			proxier.writeServiceToEndpointRules(tx, svcPortNameString, svcInfo, clusterPolicyChain, clusterEndpoints)
		}

		// If Local policy is in use, create rules jumping from localPolicyChain
		// to the localEndpoints
		if usesLocalPolicyChain {
			proxier.writeServiceToEndpointRules(tx, svcPortNameString, svcInfo, localPolicyChain, localEndpoints)
		}

		// Generate the per-endpoint chains
		for _, ep := range allLocallyReachableEndpoints {
			epInfo, ok := ep.(*endpointInfo)
			if !ok {
				proxier.logger.Error(nil, "Failed to cast endpointInfo", "endpointInfo", ep)
				continue
			}

			endpointChain := epInfo.chainName

			// Handle traffic that loops back to the originator with SNAT.
			tx.Add(&knftables.Rule{
				Chain: endpointChain,
				Rule: knftables.Concat(
					ipX, "saddr", epInfo.IP(),
					"jump", markMasqChain,
				),
			})

			// Handle session affinity
			if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
				tx.Add(&knftables.Rule{
					Chain: endpointChain,
					Rule: knftables.Concat(
						"update", "@", epInfo.affinitySetName,
						"{", ipX, "saddr", "}",
					),
				})
			}

			// DNAT to final destination.
			tx.Add(&knftables.Rule{
				Chain: endpointChain,
				Rule: knftables.Concat(
					"meta l4proto", protocol,
					"dnat to", epInfo.String(),
				),
			})
		}
	}
	// Figure out which chains are now stale. Unfortunately, we can't delete them
	// right away, because with kernels before 6.2, if there is a map element pointing
	// to a chain, and you delete that map element, the kernel doesn't notice until a
	// short amount of time later that the chain is now unreferenced. So we flush them
	// now, and record the time that they become stale in staleChains so they can be
	// deleted later.
	existingChains, err := proxier.nftables.List(context.TODO(), "chains")
	if err == nil {
		for _, chain := range existingChains {
			if isServiceChainName(chain) {
				if !activeChains.Has(chain) {
					tx.Flush(&knftables.Chain{
						Name: chain,
					})
					proxier.staleChains[chain] = start
				} else {
					delete(proxier.staleChains, chain)
				}
			}
		}
	} else if !knftables.IsNotFound(err) {
		proxier.logger.Error(err, "Failed to list nftables chains: stale chains will not be deleted")
	}

	// OTOH, we can immediately delete any stale affinity sets
	existingSets, err := proxier.nftables.List(context.TODO(), "sets")
	if err == nil {
		for _, set := range existingSets {
			if isAffinitySetName(set) && !activeAffinitySets.Has(set) {
				tx.Delete(&knftables.Set{
					Name: set,
				})
			}
		}
	} else if !knftables.IsNotFound(err) {
		proxier.logger.Error(err, "Failed to list nftables sets: stale affinity sets will not be deleted")
	}

	// Sync rules.
	proxier.logger.V(2).Info("Reloading service nftables data",
		"numServices", len(proxier.svcPortMap),
		"numEndpoints", totalEndpoints,
	)

	if klogV9 := klog.V(9); klogV9.Enabled() {
		klogV9.InfoS("Running nftables transaction", "transaction", tx.String())
	}

	err = proxier.nftables.Run(context.TODO(), tx)
	if err != nil {
		proxier.logger.Error(err, "nftables sync failed")
		metrics.NFTablesSyncFailuresTotal.Inc()

		// staleChains is now incorrect since we didn't actually flush the
		// chains in it. We can recompute it next time.
		clear(proxier.staleChains)
		return
	}
	success = true

	for name, lastChangeTriggerTimes := range endpointUpdateResult.LastChangeTriggerTimes {
		for _, lastChangeTriggerTime := range lastChangeTriggerTimes {
			latency := metrics.SinceInSeconds(lastChangeTriggerTime)
			metrics.NetworkProgrammingLatency.Observe(latency)
			proxier.logger.V(4).Info("Network programming", "endpoint", klog.KRef(name.Namespace, name.Name), "elapsed", latency)
		}
	}

	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("internal").Set(float64(serviceNoLocalEndpointsTotalInternal))
	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("external").Set(float64(serviceNoLocalEndpointsTotalExternal))
	if proxier.healthzServer != nil {
		proxier.healthzServer.Updated(proxier.ipFamily)
	}
	metrics.SyncProxyRulesLastTimestamp.SetToCurrentTime()

	// Update service healthchecks. The endpoints list might include services that are
	// not "OnlyLocal", but the services list will not, and the serviceHealthServer
	// will just drop those endpoints.
	if err := proxier.serviceHealthServer.SyncServices(proxier.svcPortMap.HealthCheckNodePorts()); err != nil {
		proxier.logger.Error(err, "Error syncing healthcheck services")
	}
	if err := proxier.serviceHealthServer.SyncEndpoints(proxier.endpointsMap.LocalReadyEndpoints()); err != nil {
		proxier.logger.Error(err, "Error syncing healthcheck endpoints")
	}

	// Finish housekeeping, clear stale conntrack entries for UDP Services
	conntrack.CleanStaleEntries(proxier.conntrack, proxier.svcPortMap, serviceUpdateResult, endpointUpdateResult)
}

func (proxier *Proxier) writeServiceToEndpointRules(tx *knftables.Transaction, svcPortNameString string, svcInfo *servicePortInfo, svcChain string, endpoints []proxy.Endpoint) {
	// First write session affinity rules, if applicable.
	if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
		ipX := "ip"
		if proxier.ipFamily == v1.IPv6Protocol {
			ipX = "ip6"
		}

		for _, ep := range endpoints {
			epInfo, ok := ep.(*endpointInfo)
			if !ok {
				continue
			}

			tx.Add(&knftables.Rule{
				Chain: svcChain,
				Rule: knftables.Concat(
					ipX, "saddr", "@", epInfo.affinitySetName,
					"goto", epInfo.chainName,
				),
			})
		}
	}

	// Now write loadbalancing rule
	var elements []string
	for i, ep := range endpoints {
		epInfo, ok := ep.(*endpointInfo)
		if !ok {
			continue
		}

		elements = append(elements,
			strconv.Itoa(i), ":", "goto", epInfo.chainName,
		)
		if i != len(endpoints)-1 {
			elements = append(elements, ",")
		}
	}
	tx.Add(&knftables.Rule{
		Chain: svcChain,
		Rule: knftables.Concat(
			"numgen random mod", len(endpoints), "vmap",
			"{", elements, "}",
		),
	})
}
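// Illustrative result (example names; the first endpoint chain is the sample
// from the naming comments above, the second uses a placeholder hash): for a
// two-endpoint service, writeServiceToEndpointRules emits a random
// load-balancing vmap rule roughly equivalent to the nft rule
//
//	numgen random mod 2 vmap {
//		0 : goto endpoint-5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80,
//		1 : goto endpoint-<HASH>-ns1/svc1/tcp/p80__10.180.0.2/80 }
//
// so each new connection picks one endpoint chain uniformly at random.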