k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cmd/kube-proxy/app/server_linux.go (about) 1 //go:build linux 2 // +build linux 3 4 /* 5 Copyright 2014 The Kubernetes Authors. 6 7 Licensed under the Apache License, Version 2.0 (the "License"); 8 you may not use this file except in compliance with the License. 9 You may obtain a copy of the License at 10 11 http://www.apache.org/licenses/LICENSE-2.0 12 13 Unless required by applicable law or agreed to in writing, software 14 distributed under the License is distributed on an "AS IS" BASIS, 15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 See the License for the specific language governing permissions and 17 limitations under the License. 18 */ 19 20 // Package app does all of the work necessary to configure and run a 21 // Kubernetes app process. 22 package app 23 24 import ( 25 "context" 26 "errors" 27 "fmt" 28 goruntime "runtime" 29 "strings" 30 "time" 31 32 "github.com/google/cadvisor/machine" 33 "github.com/google/cadvisor/utils/sysfs" 34 35 v1 "k8s.io/api/core/v1" 36 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 37 "k8s.io/apimachinery/pkg/fields" 38 "k8s.io/apimachinery/pkg/runtime" 39 "k8s.io/apimachinery/pkg/watch" 40 utilfeature "k8s.io/apiserver/pkg/util/feature" 41 clientset "k8s.io/client-go/kubernetes" 42 "k8s.io/client-go/tools/cache" 43 toolswatch "k8s.io/client-go/tools/watch" 44 utilsysctl "k8s.io/component-helpers/node/util/sysctl" 45 "k8s.io/klog/v2" 46 "k8s.io/kubernetes/pkg/features" 47 "k8s.io/kubernetes/pkg/proxy" 48 proxyconfigapi "k8s.io/kubernetes/pkg/proxy/apis/config" 49 "k8s.io/kubernetes/pkg/proxy/iptables" 50 "k8s.io/kubernetes/pkg/proxy/ipvs" 51 utilipset "k8s.io/kubernetes/pkg/proxy/ipvs/ipset" 52 utilipvs "k8s.io/kubernetes/pkg/proxy/ipvs/util" 53 "k8s.io/kubernetes/pkg/proxy/nftables" 54 proxyutil "k8s.io/kubernetes/pkg/proxy/util" 55 utiliptables "k8s.io/kubernetes/pkg/util/iptables" 56 "k8s.io/utils/exec" 57 ) 58 59 // timeoutForNodePodCIDR is the time to wait for allocators to assign a PodCIDR to the 60 // node after it is registered. 61 var timeoutForNodePodCIDR = 5 * time.Minute 62 63 // platformApplyDefaults is called after parsing command-line flags and/or reading the 64 // config file, to apply platform-specific default values to config. 65 func (o *Options) platformApplyDefaults(config *proxyconfigapi.KubeProxyConfiguration) { 66 if config.Mode == "" { 67 o.logger.Info("Using iptables proxy") 68 config.Mode = proxyconfigapi.ProxyModeIPTables 69 } 70 71 if config.Mode == proxyconfigapi.ProxyModeNFTables && len(config.NodePortAddresses) == 0 { 72 config.NodePortAddresses = []string{proxyconfigapi.NodePortAddressesPrimary} 73 } 74 75 if config.DetectLocalMode == "" { 76 o.logger.V(4).Info("Defaulting detect-local-mode", "localModeClusterCIDR", string(proxyconfigapi.LocalModeClusterCIDR)) 77 config.DetectLocalMode = proxyconfigapi.LocalModeClusterCIDR 78 } 79 o.logger.V(2).Info("DetectLocalMode", "localMode", string(config.DetectLocalMode)) 80 } 81 82 // platformSetup is called after setting up the ProxyServer, but before creating the 83 // Proxier. It should fill in any platform-specific fields and perform other 84 // platform-specific setup. 85 func (s *ProxyServer) platformSetup(ctx context.Context) error { 86 logger := klog.FromContext(ctx) 87 if s.Config.DetectLocalMode == proxyconfigapi.LocalModeNodeCIDR { 88 logger.Info("Watching for node, awaiting podCIDR allocation", "hostname", s.Hostname) 89 node, err := waitForPodCIDR(ctx, s.Client, s.Hostname) 90 if err != nil { 91 return err 92 } 93 s.podCIDRs = node.Spec.PodCIDRs 94 logger.Info("NodeInfo", "podCIDRs", node.Spec.PodCIDRs) 95 } 96 97 err := s.setupConntrack(ctx) 98 if err != nil { 99 return err 100 } 101 102 return nil 103 } 104 105 // isIPTablesBased checks whether mode is based on iptables rather than nftables 106 func isIPTablesBased(mode proxyconfigapi.ProxyMode) bool { 107 return mode == proxyconfigapi.ProxyModeIPTables || mode == proxyconfigapi.ProxyModeIPVS 108 } 109 110 // getIPTables returns an array of [IPv4, IPv6] utiliptables.Interfaces. If primaryFamily 111 // is not v1.IPFamilyUnknown then it will also separately return the interface for just 112 // that family. 113 func getIPTables(primaryFamily v1.IPFamily) ([2]utiliptables.Interface, utiliptables.Interface) { 114 execer := exec.New() 115 116 // Create iptables handlers for both families. Always ordered as IPv4, IPv6 117 ipt := [2]utiliptables.Interface{ 118 utiliptables.New(execer, utiliptables.ProtocolIPv4), 119 utiliptables.New(execer, utiliptables.ProtocolIPv6), 120 } 121 122 var iptInterface utiliptables.Interface 123 if primaryFamily == v1.IPv4Protocol { 124 iptInterface = ipt[0] 125 } else if primaryFamily == v1.IPv6Protocol { 126 iptInterface = ipt[1] 127 } 128 129 return ipt, iptInterface 130 } 131 132 // platformCheckSupported is called immediately before creating the Proxier, to check 133 // what IP families are supported (and whether the configuration is usable at all). 134 func (s *ProxyServer) platformCheckSupported(ctx context.Context) (ipv4Supported, ipv6Supported, dualStackSupported bool, err error) { 135 logger := klog.FromContext(ctx) 136 137 if isIPTablesBased(s.Config.Mode) { 138 ipt, _ := getIPTables(v1.IPFamilyUnknown) 139 ipv4Supported = ipt[0].Present() 140 ipv6Supported = ipt[1].Present() 141 142 if !ipv4Supported && !ipv6Supported { 143 err = fmt.Errorf("iptables is not available on this host") 144 } else if !ipv4Supported { 145 logger.Info("No iptables support for family", "ipFamily", v1.IPv4Protocol) 146 } else if !ipv6Supported { 147 logger.Info("No iptables support for family", "ipFamily", v1.IPv6Protocol) 148 } 149 } else { 150 // Assume support for both families. 151 // FIXME: figure out how to check for kernel IPv6 support using nft 152 ipv4Supported, ipv6Supported = true, true 153 } 154 155 // The Linux proxies can always support dual-stack if they can support both IPv4 156 // and IPv6. 157 dualStackSupported = ipv4Supported && ipv6Supported 158 return 159 } 160 161 // createProxier creates the proxy.Provider 162 func (s *ProxyServer) createProxier(ctx context.Context, config *proxyconfigapi.KubeProxyConfiguration, dualStack, initOnly bool) (proxy.Provider, error) { 163 logger := klog.FromContext(ctx) 164 var proxier proxy.Provider 165 var err error 166 167 localDetectors := getLocalDetectors(logger, s.PrimaryIPFamily, config, s.podCIDRs) 168 169 if config.Mode == proxyconfigapi.ProxyModeIPTables { 170 logger.Info("Using iptables Proxier") 171 172 if dualStack { 173 ipt, _ := getIPTables(s.PrimaryIPFamily) 174 175 // TODO this has side effects that should only happen when Run() is invoked. 176 proxier, err = iptables.NewDualStackProxier( 177 ctx, 178 ipt, 179 utilsysctl.New(), 180 exec.New(), 181 config.IPTables.SyncPeriod.Duration, 182 config.IPTables.MinSyncPeriod.Duration, 183 config.IPTables.MasqueradeAll, 184 *config.IPTables.LocalhostNodePorts, 185 int(*config.IPTables.MasqueradeBit), 186 localDetectors, 187 s.Hostname, 188 s.NodeIPs, 189 s.Recorder, 190 s.HealthzServer, 191 config.NodePortAddresses, 192 initOnly, 193 ) 194 } else { 195 // Create a single-stack proxier if and only if the node does not support dual-stack (i.e, no iptables support). 196 _, iptInterface := getIPTables(s.PrimaryIPFamily) 197 198 // TODO this has side effects that should only happen when Run() is invoked. 199 proxier, err = iptables.NewProxier( 200 ctx, 201 s.PrimaryIPFamily, 202 iptInterface, 203 utilsysctl.New(), 204 exec.New(), 205 config.IPTables.SyncPeriod.Duration, 206 config.IPTables.MinSyncPeriod.Duration, 207 config.IPTables.MasqueradeAll, 208 *config.IPTables.LocalhostNodePorts, 209 int(*config.IPTables.MasqueradeBit), 210 localDetectors[s.PrimaryIPFamily], 211 s.Hostname, 212 s.NodeIPs[s.PrimaryIPFamily], 213 s.Recorder, 214 s.HealthzServer, 215 config.NodePortAddresses, 216 initOnly, 217 ) 218 } 219 220 if err != nil { 221 return nil, fmt.Errorf("unable to create proxier: %v", err) 222 } 223 } else if config.Mode == proxyconfigapi.ProxyModeIPVS { 224 execer := exec.New() 225 ipsetInterface := utilipset.New(execer) 226 ipvsInterface := utilipvs.New() 227 if err := ipvs.CanUseIPVSProxier(ctx, ipvsInterface, ipsetInterface, config.IPVS.Scheduler); err != nil { 228 return nil, fmt.Errorf("can't use the IPVS proxier: %v", err) 229 } 230 231 logger.Info("Using ipvs Proxier") 232 if dualStack { 233 ipt, _ := getIPTables(s.PrimaryIPFamily) 234 proxier, err = ipvs.NewDualStackProxier( 235 ctx, 236 ipt, 237 ipvsInterface, 238 ipsetInterface, 239 utilsysctl.New(), 240 execer, 241 config.IPVS.SyncPeriod.Duration, 242 config.IPVS.MinSyncPeriod.Duration, 243 config.IPVS.ExcludeCIDRs, 244 config.IPVS.StrictARP, 245 config.IPVS.TCPTimeout.Duration, 246 config.IPVS.TCPFinTimeout.Duration, 247 config.IPVS.UDPTimeout.Duration, 248 config.IPTables.MasqueradeAll, 249 int(*config.IPTables.MasqueradeBit), 250 localDetectors, 251 s.Hostname, 252 s.NodeIPs, 253 s.Recorder, 254 s.HealthzServer, 255 config.IPVS.Scheduler, 256 config.NodePortAddresses, 257 initOnly, 258 ) 259 } else { 260 _, iptInterface := getIPTables(s.PrimaryIPFamily) 261 proxier, err = ipvs.NewProxier( 262 ctx, 263 s.PrimaryIPFamily, 264 iptInterface, 265 ipvsInterface, 266 ipsetInterface, 267 utilsysctl.New(), 268 execer, 269 config.IPVS.SyncPeriod.Duration, 270 config.IPVS.MinSyncPeriod.Duration, 271 config.IPVS.ExcludeCIDRs, 272 config.IPVS.StrictARP, 273 config.IPVS.TCPTimeout.Duration, 274 config.IPVS.TCPFinTimeout.Duration, 275 config.IPVS.UDPTimeout.Duration, 276 config.IPTables.MasqueradeAll, 277 int(*config.IPTables.MasqueradeBit), 278 localDetectors[s.PrimaryIPFamily], 279 s.Hostname, 280 s.NodeIPs[s.PrimaryIPFamily], 281 s.Recorder, 282 s.HealthzServer, 283 config.IPVS.Scheduler, 284 config.NodePortAddresses, 285 initOnly, 286 ) 287 } 288 if err != nil { 289 return nil, fmt.Errorf("unable to create proxier: %v", err) 290 } 291 } else if config.Mode == proxyconfigapi.ProxyModeNFTables { 292 logger.Info("Using nftables Proxier") 293 294 if dualStack { 295 // TODO this has side effects that should only happen when Run() is invoked. 296 proxier, err = nftables.NewDualStackProxier( 297 ctx, 298 utilsysctl.New(), 299 config.NFTables.SyncPeriod.Duration, 300 config.NFTables.MinSyncPeriod.Duration, 301 config.NFTables.MasqueradeAll, 302 int(*config.NFTables.MasqueradeBit), 303 localDetectors, 304 s.Hostname, 305 s.NodeIPs, 306 s.Recorder, 307 s.HealthzServer, 308 config.NodePortAddresses, 309 initOnly, 310 ) 311 } else { 312 // Create a single-stack proxier if and only if the node does not support dual-stack 313 // TODO this has side effects that should only happen when Run() is invoked. 314 proxier, err = nftables.NewProxier( 315 ctx, 316 s.PrimaryIPFamily, 317 utilsysctl.New(), 318 config.NFTables.SyncPeriod.Duration, 319 config.NFTables.MinSyncPeriod.Duration, 320 config.NFTables.MasqueradeAll, 321 int(*config.NFTables.MasqueradeBit), 322 localDetectors[s.PrimaryIPFamily], 323 s.Hostname, 324 s.NodeIPs[s.PrimaryIPFamily], 325 s.Recorder, 326 s.HealthzServer, 327 config.NodePortAddresses, 328 initOnly, 329 ) 330 } 331 332 if err != nil { 333 return nil, fmt.Errorf("unable to create proxier: %v", err) 334 } 335 } 336 337 return proxier, nil 338 } 339 340 func (s *ProxyServer) setupConntrack(ctx context.Context) error { 341 ct := &realConntracker{} 342 343 max, err := getConntrackMax(ctx, s.Config.Conntrack) 344 if err != nil { 345 return err 346 } 347 if max > 0 { 348 err := ct.SetMax(ctx, max) 349 if err != nil { 350 if err != errReadOnlySysFS { 351 return err 352 } 353 // errReadOnlySysFS is caused by a known docker issue (https://github.com/docker/docker/issues/24000), 354 // the only remediation we know is to restart the docker daemon. 355 // Here we'll send an node event with specific reason and message, the 356 // administrator should decide whether and how to handle this issue, 357 // whether to drain the node and restart docker. Occurs in other container runtimes 358 // as well. 359 // TODO(random-liu): Remove this when the docker bug is fixed. 360 const message = "CRI error: /sys is read-only: " + 361 "cannot modify conntrack limits, problems may arise later (If running Docker, see docker issue #24000)" 362 s.Recorder.Eventf(s.NodeRef, nil, v1.EventTypeWarning, err.Error(), "StartKubeProxy", message) 363 } 364 } 365 366 if s.Config.Conntrack.TCPEstablishedTimeout != nil && s.Config.Conntrack.TCPEstablishedTimeout.Duration > 0 { 367 timeout := int(s.Config.Conntrack.TCPEstablishedTimeout.Duration / time.Second) 368 if err := ct.SetTCPEstablishedTimeout(ctx, timeout); err != nil { 369 return err 370 } 371 } 372 373 if s.Config.Conntrack.TCPCloseWaitTimeout != nil && s.Config.Conntrack.TCPCloseWaitTimeout.Duration > 0 { 374 timeout := int(s.Config.Conntrack.TCPCloseWaitTimeout.Duration / time.Second) 375 if err := ct.SetTCPCloseWaitTimeout(ctx, timeout); err != nil { 376 return err 377 } 378 } 379 380 if s.Config.Conntrack.TCPBeLiberal { 381 if err := ct.SetTCPBeLiberal(ctx, 1); err != nil { 382 return err 383 } 384 } 385 386 if s.Config.Conntrack.UDPTimeout.Duration > 0 { 387 timeout := int(s.Config.Conntrack.UDPTimeout.Duration / time.Second) 388 if err := ct.SetUDPTimeout(ctx, timeout); err != nil { 389 return err 390 } 391 } 392 393 if s.Config.Conntrack.UDPStreamTimeout.Duration > 0 { 394 timeout := int(s.Config.Conntrack.UDPStreamTimeout.Duration / time.Second) 395 if err := ct.SetUDPStreamTimeout(ctx, timeout); err != nil { 396 return err 397 } 398 } 399 400 return nil 401 } 402 403 func getConntrackMax(ctx context.Context, config proxyconfigapi.KubeProxyConntrackConfiguration) (int, error) { 404 logger := klog.FromContext(ctx) 405 if config.MaxPerCore != nil && *config.MaxPerCore > 0 { 406 floor := 0 407 if config.Min != nil { 408 floor = int(*config.Min) 409 } 410 scaled := int(*config.MaxPerCore) * detectNumCPU() 411 if scaled > floor { 412 logger.V(3).Info("GetConntrackMax: using scaled conntrack-max-per-core") 413 return scaled, nil 414 } 415 logger.V(3).Info("GetConntrackMax: using conntrack-min") 416 return floor, nil 417 } 418 return 0, nil 419 } 420 421 func waitForPodCIDR(ctx context.Context, client clientset.Interface, nodeName string) (*v1.Node, error) { 422 // since allocators can assign the podCIDR after the node registers, we do a watch here to wait 423 // for podCIDR to be assigned, instead of assuming that the Get() on startup will have it. 424 ctx, cancelFunc := context.WithTimeout(ctx, timeoutForNodePodCIDR) 425 defer cancelFunc() 426 427 fieldSelector := fields.OneTermEqualSelector("metadata.name", nodeName).String() 428 lw := &cache.ListWatch{ 429 ListFunc: func(options metav1.ListOptions) (object runtime.Object, e error) { 430 options.FieldSelector = fieldSelector 431 return client.CoreV1().Nodes().List(ctx, options) 432 }, 433 WatchFunc: func(options metav1.ListOptions) (i watch.Interface, e error) { 434 options.FieldSelector = fieldSelector 435 return client.CoreV1().Nodes().Watch(ctx, options) 436 }, 437 } 438 condition := func(event watch.Event) (bool, error) { 439 // don't process delete events 440 if event.Type != watch.Modified && event.Type != watch.Added { 441 return false, nil 442 } 443 444 n, ok := event.Object.(*v1.Node) 445 if !ok { 446 return false, fmt.Errorf("event object not of type Node") 447 } 448 // don't consider the node if is going to be deleted and keep waiting 449 if !n.DeletionTimestamp.IsZero() { 450 return false, nil 451 } 452 return n.Spec.PodCIDR != "" && len(n.Spec.PodCIDRs) > 0, nil 453 } 454 455 evt, err := toolswatch.UntilWithSync(ctx, lw, &v1.Node{}, nil, condition) 456 if err != nil { 457 return nil, fmt.Errorf("timeout waiting for PodCIDR allocation to configure detect-local-mode %v: %v", proxyconfigapi.LocalModeNodeCIDR, err) 458 } 459 if n, ok := evt.Object.(*v1.Node); ok { 460 return n, nil 461 } 462 return nil, fmt.Errorf("event object not of type node") 463 } 464 465 func detectNumCPU() int { 466 // try get numCPU from /sys firstly due to a known issue (https://github.com/kubernetes/kubernetes/issues/99225) 467 _, numCPU, err := machine.GetTopology(sysfs.NewRealSysFs()) 468 if err != nil || numCPU < 1 { 469 return goruntime.NumCPU() 470 } 471 return numCPU 472 } 473 474 func getLocalDetectors(logger klog.Logger, primaryIPFamily v1.IPFamily, config *proxyconfigapi.KubeProxyConfiguration, nodePodCIDRs []string) map[v1.IPFamily]proxyutil.LocalTrafficDetector { 475 localDetectors := map[v1.IPFamily]proxyutil.LocalTrafficDetector{ 476 v1.IPv4Protocol: proxyutil.NewNoOpLocalDetector(), 477 v1.IPv6Protocol: proxyutil.NewNoOpLocalDetector(), 478 } 479 480 switch config.DetectLocalMode { 481 case proxyconfigapi.LocalModeClusterCIDR: 482 clusterCIDRs := strings.Split(strings.TrimSpace(config.ClusterCIDR), ",") 483 for family, cidrs := range proxyutil.MapCIDRsByIPFamily(clusterCIDRs) { 484 localDetectors[family] = proxyutil.NewDetectLocalByCIDR(cidrs[0].String()) 485 } 486 if !localDetectors[primaryIPFamily].IsImplemented() { 487 logger.Info("Detect-local-mode set to ClusterCIDR, but no cluster CIDR specified for primary IP family", "ipFamily", primaryIPFamily, "clusterCIDR", config.ClusterCIDR) 488 } 489 490 case proxyconfigapi.LocalModeNodeCIDR: 491 for family, cidrs := range proxyutil.MapCIDRsByIPFamily(nodePodCIDRs) { 492 localDetectors[family] = proxyutil.NewDetectLocalByCIDR(cidrs[0].String()) 493 } 494 if !localDetectors[primaryIPFamily].IsImplemented() { 495 logger.Info("Detect-local-mode set to NodeCIDR, but no PodCIDR defined at node for primary IP family", "ipFamily", primaryIPFamily, "podCIDRs", nodePodCIDRs) 496 } 497 498 case proxyconfigapi.LocalModeBridgeInterface: 499 localDetector := proxyutil.NewDetectLocalByBridgeInterface(config.DetectLocal.BridgeInterface) 500 localDetectors[v1.IPv4Protocol] = localDetector 501 localDetectors[v1.IPv6Protocol] = localDetector 502 503 case proxyconfigapi.LocalModeInterfaceNamePrefix: 504 localDetector := proxyutil.NewDetectLocalByInterfaceNamePrefix(config.DetectLocal.InterfaceNamePrefix) 505 localDetectors[v1.IPv4Protocol] = localDetector 506 localDetectors[v1.IPv6Protocol] = localDetector 507 508 default: 509 logger.Info("Defaulting to no-op detect-local") 510 } 511 512 return localDetectors 513 } 514 515 // platformCleanup removes stale kube-proxy rules that can be safely removed. If 516 // cleanupAndExit is true, it will attempt to remove rules from all known kube-proxy 517 // modes. If it is false, it will only remove rules that are definitely not in use by the 518 // currently-configured mode. 519 func platformCleanup(ctx context.Context, mode proxyconfigapi.ProxyMode, cleanupAndExit bool) error { 520 var encounteredError bool 521 522 // Clean up iptables and ipvs rules if switching to nftables, or if cleanupAndExit 523 if !isIPTablesBased(mode) || cleanupAndExit { 524 ipts, _ := getIPTables(v1.IPFamilyUnknown) 525 execer := exec.New() 526 ipsetInterface := utilipset.New(execer) 527 ipvsInterface := utilipvs.New() 528 529 for _, ipt := range ipts { 530 encounteredError = iptables.CleanupLeftovers(ctx, ipt) || encounteredError 531 encounteredError = ipvs.CleanupLeftovers(ctx, ipvsInterface, ipt, ipsetInterface) || encounteredError 532 } 533 } 534 535 if utilfeature.DefaultFeatureGate.Enabled(features.NFTablesProxyMode) { 536 // Clean up nftables rules when switching to iptables or ipvs, or if cleanupAndExit 537 if isIPTablesBased(mode) || cleanupAndExit { 538 encounteredError = nftables.CleanupLeftovers(ctx) || encounteredError 539 } 540 } 541 542 if encounteredError { 543 return errors.New("encountered an error while tearing down rules") 544 } 545 return nil 546 }