github.com/looshlee/beatles@v0.0.0-20220727174639-742810ab631c/pkg/datapath/iptables/iptables.go (about) 1 // Copyright 2016-2019 Authors of Cilium 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package iptables 16 17 import ( 18 "bufio" 19 "bytes" 20 "fmt" 21 "regexp" 22 "strings" 23 "time" 24 25 "github.com/cilium/cilium/pkg/byteorder" 26 "github.com/cilium/cilium/pkg/command/exec" 27 "github.com/cilium/cilium/pkg/datapath/linux/linux_defaults" 28 "github.com/cilium/cilium/pkg/defaults" 29 "github.com/cilium/cilium/pkg/logging/logfields" 30 "github.com/cilium/cilium/pkg/modules" 31 "github.com/cilium/cilium/pkg/node" 32 "github.com/cilium/cilium/pkg/option" 33 "github.com/cilium/cilium/pkg/sysctl" 34 "github.com/cilium/cilium/pkg/versioncheck" 35 36 go_version "github.com/hashicorp/go-version" 37 "github.com/mattn/go-shellwords" 38 ) 39 40 const ( 41 ciliumPrefix = "CILIUM_" 42 ciliumInputChain = "CILIUM_INPUT" 43 ciliumOutputChain = "CILIUM_OUTPUT" 44 ciliumOutputRawChain = "CILIUM_OUTPUT_raw" 45 ciliumPostNatChain = "CILIUM_POST_nat" 46 ciliumOutputNatChain = "CILIUM_OUTPUT_nat" 47 ciliumPreNatChain = "CILIUM_PRE_nat" 48 ciliumPostMangleChain = "CILIUM_POST_mangle" 49 ciliumPreMangleChain = "CILIUM_PRE_mangle" 50 ciliumPreRawChain = "CILIUM_PRE_raw" 51 ciliumForwardChain = "CILIUM_FORWARD" 52 ciliumTransientForwardChain = "CILIUM_TRANSIENT_FORWARD" 53 feederDescription = "cilium-feeder:" 54 xfrmDescription = "cilium-xfrm-notrack:" 55 ) 56 57 // Minimum iptables versions supporting the -w and -w<seconds> flags 58 var ( 59 waitMinVersion = versioncheck.MustCompile(">=v1.4.20") 60 waitSecondsMinVersion = versioncheck.MustCompile(">=v1.4.22") 61 ) 62 63 const ( 64 waitString = "-w" 65 ) 66 67 type customChain struct { 68 name string 69 table string 70 hook string 71 feederArgs []string 72 ipv6 bool // ip6tables chain in addition to iptables chain 73 } 74 75 func getVersion(prog string) (*go_version.Version, error) { 76 b, err := exec.WithTimeout(defaults.ExecTimeout, prog, "--version").CombinedOutput(log, false) 77 if err != nil { 78 return nil, err 79 } 80 v := regexp.MustCompile("v([0-9]+(\\.[0-9]+)+)") 81 vString := v.FindStringSubmatch(string(b)) 82 if vString == nil { 83 return nil, fmt.Errorf("no iptables version found in string: %s", string(b)) 84 } 85 return go_version.NewVersion(vString[1]) 86 } 87 88 func runProg(prog string, args []string, quiet bool) error { 89 _, err := exec.WithTimeout(defaults.ExecTimeout, prog, args...).CombinedOutput(log, !quiet) 90 return err 91 } 92 93 func getFeedRule(name, args string) []string { 94 ruleTail := []string{"-m", "comment", "--comment", feederDescription + " " + name, "-j", name} 95 if args == "" { 96 return ruleTail 97 } 98 argsList, err := shellwords.Parse(args) 99 if err != nil { 100 log.WithError(err).WithField(logfields.Object, args).Fatal("Unable to parse rule into argument slice") 101 } 102 return append(argsList, ruleTail...) 103 } 104 105 // KernelHasNetfilter probes whether iptables related modules are present in 106 // the kernel and returns true if indeed the case, else false. 107 func KernelHasNetfilter() bool { 108 modulesManager := &modules.ModulesManager{} 109 if err := modulesManager.Init(); err != nil { 110 return true 111 } 112 if found, _ := modulesManager.FindModules( 113 "ip_tables", "iptable_mangle", "iptable_raw", "iptable_filter"); found { 114 return true 115 } 116 if found, _ := modulesManager.FindModules( 117 "ip6_tables", "ip6table_mangle", "ip6table_raw", "ip6table_filter"); found { 118 return true 119 } 120 return false 121 } 122 123 func (c *customChain) add(waitArgs []string) error { 124 var err error 125 if option.Config.EnableIPv4 { 126 err = runProg("iptables", append(waitArgs, "-t", c.table, "-N", c.name), false) 127 } 128 if err == nil && option.Config.EnableIPv6 && c.ipv6 == true { 129 err = runProg("ip6tables", append(waitArgs, "-t", c.table, "-N", c.name), false) 130 } 131 return err 132 } 133 134 func reverseRule(rule string) ([]string, error) { 135 if strings.HasPrefix(rule, "-A") { 136 // From: -A POSTROUTING -m comment [...] 137 // To: -D POSTROUTING -m comment [...] 138 return shellwords.Parse(strings.Replace(rule, "-A", "-D", 1)) 139 } 140 141 if strings.HasPrefix(rule, "-I") { 142 // From: -I POSTROUTING -m comment [...] 143 // To: -D POSTROUTING -m comment [...] 144 return shellwords.Parse(strings.Replace(rule, "-I", "-D", 1)) 145 } 146 147 return []string{}, nil 148 } 149 150 func (m *IptablesManager) removeCiliumRules(table, prog, match string) { 151 args := append(m.waitArgs, "-t", table, "-S") 152 153 out, err := exec.WithTimeout(defaults.ExecTimeout, prog, args...).CombinedOutput(log, true) 154 if err != nil { 155 return 156 } 157 158 scanner := bufio.NewScanner(bytes.NewReader(out)) 159 for scanner.Scan() { 160 rule := scanner.Text() 161 log.WithField(logfields.Object, logfields.Repr(rule)).Debugf("Considering removing %s rule", prog) 162 if match != ciliumTransientForwardChain && strings.Contains(rule, ciliumTransientForwardChain) { 163 continue 164 } 165 166 // All rules installed by cilium either belong to a chain with 167 // the name CILIUM_ or call a chain with the name CILIUM_: 168 // -A CILIUM_FORWARD -o cilium_host -m comment --comment "cilium: any->cluster on cilium_host forward accept" -j ACCEPT 169 // -A POSTROUTING -m comment --comment "cilium-feeder: CILIUM_POST" -j CILIUM_POST 170 if strings.Contains(rule, match) { 171 reversedRule, err := reverseRule(rule) 172 if err != nil { 173 log.WithError(err).WithField(logfields.Object, rule).Warnf("Unable to parse %s rule into slice. Leaving rule behind.", prog) 174 continue 175 } 176 177 if len(reversedRule) > 0 { 178 deleteRule := append(append(m.waitArgs, "-t", table), reversedRule...) 179 log.WithField(logfields.Object, logfields.Repr(deleteRule)).Debugf("Removing %s rule", prog) 180 err = runProg(prog, deleteRule, true) 181 if err != nil { 182 log.WithError(err).WithField(logfields.Object, rule).Warnf("Unable to delete Cilium %s rule", prog) 183 } 184 } 185 } 186 } 187 } 188 189 func (c *customChain) remove(waitArgs []string, quiet bool) { 190 if option.Config.EnableIPv4 { 191 prog := "iptables" 192 args := append(waitArgs, "-t", c.table, "-F", c.name) 193 err := runProg(prog, args, true) 194 if err != nil && !quiet { 195 log.WithError(err).WithField(logfields.Object, args).Warnf("Unable to flush Cilium %s chain", prog) 196 } 197 198 args = append(waitArgs, "-t", c.table, "-X", c.name) 199 err = runProg(prog, args, true) 200 if err != nil && !quiet { 201 log.WithError(err).WithField(logfields.Object, args).Warnf("Unable to delete Cilium %s chain", prog) 202 } 203 } 204 if option.Config.EnableIPv6 && c.ipv6 == true { 205 prog := "ip6tables" 206 args := append(waitArgs, "-t", c.table, "-F", c.name) 207 err := runProg(prog, args, true) 208 if err != nil && !quiet { 209 log.WithError(err).WithField(logfields.Object, args).Warnf("Unable to flush Cilium %s chain", prog) 210 } 211 212 args = append(waitArgs, "-t", c.table, "-X", c.name) 213 err = runProg(prog, args, true) 214 if err != nil && !quiet { 215 log.WithError(err).WithField(logfields.Object, args).Warnf("Unable to delete Cilium %s chain", prog) 216 } 217 } 218 } 219 220 func (c *customChain) installFeeder(waitArgs []string) error { 221 installMode := "-A" 222 if option.Config.PrependIptablesChains { 223 installMode = "-I" 224 } 225 226 for _, feedArgs := range c.feederArgs { 227 if option.Config.EnableIPv4 { 228 err := runProg("iptables", append(append(waitArgs, "-t", c.table, installMode, c.hook), getFeedRule(c.name, feedArgs)...), true) 229 if err != nil { 230 return err 231 } 232 } 233 if option.Config.EnableIPv6 && c.ipv6 == true { 234 err := runProg("ip6tables", append(append(waitArgs, "-t", c.table, installMode, c.hook), getFeedRule(c.name, feedArgs)...), true) 235 if err != nil { 236 return err 237 } 238 } 239 } 240 return nil 241 } 242 243 // ciliumChains is the list of custom iptables chain used by Cilium. Custom 244 // chains are used to allow for simple replacements of all rules. 245 // 246 // WARNING: If you change or remove any of the feeder rules you have to ensure 247 // that the old feeder rules is also removed on agent start, otherwise, 248 // flushing and removing the custom chains will fail. 249 var ciliumChains = []customChain{ 250 { 251 name: ciliumInputChain, 252 table: "filter", 253 hook: "INPUT", 254 feederArgs: []string{""}, 255 ipv6: true, 256 }, 257 { 258 name: ciliumOutputChain, 259 table: "filter", 260 hook: "OUTPUT", 261 feederArgs: []string{""}, 262 ipv6: true, 263 }, 264 { 265 name: ciliumOutputRawChain, 266 table: "raw", 267 hook: "OUTPUT", 268 feederArgs: []string{""}, 269 ipv6: true, 270 }, 271 { 272 name: ciliumPostNatChain, 273 table: "nat", 274 hook: "POSTROUTING", 275 feederArgs: []string{""}, 276 }, 277 { 278 name: ciliumOutputNatChain, 279 table: "nat", 280 hook: "OUTPUT", 281 feederArgs: []string{""}, 282 }, 283 { 284 name: ciliumPreNatChain, 285 table: "nat", 286 hook: "PREROUTING", 287 feederArgs: []string{""}, 288 }, 289 { 290 name: ciliumPostMangleChain, 291 table: "mangle", 292 hook: "POSTROUTING", 293 feederArgs: []string{""}, 294 }, 295 { 296 name: ciliumPreMangleChain, 297 table: "mangle", 298 hook: "PREROUTING", 299 feederArgs: []string{""}, 300 ipv6: true, 301 }, 302 { 303 name: ciliumPreRawChain, 304 table: "raw", 305 hook: "PREROUTING", 306 feederArgs: []string{""}, 307 ipv6: true, 308 }, 309 { 310 name: ciliumForwardChain, 311 table: "filter", 312 hook: "FORWARD", 313 feederArgs: []string{""}, 314 }, 315 } 316 317 var transientChain = customChain{ 318 name: ciliumTransientForwardChain, 319 table: "filter", 320 hook: "FORWARD", 321 feederArgs: []string{""}, 322 } 323 324 // IptablesManager manages the iptables-related configuration for Cilium. 325 type IptablesManager struct { 326 haveIp6tables bool 327 haveSocketMatch bool 328 ipEarlyDemuxDisabled bool 329 waitArgs []string 330 } 331 332 // Init initializes the iptables manager and checks for iptables kernel modules 333 // availability. 334 func (m *IptablesManager) Init() { 335 modulesManager := &modules.ModulesManager{} 336 ip6tables := true 337 if err := modulesManager.Init(); err != nil { 338 log.WithError(err).Fatal( 339 "Unable to get information about kernel modules") 340 } 341 if err := modulesManager.FindOrLoadModules( 342 "ip_tables", "iptable_nat", "iptable_mangle", "iptable_raw", 343 "iptable_filter"); err != nil { 344 log.WithError(err).Warning( 345 "iptables modules could not be initialized. It probably means that iptables is not available on this system") 346 } 347 if err := modulesManager.FindOrLoadModules( 348 "ip6_tables", "ip6table_mangle", "ip6table_raw", "ip6table_filter"); err != nil { 349 if option.Config.EnableIPv6 { 350 log.WithError(err).Warning( 351 "IPv6 is enabled and ip6tables modules could not be initialized") 352 } 353 log.WithError(err).Debug( 354 "ip6tables kernel modules could not be loaded, so IPv6 cannot be used") 355 ip6tables = false 356 } 357 m.haveIp6tables = ip6tables 358 359 if err := modulesManager.FindOrLoadModules("xt_socket"); err != nil { 360 if option.Config.Tunnel == option.TunnelDisabled { 361 // xt_socket module is needed to circumvent an explicit drop in ip_forward() 362 // logic for packets for which a local socket is found by ip early 363 // demux. xt_socket performs a local socket match and sets an skb mark on 364 // match, which will divert the packet to the local stack using our policy 365 // routing rule, thus avoiding being processed by ip_forward() at all. 366 // 367 // If xt_socket module does not exist we can disable ip early demux to to 368 // avoid the explicit drop in ip_forward(). This is not needed in tunneling 369 // modes, as then we'll set the skb mark in the bpf logic before the policy 370 // routing stage so that the packet is routed locally instead of being 371 // forwarded by ip_forward(). 372 // 373 // We would not need the xt_socket at all if the datapath universally would 374 // set the "to proxy" skb mark bits on before the packet hits policy routing 375 // stage. Currently this is not true for endpoint routing modes. 376 log.WithError(err).Warning("xt_socket kernel module could not be loaded") 377 378 if option.Config.EnableXTSocketFallback { 379 v4disabled := true 380 v6disabled := true 381 if option.Config.EnableIPv4 { 382 v4disabled = sysctl.Disable("net.ipv4.ip_early_demux") == nil 383 } 384 if option.Config.EnableIPv6 { 385 v6disabled = sysctl.Disable("net.ipv6.ip_early_demux") == nil 386 } 387 if v4disabled && v6disabled { 388 m.ipEarlyDemuxDisabled = true 389 log.Warning("Disabled ip_early_demux to allow proxy redirection with original source/destination address without xt_socket support also in non-tunneled datapath modes.") 390 } else { 391 log.WithError(err).Warning("Could not disable ip_early_demux, traffic redirected due to an HTTP policy or visibility may be dropped unexpectedly") 392 } 393 } 394 } 395 } else { 396 m.haveSocketMatch = true 397 } 398 399 v, err := getVersion("iptables") 400 if err == nil { 401 switch { 402 case waitSecondsMinVersion.Check(v): 403 m.waitArgs = []string{waitString, fmt.Sprintf("%d", option.Config.IPTablesLockTimeout/time.Second)} 404 if option.Config.IPTablesLockWaitInterval != "" { 405 m.waitArgs = append(m.waitArgs, "-W", option.Config.IPTablesLockWaitInterval) 406 } 407 case waitMinVersion.Check(v): 408 m.waitArgs = []string{waitString} 409 } 410 } 411 } 412 413 // SupportsOriginalSourceAddr tells if an L7 proxy can use POD's original source address and port in 414 // the upstream connection to allow the destination to properly derive the source security ID from 415 // the source IP address. 416 func (m *IptablesManager) SupportsOriginalSourceAddr() bool { 417 // Original source address use works if xt_socket match is supported, or if ip early demux 418 // is disabled, or if the datapath is in a tunneling mode. 419 return m.haveSocketMatch || m.ipEarlyDemuxDisabled || option.Config.Tunnel != option.TunnelDisabled 420 } 421 422 // RemoveRules removes iptables rules installed by Cilium. 423 func (m *IptablesManager) RemoveRules() { 424 // Set of tables that have had iptables rules in any Cilium version 425 tables := []string{"nat", "mangle", "raw", "filter"} 426 for _, t := range tables { 427 m.removeCiliumRules(t, "iptables", ciliumPrefix) 428 } 429 430 // Set of tables that have had ip6tables rules in any Cilium version 431 if m.haveIp6tables { 432 tables6 := []string{"mangle", "raw", "filter"} 433 for _, t := range tables6 { 434 m.removeCiliumRules(t, "ip6tables", ciliumPrefix) 435 } 436 } 437 438 for _, c := range ciliumChains { 439 c.remove(m.waitArgs, false) 440 } 441 } 442 443 func (m *IptablesManager) ingressProxyRule(cmd, l4Match, markMatch, mark, port, name string) []string { 444 return append(m.waitArgs, 445 "-t", "mangle", 446 cmd, ciliumPreMangleChain, 447 "-p", l4Match, 448 "-m", "mark", "--mark", markMatch, 449 "-m", "comment", "--comment", "cilium: TPROXY to host "+name+" proxy", 450 "-j", "TPROXY", 451 "--tproxy-mark", mark, 452 "--on-port", port) 453 } 454 455 func (m *IptablesManager) inboundProxyRedirectRule(cmd string) []string { 456 // Mark host proxy transparent connections to be routed to the local stack. 457 // This comes before the TPROXY rules in the chain, and setting the mark 458 // without the proxy port number will make the TPROXY rule to not match, 459 // as we do not want to try to tproxy packets that are going to the stack 460 // already. 461 // This rule is needed for couple of reasons: 462 // 1. route return traffic to the proxy 463 // 2. route original direction traffic that would otherwise be intercepted 464 // by ip_early_demux 465 toProxyMark := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy) 466 return append(m.waitArgs, 467 "-t", "mangle", 468 cmd, ciliumPreMangleChain, 469 "-m", "socket", "--transparent", "--nowildcard", 470 "-m", "comment", "--comment", "cilium: any->pod redirect proxied traffic to host proxy", 471 "-j", "MARK", 472 "--set-mark", toProxyMark) 473 } 474 475 func (m *IptablesManager) iptIngressProxyRule(cmd string, l4proto string, proxyPort uint16, name string) error { 476 // Match 477 port := uint32(byteorder.HostToNetwork(proxyPort).(uint16)) << 16 478 ingressMarkMatch := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy|port) 479 // TPROXY params 480 ingressProxyMark := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy) 481 ingressProxyPort := fmt.Sprintf("%d", proxyPort) 482 483 var err error 484 if option.Config.EnableIPv4 { 485 err = runProg("iptables", 486 m.ingressProxyRule(cmd, l4proto, ingressMarkMatch, 487 ingressProxyMark, ingressProxyPort, name), 488 false) 489 } 490 if err == nil && option.Config.EnableIPv6 { 491 err = runProg("ip6tables", 492 m.ingressProxyRule(cmd, l4proto, ingressMarkMatch, 493 ingressProxyMark, ingressProxyPort, name), 494 false) 495 } 496 return err 497 } 498 499 func (m *IptablesManager) egressProxyRule(cmd, l4Match, markMatch, mark, port, name string) []string { 500 return append(m.waitArgs, 501 "-t", "mangle", 502 cmd, ciliumPreMangleChain, 503 "-p", l4Match, 504 "-m", "mark", "--mark", markMatch, 505 "-m", "comment", "--comment", "cilium: TPROXY to host "+name+" proxy", 506 "-j", "TPROXY", 507 "--tproxy-mark", mark, 508 "--on-port", port) 509 } 510 511 func (m *IptablesManager) iptEgressProxyRule(cmd string, l4proto string, proxyPort uint16, name string) error { 512 // Match 513 port := uint32(byteorder.HostToNetwork(proxyPort).(uint16)) << 16 514 egressMarkMatch := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy|port) 515 // TPROXY params 516 egressProxyMark := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy) 517 egressProxyPort := fmt.Sprintf("%d", proxyPort) 518 519 var err error 520 if option.Config.EnableIPv4 { 521 err = runProg("iptables", 522 m.egressProxyRule(cmd, l4proto, egressMarkMatch, 523 egressProxyMark, egressProxyPort, name), 524 false) 525 } 526 if err == nil && option.Config.EnableIPv6 { 527 err = runProg("ip6tables", 528 m.egressProxyRule(cmd, l4proto, egressMarkMatch, 529 egressProxyMark, egressProxyPort, name), 530 false) 531 } 532 return err 533 } 534 535 func (m *IptablesManager) installStaticProxyRules() error { 536 // match traffic to a proxy (upper 16 bits has the proxy port, which is masked out) 537 matchToProxy := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsToProxy, linux_defaults.MagicMarkHostMask) 538 // proxy return traffic has 0 ID in the mask 539 matchProxyReply := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxy, linux_defaults.MagicMarkProxyNoIDMask) 540 541 var err error 542 if option.Config.EnableIPv4 { 543 // No conntrack for traffic to proxy 544 err = runProg("iptables", append( 545 m.waitArgs, 546 "-t", "raw", 547 "-A", ciliumPreRawChain, 548 // Destination is a local node POD address 549 "!", "-d", node.GetInternalIPv4().String(), 550 "-m", "mark", "--mark", matchToProxy, 551 "-m", "comment", "--comment", "cilium: NOTRACK for proxy traffic", 552 "-j", "NOTRACK"), false) 553 if err == nil { 554 // Explicit ACCEPT for the proxy traffic. Needed when the INPUT defaults to DROP. 555 // Matching needs to be the same as for the NOTRACK rule above. 556 err = runProg("iptables", append( 557 m.waitArgs, 558 "-t", "filter", 559 "-A", ciliumInputChain, 560 // Destination is a local node POD address 561 "!", "-d", node.GetInternalIPv4().String(), 562 "-m", "mark", "--mark", matchToProxy, 563 "-m", "comment", "--comment", "cilium: ACCEPT for proxy traffic", 564 "-j", "ACCEPT"), false) 565 } 566 if err == nil { 567 // No conntrack for proxy return traffic 568 err = runProg("iptables", append( 569 m.waitArgs, 570 "-t", "raw", 571 "-A", ciliumOutputRawChain, 572 // Return traffic is from a local node POD address 573 "!", "-s", node.GetInternalIPv4().String(), 574 "-m", "mark", "--mark", matchProxyReply, 575 "-m", "comment", "--comment", "cilium: NOTRACK for proxy return traffic", 576 "-j", "NOTRACK"), false) 577 } 578 if err == nil { 579 // Explicit ACCEPT for the proxy return traffic. Needed when the OUTPUT defaults to DROP. 580 // Matching needs to be the same as for the NOTRACK rule above. 581 err = runProg("iptables", append( 582 m.waitArgs, 583 "-t", "filter", 584 "-A", ciliumOutputChain, 585 // Return traffic is from a local node POD address 586 "!", "-s", node.GetInternalIPv4().String(), 587 "-m", "mark", "--mark", matchProxyReply, 588 "-m", "comment", "--comment", "cilium: ACCEPT for proxy return traffic", 589 "-j", "ACCEPT"), false) 590 } 591 if err == nil && m.haveSocketMatch { 592 // Direct inbound TPROXYed traffic towards the socket 593 err = runProg("iptables", m.inboundProxyRedirectRule("-A"), false) 594 } 595 } 596 if err == nil && option.Config.EnableIPv6 { 597 // No conntrack for traffic to ingress proxy 598 err = runProg("ip6tables", append( 599 m.waitArgs, 600 "-t", "raw", 601 "-A", ciliumPreRawChain, 602 // Destination is a local node POD address 603 "!", "-d", node.GetIPv6().String(), 604 "-m", "mark", "--mark", matchToProxy, 605 "-m", "comment", "--comment", "cilium: NOTRACK for proxy traffic", 606 "-j", "NOTRACK"), false) 607 if err == nil { 608 // Explicit ACCEPT for the proxy traffic. Needed when the INPUT defaults to DROP. 609 // Matching needs to be the same as for the NOTRACK rule above. 610 err = runProg("ip6tables", append( 611 m.waitArgs, 612 "-t", "filter", 613 "-A", ciliumInputChain, 614 // Destination is a local node POD address 615 "!", "-d", node.GetIPv6().String(), 616 "-m", "mark", "--mark", matchToProxy, 617 "-m", "comment", "--comment", "cilium: ACCEPT for proxy traffic", 618 "-j", "ACCEPT"), false) 619 } 620 if err == nil { 621 // No conntrack for proxy return traffic 622 err = runProg("ip6tables", append( 623 m.waitArgs, 624 "-t", "raw", 625 "-A", ciliumOutputRawChain, 626 // Return traffic is from a local node POD address 627 "!", "-s", node.GetIPv6().String(), 628 "-m", "mark", "--mark", matchProxyReply, 629 "-m", "comment", "--comment", "cilium: NOTRACK for proxy return traffic", 630 "-j", "NOTRACK"), false) 631 } 632 if err == nil { 633 // Explicit ACCEPT for the proxy return traffic. Needed when the OUTPUT defaults to DROP. 634 // Matching needs to be the same as for the NOTRACK rule above. 635 err = runProg("ip6tables", append( 636 m.waitArgs, 637 "-t", "filter", 638 "-A", ciliumOutputChain, 639 // Return traffic is from a local node POD address 640 "!", "-s", node.GetIPv6().String(), 641 "-m", "mark", "--mark", matchProxyReply, 642 "-m", "comment", "--comment", "cilium: ACCEPT for proxy return traffic", 643 "-j", "ACCEPT"), false) 644 } 645 if err == nil && m.haveSocketMatch { 646 // Direct inbound TPROXYed traffic towards the socket 647 err = runProg("ip6tables", m.inboundProxyRedirectRule("-A"), false) 648 } 649 } 650 return err 651 } 652 653 // install or remove rules for a single proxy port 654 func (m *IptablesManager) iptProxyRules(cmd string, proxyPort uint16, ingress bool, name string) error { 655 // Redirect packets to the host proxy via TPROXY, as directed by the Cilium 656 // datapath bpf programs via skb marks (egress) or DSCP (ingress). 657 if ingress { 658 if err := m.iptIngressProxyRule(cmd, "tcp", proxyPort, name); err != nil { 659 return err 660 } 661 if err := m.iptIngressProxyRule(cmd, "udp", proxyPort, name); err != nil { 662 return err 663 } 664 } else { 665 if err := m.iptEgressProxyRule(cmd, "tcp", proxyPort, name); err != nil { 666 return err 667 } 668 if err := m.iptEgressProxyRule(cmd, "udp", proxyPort, name); err != nil { 669 return err 670 } 671 } 672 return nil 673 } 674 675 func (m *IptablesManager) InstallProxyRules(proxyPort uint16, ingress bool, name string) error { 676 return m.iptProxyRules("-A", proxyPort, ingress, name) 677 } 678 679 func (m *IptablesManager) RemoveProxyRules(proxyPort uint16, ingress bool, name string) error { 680 return m.iptProxyRules("-D", proxyPort, ingress, name) 681 } 682 683 func (m *IptablesManager) remoteSnatDstAddrExclusion() string { 684 switch { 685 case option.Config.IPv4NativeRoutingCIDR() != nil: 686 return option.Config.IPv4NativeRoutingCIDR().String() 687 688 case option.Config.Tunnel == option.TunnelDisabled: 689 return node.GetIPv4ClusterRange().String() 690 691 default: 692 return node.GetIPv4AllocRange().String() 693 } 694 } 695 696 func getDeliveryInterface(ifName string) string { 697 deliveryInterface := ifName 698 if option.Config.IPAM == option.IPAMENI || option.Config.EnableEndpointRoutes { 699 deliveryInterface = "lxc+" 700 } 701 return deliveryInterface 702 } 703 704 // TransientRulesStart installs iptables rules for Cilium that need to be 705 // kept in-tact during agent restart which removes/installs its main rules. 706 // Transient rules are then removed once iptables rule update cycle has 707 // completed. This is mainly due to interactions with kube-proxy. 708 func (m *IptablesManager) TransientRulesStart(ifName string) error { 709 if option.Config.EnableIPv4 { 710 localDeliveryInterface := getDeliveryInterface(ifName) 711 712 m.TransientRulesEnd(true) 713 714 if err := transientChain.add(m.waitArgs); err != nil { 715 return fmt.Errorf("cannot add custom chain %s: %s", transientChain.name, err) 716 } 717 // While kube-proxy does change the policy of the iptables FORWARD chain 718 // it doesn't seem to handle all cases, e.g. host network pods that use 719 // the node IP which would still end up in default DENY. Similarly, for 720 // plain Docker setup, we would otherwise hit default DENY in FORWARD chain. 721 // Also, k8s 1.15 introduced "-m conntrack --ctstate INVALID -j DROP" which 722 // in the direct routing case can drop EP replies. 723 // 724 // Therefore, add three rules below to avoid having a user to manually opt-in. 725 // See also: https://github.com/kubernetes/kubernetes/issues/39823 726 // In here can only be basic ACCEPT rules, nothing more complicated. 727 // 728 // The second rule is for the case of nodeport traffic where the backend is 729 // remote. The traffic flow in FORWARD is as follows: 730 // 731 // - Node serving nodeport request: 732 // IN=eno1 OUT=cilium_host 733 // IN=cilium_host OUT=eno1 734 // 735 // - Node running backend: 736 // IN=eno1 OUT=cilium_host 737 // IN=lxc... OUT=eno1 738 if err := runProg("iptables", append( 739 m.waitArgs, 740 "-A", ciliumTransientForwardChain, 741 "-o", localDeliveryInterface, 742 "-m", "comment", "--comment", "cilium (transient): any->cluster on "+localDeliveryInterface+" forward accept", 743 "-j", "ACCEPT"), false); err != nil { 744 return err 745 } 746 if err := runProg("iptables", append( 747 m.waitArgs, 748 "-A", ciliumTransientForwardChain, 749 "-i", localDeliveryInterface, 750 "-m", "comment", "--comment", "cilium (transient): cluster->any on "+localDeliveryInterface+" forward accept (nodeport)", 751 "-j", "ACCEPT"), false); err != nil { 752 return err 753 } 754 if err := runProg("iptables", append( 755 m.waitArgs, 756 "-A", ciliumTransientForwardChain, 757 "-i", "lxc+", 758 "-m", "comment", "--comment", "cilium (transient): cluster->any on lxc+ forward accept", 759 "-j", "ACCEPT"), false); err != nil { 760 return err 761 } 762 if err := transientChain.installFeeder(m.waitArgs); err != nil { 763 return fmt.Errorf("cannot install feeder rule %s: %s", transientChain.feederArgs, err) 764 } 765 } 766 return nil 767 } 768 769 // TransientRulesEnd removes Cilium related rules installed from TransientRulesStart. 770 func (m *IptablesManager) TransientRulesEnd(quiet bool) { 771 if option.Config.EnableIPv4 { 772 m.removeCiliumRules("filter", "iptables", ciliumTransientForwardChain) 773 transientChain.remove(m.waitArgs, quiet) 774 } 775 } 776 777 // InstallRules installs iptables rules for Cilium in specific use-cases 778 // (most specifically, interaction with kube-proxy). 779 func (m *IptablesManager) InstallRules(ifName string) error { 780 matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask) 781 matchFromIPSecDecrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask) 782 localDeliveryInterface := getDeliveryInterface(ifName) 783 784 for _, c := range ciliumChains { 785 if err := c.add(m.waitArgs); err != nil { 786 return fmt.Errorf("cannot add custom chain %s: %s", c.name, err) 787 } 788 } 789 790 if err := m.installStaticProxyRules(); err != nil { 791 return fmt.Errorf("cannot add static proxy rules: %s", err) 792 } 793 794 if err := m.addCiliumAcceptXfrmRules(); err != nil { 795 return err 796 } 797 798 if option.Config.EnableIPv4 { 799 // See kube-proxy comment in TransientRules(). 800 if err := runProg("iptables", append( 801 m.waitArgs, 802 "-A", ciliumForwardChain, 803 "-o", localDeliveryInterface, 804 "-m", "comment", "--comment", "cilium: any->cluster on "+localDeliveryInterface+" forward accept", 805 "-j", "ACCEPT"), false); err != nil { 806 return err 807 } 808 if err := runProg("iptables", append( 809 m.waitArgs, 810 "-A", ciliumForwardChain, 811 "-i", localDeliveryInterface, 812 "-m", "comment", "--comment", "cilium: cluster->any on "+localDeliveryInterface+" forward accept (nodeport)", 813 "-j", "ACCEPT"), false); err != nil { 814 return err 815 } 816 if err := runProg("iptables", append( 817 m.waitArgs, 818 "-A", ciliumForwardChain, 819 "-i", "lxc+", 820 "-m", "comment", "--comment", "cilium: cluster->any on lxc+ forward accept", 821 "-j", "ACCEPT"), false); err != nil { 822 return err 823 } 824 825 // Mark all packets sourced from processes running on the host with a 826 // special marker so that we can differentiate traffic sourced locally 827 // vs. traffic from the outside world that was masqueraded to appear 828 // like it's from the host. 829 // 830 // Originally we set this mark only for traffic destined to the 831 // ifName device, to ensure that any traffic directly reaching 832 // to a Cilium-managed IP could be classified as from the host. 833 // 834 // However, there's another case where a local process attempts to 835 // reach a service IP which is backed by a Cilium-managed pod. The 836 // service implementation is outside of Cilium's control, for example, 837 // handled by kube-proxy. We can tag even this traffic with a magic 838 // mark, then when the service implementation proxies it back into 839 // Cilium the BPF will see this mark and understand that the packet 840 // originated from the host. 841 matchFromProxy := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxy, linux_defaults.MagicMarkProxyMask) 842 markAsFromHost := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkHost, linux_defaults.MagicMarkHostMask) 843 if err := runProg("iptables", append( 844 m.waitArgs, 845 "-t", "filter", 846 "-A", ciliumOutputChain, 847 "-m", "mark", "!", "--mark", matchFromIPSecDecrypt, // Don't match ipsec traffic 848 "-m", "mark", "!", "--mark", matchFromIPSecEncrypt, // Don't match ipsec traffic 849 "-m", "mark", "!", "--mark", matchFromProxy, // Don't match proxy traffic 850 "-m", "comment", "--comment", "cilium: host->any mark as from host", 851 "-j", "MARK", "--set-xmark", markAsFromHost), false); err != nil { 852 return err 853 } 854 855 if option.Config.Masquerade { 856 // Masquerade all egress traffic leaving the node 857 // 858 // This rule must be first as it has different exclusion criteria 859 // than the other rules in this table. 860 // 861 // The following conditions must be met: 862 // * May not leave on a cilium_ interface, this excludes all 863 // tunnel traffic 864 // * Must originate from an IP in the local allocation range 865 // * Must not be reply if BPF NodePort is enabled 866 // * Tunnel mode: 867 // * May not be targeted to an IP in the local allocation 868 // range 869 // * Non-tunnel mode: 870 // * May not be targeted to an IP in the cluster range 871 if option.Config.EgressMasqueradeInterfaces != "" { 872 if err := runProg("iptables", append( 873 m.waitArgs, 874 "-t", "nat", 875 "-A", ciliumPostNatChain, 876 "!", "-d", m.remoteSnatDstAddrExclusion(), 877 "-o", option.Config.EgressMasqueradeInterfaces, 878 "-m", "comment", "--comment", "cilium masquerade non-cluster", 879 "-j", "MASQUERADE"), false); err != nil { 880 return err 881 } 882 } else { 883 if err := runProg("iptables", append( 884 m.waitArgs, 885 "-t", "nat", 886 "-A", ciliumPostNatChain, 887 "-s", node.GetIPv4AllocRange().String(), 888 "!", "-d", m.remoteSnatDstAddrExclusion(), 889 "!", "-o", "cilium_+", 890 "-m", "comment", "--comment", "cilium masquerade non-cluster", 891 "-j", "MASQUERADE"), false); err != nil { 892 return err 893 } 894 } 895 896 // The following rules exclude traffic from the remaining rules in this chain. 897 // If any of these rules match, none of the remaining rules in this chain 898 // are considered. 899 // Exclude traffic for other than interface from the masquarade rules. 900 // RETURN fro the chain as it is possible that other rules need to be matched. 901 if err := runProg("iptables", append( 902 m.waitArgs, 903 "-t", "nat", 904 "-A", ciliumPostNatChain, 905 "!", "-o", localDeliveryInterface, 906 "-m", "comment", "--comment", "exclude non-"+ifName+" traffic from masquerade", 907 "-j", "RETURN"), false); err != nil { 908 return err 909 } 910 911 // Exclude proxy return traffic from the masquarade rules 912 if err := runProg("iptables", append( 913 m.waitArgs, 914 "-t", "nat", 915 "-A", ciliumPostNatChain, 916 "-m", "mark", "--mark", matchFromProxy, // Don't match proxy (return) traffic 917 "-m", "comment", "--comment", "exclude proxy return traffic from masquarade", 918 "-j", "ACCEPT"), false); err != nil { 919 return err 920 } 921 922 if option.Config.Tunnel != option.TunnelDisabled { 923 // Masquerade all traffic from the host into the ifName 924 // interface if the source is not the internal IP 925 // 926 // The following conditions must be met: 927 // * Must be targeted for the ifName interface 928 // * Must be targeted to an IP that is not local 929 // * Tunnel mode: 930 // * May not already be originating from the masquerade IP 931 // * Non-tunnel mode: 932 // * May not orignate from any IP inside of the cluster range 933 if err := runProg("iptables", append( 934 m.waitArgs, 935 "-t", "nat", 936 "-A", ciliumPostNatChain, 937 "!", "-s", node.GetHostMasqueradeIPv4().String(), 938 "!", "-d", node.GetIPv4AllocRange().String(), 939 "-o", "cilium_host", 940 "-m", "comment", "--comment", "cilium host->cluster masquerade", 941 "-j", "SNAT", "--to-source", node.GetHostMasqueradeIPv4().String()), false); err != nil { 942 return err 943 } 944 } 945 946 // Masquerade all traffic from the host into local 947 // endpoints if the source is 127.0.0.1. This is 948 // required to force replies out of the endpoint's 949 // network namespace. 950 // 951 // The following conditions must be met: 952 // * Must be targeted for local endpoint 953 // * Must be from 127.0.0.1 954 if err := runProg("iptables", append( 955 m.waitArgs, 956 "-t", "nat", 957 "-A", ciliumPostNatChain, 958 "-s", "127.0.0.1", 959 "-o", localDeliveryInterface, 960 "-m", "comment", "--comment", "cilium host->cluster from 127.0.0.1 masquerade", 961 "-j", "SNAT", "--to-source", node.GetHostMasqueradeIPv4().String()), false); err != nil { 962 return err 963 } 964 965 // Masquerade all traffic that originated from a local 966 // pod and thus carries a security identity and that 967 // was also DNAT'ed. It must be masqueraded to ensure 968 // that reverse NAT can be performed. Otherwise the 969 // reply traffic would be sent directly to the pod 970 // without traversing the Linux stack again. 971 // 972 // This is only done if EnableEndpointRoutes is 973 // disabled, if EnableEndpointRoutes is enabled, then 974 // all traffic always passes through the stack anyway. 975 // 976 // This is required for: 977 // - portmap/host if both source and destination are 978 // on the same node 979 // - kiam if source and server are on the same node 980 if !option.Config.EnableEndpointRoutes { 981 if err := runProg("iptables", append( 982 m.waitArgs, 983 "-t", "nat", 984 "-A", ciliumPostNatChain, 985 "-m", "mark", "--mark", fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIdentity, linux_defaults.MagicMarkHostMask), 986 "-o", localDeliveryInterface, 987 "-m", "conntrack", "--ctstate", "DNAT", 988 "-m", "comment", "--comment", "hairpin traffic that originated from a local pod", 989 "-j", "SNAT", "--to-source", node.GetHostMasqueradeIPv4().String()), false); err != nil { 990 return err 991 } 992 } 993 } 994 } 995 996 if option.Config.EnableIPSec { 997 if err := m.addCiliumNoTrackXfrmRules(); err != nil { 998 return fmt.Errorf("cannot install xfrm rules: %s", err) 999 } 1000 } 1001 1002 for _, c := range ciliumChains { 1003 if err := c.installFeeder(m.waitArgs); err != nil { 1004 return fmt.Errorf("cannot install feeder rule %s: %s", c.feederArgs, err) 1005 } 1006 } 1007 1008 return nil 1009 } 1010 1011 func (m *IptablesManager) ciliumNoTrackXfrmRules(prog, input string) error { 1012 matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask) 1013 matchFromIPSecDecrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask) 1014 1015 if err := runProg(prog, append( 1016 m.waitArgs, 1017 "-t", "raw", input, ciliumPreRawChain, 1018 "-m", "mark", "--mark", matchFromIPSecDecrypt, 1019 "-m", "comment", "--comment", xfrmDescription, 1020 "-j", "NOTRACK"), false); err != nil { 1021 return err 1022 } 1023 if err := runProg(prog, append( 1024 m.waitArgs, 1025 "-t", "raw", input, ciliumPreRawChain, 1026 "-m", "mark", "--mark", matchFromIPSecEncrypt, 1027 "-m", "comment", "--comment", xfrmDescription, 1028 "-j", "NOTRACK"), false); err != nil { 1029 return err 1030 } 1031 return nil 1032 } 1033 1034 // Exclude crypto traffic from the filter and nat table rules. 1035 // This avoids encryption bits and keyID, 0x*d00 for decryption 1036 // and 0x*e00 for encryption, colliding with existing rules. Needed 1037 // for kube-proxy for example. 1038 func (m *IptablesManager) addCiliumAcceptXfrmRules() error { 1039 if option.Config.EnableIPSec == false { 1040 return nil 1041 } 1042 insertAcceptXfrm := func(table, chain string) error { 1043 matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask) 1044 matchFromIPSecDecrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask) 1045 1046 comment := "exclude xfrm marks from " + table + " " + chain + " chain" 1047 1048 if err := runProg("iptables", append( 1049 m.waitArgs, 1050 "-t", table, 1051 "-A", chain, 1052 "-m", "mark", "--mark", matchFromIPSecEncrypt, 1053 "-m", "comment", "--comment", comment, 1054 "-j", "ACCEPT"), false); err != nil { 1055 return err 1056 } 1057 1058 return runProg("iptables", append( 1059 m.waitArgs, 1060 "-t", table, 1061 "-A", chain, 1062 "-m", "mark", "--mark", matchFromIPSecDecrypt, 1063 "-m", "comment", "--comment", comment, 1064 "-j", "ACCEPT"), false) 1065 } 1066 if err := insertAcceptXfrm("filter", ciliumInputChain); err != nil { 1067 return err 1068 } 1069 if err := insertAcceptXfrm("filter", ciliumOutputChain); err != nil { 1070 return err 1071 } 1072 if err := insertAcceptXfrm("filter", ciliumForwardChain); err != nil { 1073 return err 1074 } 1075 if err := insertAcceptXfrm("nat", ciliumPostNatChain); err != nil { 1076 return err 1077 } 1078 if err := insertAcceptXfrm("nat", ciliumPreNatChain); err != nil { 1079 return err 1080 } 1081 if err := insertAcceptXfrm("nat", ciliumOutputNatChain); err != nil { 1082 return err 1083 } 1084 return nil 1085 } 1086 1087 func (m *IptablesManager) addCiliumNoTrackXfrmRules() error { 1088 if option.Config.EnableIPv4 { 1089 return m.ciliumNoTrackXfrmRules("iptables", "-I") 1090 } 1091 return nil 1092 }