github.com/elfadel/cilium@v1.6.12/pkg/datapath/iptables/iptables.go (about) 1 // Copyright 2016-2019 Authors of Cilium 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package iptables 16 17 import ( 18 "bufio" 19 "bytes" 20 "fmt" 21 "regexp" 22 "strings" 23 "time" 24 25 "github.com/cilium/cilium/pkg/byteorder" 26 "github.com/cilium/cilium/pkg/command/exec" 27 "github.com/cilium/cilium/pkg/datapath/linux/linux_defaults" 28 "github.com/cilium/cilium/pkg/defaults" 29 "github.com/cilium/cilium/pkg/logging/logfields" 30 "github.com/cilium/cilium/pkg/modules" 31 "github.com/cilium/cilium/pkg/node" 32 "github.com/cilium/cilium/pkg/option" 33 "github.com/cilium/cilium/pkg/sysctl" 34 "github.com/cilium/cilium/pkg/versioncheck" 35 36 go_version "github.com/hashicorp/go-version" 37 "github.com/mattn/go-shellwords" 38 ) 39 40 const ( 41 ciliumPrefix = "CILIUM_" 42 ciliumInputChain = "CILIUM_INPUT" 43 ciliumOutputChain = "CILIUM_OUTPUT" 44 ciliumOutputRawChain = "CILIUM_OUTPUT_raw" 45 ciliumPostNatChain = "CILIUM_POST_nat" 46 ciliumOutputNatChain = "CILIUM_OUTPUT_nat" 47 ciliumPreNatChain = "CILIUM_PRE_nat" 48 ciliumPostMangleChain = "CILIUM_POST_mangle" 49 ciliumPreMangleChain = "CILIUM_PRE_mangle" 50 ciliumPreRawChain = "CILIUM_PRE_raw" 51 ciliumForwardChain = "CILIUM_FORWARD" 52 ciliumTransientForwardChain = "CILIUM_TRANSIENT_FORWARD" 53 feederDescription = "cilium-feeder:" 54 xfrmDescription = "cilium-xfrm-notrack:" 55 ) 56 57 // Minimum 
iptables versions supporting the -w and -w<seconds> flags 58 var ( 59 waitMinVersion = versioncheck.MustCompile(">=v1.4.20") 60 waitSecondsMinVersion = versioncheck.MustCompile(">=v1.4.22") 61 ) 62 63 const ( 64 waitString = "-w" 65 ) 66 67 type customChain struct { 68 name string 69 table string 70 hook string 71 feederArgs []string 72 ipv6 bool // ip6tables chain in addition to iptables chain 73 } 74 75 func getVersion(prog string) (*go_version.Version, error) { 76 b, err := exec.WithTimeout(defaults.ExecTimeout, prog, "--version").CombinedOutput(log, false) 77 if err != nil { 78 return nil, err 79 } 80 v := regexp.MustCompile("v([0-9]+(\\.[0-9]+)+)") 81 vString := v.FindStringSubmatch(string(b)) 82 if vString == nil { 83 return nil, fmt.Errorf("no iptables version found in string: %s", string(b)) 84 } 85 return go_version.NewVersion(vString[1]) 86 } 87 88 func runProg(prog string, args []string, quiet bool) error { 89 _, err := exec.WithTimeout(defaults.ExecTimeout, prog, args...).CombinedOutput(log, !quiet) 90 return err 91 } 92 93 func getFeedRule(name, args string) []string { 94 ruleTail := []string{"-m", "comment", "--comment", feederDescription + " " + name, "-j", name} 95 if args == "" { 96 return ruleTail 97 } 98 argsList, err := shellwords.Parse(args) 99 if err != nil { 100 log.WithError(err).WithField(logfields.Object, args).Fatal("Unable to parse rule into argument slice") 101 } 102 return append(argsList, ruleTail...) 103 } 104 105 // KernelHasNetfilter probes whether iptables related modules are present in 106 // the kernel and returns true if indeed the case, else false. 
107 func KernelHasNetfilter() bool { 108 modulesManager := &modules.ModulesManager{} 109 if err := modulesManager.Init(); err != nil { 110 return true 111 } 112 if found, _ := modulesManager.FindModules( 113 "ip_tables", "iptable_mangle", "iptable_raw", "iptable_filter"); found { 114 return true 115 } 116 if found, _ := modulesManager.FindModules( 117 "ip6_tables", "ip6table_mangle", "ip6table_raw", "ip6table_filter"); found { 118 return true 119 } 120 return false 121 } 122 123 func (c *customChain) add(waitArgs []string) error { 124 var err error 125 if option.Config.EnableIPv4 { 126 err = runProg("iptables", append(waitArgs, "-t", c.table, "-N", c.name), false) 127 } 128 if err == nil && option.Config.EnableIPv6 && c.ipv6 == true { 129 err = runProg("ip6tables", append(waitArgs, "-t", c.table, "-N", c.name), false) 130 } 131 return err 132 } 133 134 func reverseRule(rule string) ([]string, error) { 135 if strings.HasPrefix(rule, "-A") { 136 // From: -A POSTROUTING -m comment [...] 137 // To: -D POSTROUTING -m comment [...] 138 return shellwords.Parse(strings.Replace(rule, "-A", "-D", 1)) 139 } 140 141 if strings.HasPrefix(rule, "-I") { 142 // From: -I POSTROUTING -m comment [...] 143 // To: -D POSTROUTING -m comment [...] 
144 return shellwords.Parse(strings.Replace(rule, "-I", "-D", 1)) 145 } 146 147 return []string{}, nil 148 } 149 150 func (m *IptablesManager) removeCiliumRules(table, prog, match string) { 151 args := append(m.waitArgs, "-t", table, "-S") 152 153 out, err := exec.WithTimeout(defaults.ExecTimeout, prog, args...).CombinedOutput(log, true) 154 if err != nil { 155 return 156 } 157 158 scanner := bufio.NewScanner(bytes.NewReader(out)) 159 for scanner.Scan() { 160 rule := scanner.Text() 161 log.WithField(logfields.Object, logfields.Repr(rule)).Debugf("Considering removing %s rule", prog) 162 if match != ciliumTransientForwardChain && strings.Contains(rule, ciliumTransientForwardChain) { 163 continue 164 } 165 166 // All rules installed by cilium either belong to a chain with 167 // the name CILIUM_ or call a chain with the name CILIUM_: 168 // -A CILIUM_FORWARD -o cilium_host -m comment --comment "cilium: any->cluster on cilium_host forward accept" -j ACCEPT 169 // -A POSTROUTING -m comment --comment "cilium-feeder: CILIUM_POST" -j CILIUM_POST 170 if strings.Contains(rule, match) { 171 reversedRule, err := reverseRule(rule) 172 if err != nil { 173 log.WithError(err).WithField(logfields.Object, rule).Warnf("Unable to parse %s rule into slice. Leaving rule behind.", prog) 174 continue 175 } 176 177 if len(reversedRule) > 0 { 178 deleteRule := append(append(m.waitArgs, "-t", table), reversedRule...) 
179 log.WithField(logfields.Object, logfields.Repr(deleteRule)).Debugf("Removing %s rule", prog) 180 err = runProg(prog, deleteRule, true) 181 if err != nil { 182 log.WithError(err).WithField(logfields.Object, rule).Warnf("Unable to delete Cilium %s rule", prog) 183 } 184 } 185 } 186 } 187 } 188 189 func (c *customChain) remove(waitArgs []string, quiet bool) { 190 if option.Config.EnableIPv4 { 191 prog := "iptables" 192 args := append(waitArgs, "-t", c.table, "-F", c.name) 193 err := runProg(prog, args, true) 194 if err != nil && !quiet { 195 log.WithError(err).WithField(logfields.Object, args).Warnf("Unable to flush Cilium %s chain", prog) 196 } 197 198 args = append(waitArgs, "-t", c.table, "-X", c.name) 199 err = runProg(prog, args, true) 200 if err != nil && !quiet { 201 log.WithError(err).WithField(logfields.Object, args).Warnf("Unable to delete Cilium %s chain", prog) 202 } 203 } 204 if option.Config.EnableIPv6 && c.ipv6 == true { 205 prog := "ip6tables" 206 args := append(waitArgs, "-t", c.table, "-F", c.name) 207 err := runProg(prog, args, true) 208 if err != nil && !quiet { 209 log.WithError(err).WithField(logfields.Object, args).Warnf("Unable to flush Cilium %s chain", prog) 210 } 211 212 args = append(waitArgs, "-t", c.table, "-X", c.name) 213 err = runProg(prog, args, true) 214 if err != nil && !quiet { 215 log.WithError(err).WithField(logfields.Object, args).Warnf("Unable to delete Cilium %s chain", prog) 216 } 217 } 218 } 219 220 func (c *customChain) installFeeder(waitArgs []string) error { 221 installMode := "-A" 222 if option.Config.PrependIptablesChains { 223 installMode = "-I" 224 } 225 226 for _, feedArgs := range c.feederArgs { 227 if option.Config.EnableIPv4 { 228 err := runProg("iptables", append(append(waitArgs, "-t", c.table, installMode, c.hook), getFeedRule(c.name, feedArgs)...), true) 229 if err != nil { 230 return err 231 } 232 } 233 if option.Config.EnableIPv6 && c.ipv6 == true { 234 err := runProg("ip6tables", append(append(waitArgs, 
"-t", c.table, installMode, c.hook), getFeedRule(c.name, feedArgs)...), true) 235 if err != nil { 236 return err 237 } 238 } 239 } 240 return nil 241 } 242 243 // ciliumChains is the list of custom iptables chain used by Cilium. Custom 244 // chains are used to allow for simple replacements of all rules. 245 // 246 // WARNING: If you change or remove any of the feeder rules you have to ensure 247 // that the old feeder rules is also removed on agent start, otherwise, 248 // flushing and removing the custom chains will fail. 249 var ciliumChains = []customChain{ 250 { 251 name: ciliumInputChain, 252 table: "filter", 253 hook: "INPUT", 254 feederArgs: []string{""}, 255 ipv6: true, 256 }, 257 { 258 name: ciliumOutputChain, 259 table: "filter", 260 hook: "OUTPUT", 261 feederArgs: []string{""}, 262 ipv6: true, 263 }, 264 { 265 name: ciliumOutputRawChain, 266 table: "raw", 267 hook: "OUTPUT", 268 feederArgs: []string{""}, 269 ipv6: true, 270 }, 271 { 272 name: ciliumPostNatChain, 273 table: "nat", 274 hook: "POSTROUTING", 275 feederArgs: []string{""}, 276 }, 277 { 278 name: ciliumOutputNatChain, 279 table: "nat", 280 hook: "OUTPUT", 281 feederArgs: []string{""}, 282 }, 283 { 284 name: ciliumPreNatChain, 285 table: "nat", 286 hook: "PREROUTING", 287 feederArgs: []string{""}, 288 }, 289 { 290 name: ciliumPostMangleChain, 291 table: "mangle", 292 hook: "POSTROUTING", 293 feederArgs: []string{""}, 294 }, 295 { 296 name: ciliumPreMangleChain, 297 table: "mangle", 298 hook: "PREROUTING", 299 feederArgs: []string{""}, 300 ipv6: true, 301 }, 302 { 303 name: ciliumPreRawChain, 304 table: "raw", 305 hook: "PREROUTING", 306 feederArgs: []string{""}, 307 ipv6: true, 308 }, 309 { 310 name: ciliumForwardChain, 311 table: "filter", 312 hook: "FORWARD", 313 feederArgs: []string{""}, 314 }, 315 } 316 317 var transientChain = customChain{ 318 name: ciliumTransientForwardChain, 319 table: "filter", 320 hook: "FORWARD", 321 feederArgs: []string{""}, 322 } 323 324 // IptablesManager manages 
the iptables-related configuration for Cilium. 325 type IptablesManager struct { 326 haveIp6tables bool 327 haveSocketMatch bool 328 ipEarlyDemuxDisabled bool 329 waitArgs []string 330 } 331 332 // Init initializes the iptables manager and checks for iptables kernel modules 333 // availability. 334 func (m *IptablesManager) Init() { 335 modulesManager := &modules.ModulesManager{} 336 ip6tables := true 337 if err := modulesManager.Init(); err != nil { 338 log.WithError(err).Fatal( 339 "Unable to get information about kernel modules") 340 } 341 if err := modulesManager.FindOrLoadModules( 342 "ip_tables", "iptable_nat", "iptable_mangle", "iptable_raw", 343 "iptable_filter"); err != nil { 344 log.WithError(err).Warning( 345 "iptables modules could not be initialized. It probably means that iptables is not available on this system") 346 } 347 if err := modulesManager.FindOrLoadModules( 348 "ip6_tables", "ip6table_mangle", "ip6table_raw", "ip6table_filter"); err != nil { 349 if option.Config.EnableIPv6 { 350 log.WithError(err).Warning( 351 "IPv6 is enabled and ip6tables modules could not be initialized") 352 } 353 log.WithError(err).Debug( 354 "ip6tables kernel modules could not be loaded, so IPv6 cannot be used") 355 ip6tables = false 356 } 357 m.haveIp6tables = ip6tables 358 359 if err := modulesManager.FindOrLoadModules("xt_socket"); err != nil { 360 if option.Config.Tunnel == option.TunnelDisabled { 361 // xt_socket module is needed to circumvent an explicit drop in ip_forward() 362 // logic for packets for which a local socket is found by ip early 363 // demux. xt_socket performs a local socket match and sets an skb mark on 364 // match, which will divert the packet to the local stack using our policy 365 // routing rule, thus avoiding being processed by ip_forward() at all. 366 // 367 // If xt_socket module does not exist we can disable ip early demux to to 368 // avoid the explicit drop in ip_forward(). 
This is not needed in tunneling 369 // modes, as then we'll set the skb mark in the bpf logic before the policy 370 // routing stage so that the packet is routed locally instead of being 371 // forwarded by ip_forward(). 372 // 373 // We would not need the xt_socket at all if the datapath universally would 374 // set the "to proxy" skb mark bits on before the packet hits policy routing 375 // stage. Currently this is not true for endpoint routing modes. 376 log.WithError(err).Warning("xt_socket kernel module could not be loaded") 377 378 if option.Config.EnableXTSocketFallback { 379 v4disabled := true 380 v6disabled := true 381 if option.Config.EnableIPv4 { 382 v4disabled = sysctl.Disable("net.ipv4.ip_early_demux") == nil 383 } 384 if option.Config.EnableIPv6 { 385 v6disabled = sysctl.Disable("net.ipv6.ip_early_demux") == nil 386 } 387 if v4disabled && v6disabled { 388 m.ipEarlyDemuxDisabled = true 389 log.Warning("Disabled ip_early_demux to allow proxy redirection with original source/destination address without xt_socket support also in non-tunneled datapath modes.") 390 } else { 391 log.WithError(err).Warning("Could not disable ip_early_demux, traffic redirected due to an HTTP policy or visibility may be dropped unexpectedly") 392 } 393 } 394 } 395 } else { 396 m.haveSocketMatch = true 397 } 398 399 v, err := getVersion("iptables") 400 if err == nil { 401 switch { 402 case waitSecondsMinVersion.Check(v): 403 m.waitArgs = []string{waitString, fmt.Sprintf("%d", option.Config.IPTablesLockTimeout/time.Second)} 404 case waitMinVersion.Check(v): 405 m.waitArgs = []string{waitString} 406 } 407 } 408 } 409 410 // SupportsOriginalSourceAddr tells if an L7 proxy can use POD's original source address and port in 411 // the upstream connection to allow the destination to properly derive the source security ID from 412 // the source IP address. 
413 func (m *IptablesManager) SupportsOriginalSourceAddr() bool { 414 // Original source address use works if xt_socket match is supported, or if ip early demux 415 // is disabled, or if the datapath is in a tunneling mode. 416 return m.haveSocketMatch || m.ipEarlyDemuxDisabled || option.Config.Tunnel != option.TunnelDisabled 417 } 418 419 // RemoveRules removes iptables rules installed by Cilium. 420 func (m *IptablesManager) RemoveRules() { 421 // Set of tables that have had iptables rules in any Cilium version 422 tables := []string{"nat", "mangle", "raw", "filter"} 423 for _, t := range tables { 424 m.removeCiliumRules(t, "iptables", ciliumPrefix) 425 } 426 427 // Set of tables that have had ip6tables rules in any Cilium version 428 if m.haveIp6tables { 429 tables6 := []string{"mangle", "raw", "filter"} 430 for _, t := range tables6 { 431 m.removeCiliumRules(t, "ip6tables", ciliumPrefix) 432 } 433 } 434 435 for _, c := range ciliumChains { 436 c.remove(m.waitArgs, false) 437 } 438 } 439 440 func (m *IptablesManager) ingressProxyRule(cmd, l4Match, markMatch, mark, port, name string) []string { 441 return append(m.waitArgs, 442 "-t", "mangle", 443 cmd, ciliumPreMangleChain, 444 "-p", l4Match, 445 "-m", "mark", "--mark", markMatch, 446 "-m", "comment", "--comment", "cilium: TPROXY to host "+name+" proxy", 447 "-j", "TPROXY", 448 "--tproxy-mark", mark, 449 "--on-port", port) 450 } 451 452 func (m *IptablesManager) inboundProxyRedirectRule(cmd string) []string { 453 // Mark host proxy transparent connections to be routed to the local stack. 454 // This comes before the TPROXY rules in the chain, and setting the mark 455 // without the proxy port number will make the TPROXY rule to not match, 456 // as we do not want to try to tproxy packets that are going to the stack 457 // already. 458 // This rule is needed for couple of reasons: 459 // 1. route return traffic to the proxy 460 // 2. 
route original direction traffic that would otherwise be intercepted 461 // by ip_early_demux 462 toProxyMark := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy) 463 return append(m.waitArgs, 464 "-t", "mangle", 465 cmd, ciliumPreMangleChain, 466 "-m", "socket", "--transparent", 467 "-m", "comment", "--comment", "cilium: any->pod redirect proxied traffic to host proxy", 468 "-j", "MARK", 469 "--set-mark", toProxyMark) 470 } 471 472 func (m *IptablesManager) iptIngressProxyRule(cmd string, l4proto string, proxyPort uint16, name string) error { 473 // Match 474 port := uint32(byteorder.HostToNetwork(proxyPort).(uint16)) << 16 475 ingressMarkMatch := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy|port) 476 // TPROXY params 477 ingressProxyMark := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy) 478 ingressProxyPort := fmt.Sprintf("%d", proxyPort) 479 480 var err error 481 if option.Config.EnableIPv4 { 482 err = runProg("iptables", 483 m.ingressProxyRule(cmd, l4proto, ingressMarkMatch, 484 ingressProxyMark, ingressProxyPort, name), 485 false) 486 } 487 if err == nil && option.Config.EnableIPv6 { 488 err = runProg("ip6tables", 489 m.ingressProxyRule(cmd, l4proto, ingressMarkMatch, 490 ingressProxyMark, ingressProxyPort, name), 491 false) 492 } 493 return err 494 } 495 496 func (m *IptablesManager) egressProxyRule(cmd, l4Match, markMatch, mark, port, name string) []string { 497 return append(m.waitArgs, 498 "-t", "mangle", 499 cmd, ciliumPreMangleChain, 500 "-p", l4Match, 501 "-m", "mark", "--mark", markMatch, 502 "-m", "comment", "--comment", "cilium: TPROXY to host "+name+" proxy", 503 "-j", "TPROXY", 504 "--tproxy-mark", mark, 505 "--on-port", port) 506 } 507 508 func (m *IptablesManager) iptEgressProxyRule(cmd string, l4proto string, proxyPort uint16, name string) error { 509 // Match 510 port := uint32(byteorder.HostToNetwork(proxyPort).(uint16)) << 16 511 egressMarkMatch := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy|port) 512 // 
TPROXY params 513 egressProxyMark := fmt.Sprintf("%#08x", linux_defaults.MagicMarkIsToProxy) 514 egressProxyPort := fmt.Sprintf("%d", proxyPort) 515 516 var err error 517 if option.Config.EnableIPv4 { 518 err = runProg("iptables", 519 m.egressProxyRule(cmd, l4proto, egressMarkMatch, 520 egressProxyMark, egressProxyPort, name), 521 false) 522 } 523 if err == nil && option.Config.EnableIPv6 { 524 err = runProg("ip6tables", 525 m.egressProxyRule(cmd, l4proto, egressMarkMatch, 526 egressProxyMark, egressProxyPort, name), 527 false) 528 } 529 return err 530 } 531 532 func (m *IptablesManager) installStaticProxyRules() error { 533 // match traffic to a proxy (upper 16 bits has the proxy port, which is masked out) 534 matchToProxy := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsToProxy, linux_defaults.MagicMarkHostMask) 535 // proxy return traffic has 0 ID in the mask 536 matchProxyReply := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxy, linux_defaults.MagicMarkProxyNoIDMask) 537 538 var err error 539 if option.Config.EnableIPv4 { 540 // No conntrack for traffic to proxy 541 err = runProg("iptables", append( 542 m.waitArgs, 543 "-t", "raw", 544 "-A", ciliumPreRawChain, 545 // Destination is a local node POD address 546 "!", "-d", node.GetInternalIPv4().String(), 547 "-m", "mark", "--mark", matchToProxy, 548 "-m", "comment", "--comment", "cilium: NOTRACK for proxy traffic", 549 "-j", "NOTRACK"), false) 550 if err == nil { 551 // Explicit ACCEPT for the proxy traffic. Needed when the INPUT defaults to DROP. 552 // Matching needs to be the same as for the NOTRACK rule above. 
553 err = runProg("iptables", append( 554 m.waitArgs, 555 "-t", "filter", 556 "-A", ciliumInputChain, 557 // Destination is a local node POD address 558 "!", "-d", node.GetInternalIPv4().String(), 559 "-m", "mark", "--mark", matchToProxy, 560 "-m", "comment", "--comment", "cilium: ACCEPT for proxy traffic", 561 "-j", "ACCEPT"), false) 562 } 563 if err == nil { 564 // No conntrack for proxy return traffic 565 err = runProg("iptables", append( 566 m.waitArgs, 567 "-t", "raw", 568 "-A", ciliumOutputRawChain, 569 // Return traffic is from a local node POD address 570 "!", "-s", node.GetInternalIPv4().String(), 571 "-m", "mark", "--mark", matchProxyReply, 572 "-m", "comment", "--comment", "cilium: NOTRACK for proxy return traffic", 573 "-j", "NOTRACK"), false) 574 } 575 if err == nil { 576 // Explicit ACCEPT for the proxy return traffic. Needed when the OUTPUT defaults to DROP. 577 // Matching needs to be the same as for the NOTRACK rule above. 578 err = runProg("iptables", append( 579 m.waitArgs, 580 "-t", "filter", 581 "-A", ciliumOutputChain, 582 // Return traffic is from a local node POD address 583 "!", "-s", node.GetInternalIPv4().String(), 584 "-m", "mark", "--mark", matchProxyReply, 585 "-m", "comment", "--comment", "cilium: ACCEPT for proxy return traffic", 586 "-j", "ACCEPT"), false) 587 } 588 if err == nil && m.haveSocketMatch { 589 // Direct inbound TPROXYed traffic towards the socket 590 err = runProg("iptables", m.inboundProxyRedirectRule("-A"), false) 591 } 592 } 593 if err == nil && option.Config.EnableIPv6 { 594 // No conntrack for traffic to ingress proxy 595 err = runProg("ip6tables", append( 596 m.waitArgs, 597 "-t", "raw", 598 "-A", ciliumPreRawChain, 599 // Destination is a local node POD address 600 "!", "-d", node.GetIPv6().String(), 601 "-m", "mark", "--mark", matchToProxy, 602 "-m", "comment", "--comment", "cilium: NOTRACK for proxy traffic", 603 "-j", "NOTRACK"), false) 604 if err == nil { 605 // Explicit ACCEPT for the proxy traffic. 
Needed when the INPUT defaults to DROP. 606 // Matching needs to be the same as for the NOTRACK rule above. 607 err = runProg("ip6tables", append( 608 m.waitArgs, 609 "-t", "filter", 610 "-A", ciliumInputChain, 611 // Destination is a local node POD address 612 "!", "-d", node.GetIPv6().String(), 613 "-m", "mark", "--mark", matchToProxy, 614 "-m", "comment", "--comment", "cilium: ACCEPT for proxy traffic", 615 "-j", "ACCEPT"), false) 616 } 617 if err == nil { 618 // No conntrack for proxy return traffic 619 err = runProg("ip6tables", append( 620 m.waitArgs, 621 "-t", "raw", 622 "-A", ciliumOutputRawChain, 623 // Return traffic is from a local node POD address 624 "!", "-s", node.GetIPv6().String(), 625 "-m", "mark", "--mark", matchProxyReply, 626 "-m", "comment", "--comment", "cilium: NOTRACK for proxy return traffic", 627 "-j", "NOTRACK"), false) 628 } 629 if err == nil { 630 // Explicit ACCEPT for the proxy return traffic. Needed when the OUTPUT defaults to DROP. 631 // Matching needs to be the same as for the NOTRACK rule above. 632 err = runProg("ip6tables", append( 633 m.waitArgs, 634 "-t", "filter", 635 "-A", ciliumOutputChain, 636 // Return traffic is from a local node POD address 637 "!", "-s", node.GetIPv6().String(), 638 "-m", "mark", "--mark", matchProxyReply, 639 "-m", "comment", "--comment", "cilium: ACCEPT for proxy return traffic", 640 "-j", "ACCEPT"), false) 641 } 642 if err == nil && m.haveSocketMatch { 643 // Direct inbound TPROXYed traffic towards the socket 644 err = runProg("ip6tables", m.inboundProxyRedirectRule("-A"), false) 645 } 646 } 647 return err 648 } 649 650 // install or remove rules for a single proxy port 651 func (m *IptablesManager) iptProxyRules(cmd string, proxyPort uint16, ingress bool, name string) error { 652 // Redirect packets to the host proxy via TPROXY, as directed by the Cilium 653 // datapath bpf programs via skb marks (egress) or DSCP (ingress). 
654 if ingress { 655 if err := m.iptIngressProxyRule(cmd, "tcp", proxyPort, name); err != nil { 656 return err 657 } 658 if err := m.iptIngressProxyRule(cmd, "udp", proxyPort, name); err != nil { 659 return err 660 } 661 } else { 662 if err := m.iptEgressProxyRule(cmd, "tcp", proxyPort, name); err != nil { 663 return err 664 } 665 if err := m.iptEgressProxyRule(cmd, "udp", proxyPort, name); err != nil { 666 return err 667 } 668 } 669 return nil 670 } 671 672 func (m *IptablesManager) InstallProxyRules(proxyPort uint16, ingress bool, name string) error { 673 return m.iptProxyRules("-A", proxyPort, ingress, name) 674 } 675 676 func (m *IptablesManager) RemoveProxyRules(proxyPort uint16, ingress bool, name string) error { 677 return m.iptProxyRules("-D", proxyPort, ingress, name) 678 } 679 680 func (m *IptablesManager) remoteSnatDstAddrExclusion() string { 681 switch { 682 case option.Config.IPv4NativeRoutingCIDR() != nil: 683 return option.Config.IPv4NativeRoutingCIDR().String() 684 685 case option.Config.Tunnel == option.TunnelDisabled: 686 return node.GetIPv4ClusterRange().String() 687 688 default: 689 return node.GetIPv4AllocRange().String() 690 } 691 } 692 693 func getDeliveryInterface(ifName string) string { 694 deliveryInterface := ifName 695 if option.Config.IPAM == option.IPAMENI || option.Config.EnableEndpointRoutes { 696 deliveryInterface = "lxc+" 697 } 698 return deliveryInterface 699 } 700 701 // TransientRulesStart installs iptables rules for Cilium that need to be 702 // kept in-tact during agent restart which removes/installs its main rules. 703 // Transient rules are then removed once iptables rule update cycle has 704 // completed. This is mainly due to interactions with kube-proxy. 
705 func (m *IptablesManager) TransientRulesStart(ifName string) error { 706 if option.Config.EnableIPv4 { 707 localDeliveryInterface := getDeliveryInterface(ifName) 708 709 m.TransientRulesEnd(true) 710 711 if err := transientChain.add(m.waitArgs); err != nil { 712 return fmt.Errorf("cannot add custom chain %s: %s", transientChain.name, err) 713 } 714 // While kube-proxy does change the policy of the iptables FORWARD chain 715 // it doesn't seem to handle all cases, e.g. host network pods that use 716 // the node IP which would still end up in default DENY. Similarly, for 717 // plain Docker setup, we would otherwise hit default DENY in FORWARD chain. 718 // Also, k8s 1.15 introduced "-m conntrack --ctstate INVALID -j DROP" which 719 // in the direct routing case can drop EP replies. 720 // 721 // Therefore, add three rules below to avoid having a user to manually opt-in. 722 // See also: https://github.com/kubernetes/kubernetes/issues/39823 723 // In here can only be basic ACCEPT rules, nothing more complicated. 724 // 725 // The second rule is for the case of nodeport traffic where the backend is 726 // remote. The traffic flow in FORWARD is as follows: 727 // 728 // - Node serving nodeport request: 729 // IN=eno1 OUT=cilium_host 730 // IN=cilium_host OUT=eno1 731 // 732 // - Node running backend: 733 // IN=eno1 OUT=cilium_host 734 // IN=lxc... 
OUT=eno1 735 if err := runProg("iptables", append( 736 m.waitArgs, 737 "-A", ciliumTransientForwardChain, 738 "-o", localDeliveryInterface, 739 "-m", "comment", "--comment", "cilium (transient): any->cluster on "+localDeliveryInterface+" forward accept", 740 "-j", "ACCEPT"), false); err != nil { 741 return err 742 } 743 if err := runProg("iptables", append( 744 m.waitArgs, 745 "-A", ciliumTransientForwardChain, 746 "-i", localDeliveryInterface, 747 "-m", "comment", "--comment", "cilium (transient): cluster->any on "+localDeliveryInterface+" forward accept (nodeport)", 748 "-j", "ACCEPT"), false); err != nil { 749 return err 750 } 751 if err := runProg("iptables", append( 752 m.waitArgs, 753 "-A", ciliumTransientForwardChain, 754 "-i", "lxc+", 755 "-m", "comment", "--comment", "cilium (transient): cluster->any on lxc+ forward accept", 756 "-j", "ACCEPT"), false); err != nil { 757 return err 758 } 759 if err := transientChain.installFeeder(m.waitArgs); err != nil { 760 return fmt.Errorf("cannot install feeder rule %s: %s", transientChain.feederArgs, err) 761 } 762 } 763 return nil 764 } 765 766 // TransientRulesEnd removes Cilium related rules installed from TransientRulesStart. 767 func (m *IptablesManager) TransientRulesEnd(quiet bool) { 768 if option.Config.EnableIPv4 { 769 m.removeCiliumRules("filter", "iptables", ciliumTransientForwardChain) 770 transientChain.remove(m.waitArgs, quiet) 771 } 772 } 773 774 // InstallRules installs iptables rules for Cilium in specific use-cases 775 // (most specifically, interaction with kube-proxy). 
776 func (m *IptablesManager) InstallRules(ifName string) error { 777 matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask) 778 matchFromIPSecDecrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask) 779 localDeliveryInterface := getDeliveryInterface(ifName) 780 781 for _, c := range ciliumChains { 782 if err := c.add(m.waitArgs); err != nil { 783 return fmt.Errorf("cannot add custom chain %s: %s", c.name, err) 784 } 785 } 786 787 if err := m.installStaticProxyRules(); err != nil { 788 return fmt.Errorf("cannot add static proxy rules: %s", err) 789 } 790 791 if err := m.addCiliumAcceptXfrmRules(); err != nil { 792 return err 793 } 794 795 if option.Config.EnableIPv4 { 796 // See kube-proxy comment in TransientRules(). 797 if err := runProg("iptables", append( 798 m.waitArgs, 799 "-A", ciliumForwardChain, 800 "-o", localDeliveryInterface, 801 "-m", "comment", "--comment", "cilium: any->cluster on "+localDeliveryInterface+" forward accept", 802 "-j", "ACCEPT"), false); err != nil { 803 return err 804 } 805 if err := runProg("iptables", append( 806 m.waitArgs, 807 "-A", ciliumForwardChain, 808 "-i", localDeliveryInterface, 809 "-m", "comment", "--comment", "cilium: cluster->any on "+localDeliveryInterface+" forward accept (nodeport)", 810 "-j", "ACCEPT"), false); err != nil { 811 return err 812 } 813 if err := runProg("iptables", append( 814 m.waitArgs, 815 "-A", ciliumForwardChain, 816 "-i", "lxc+", 817 "-m", "comment", "--comment", "cilium: cluster->any on lxc+ forward accept", 818 "-j", "ACCEPT"), false); err != nil { 819 return err 820 } 821 822 // Mark all packets sourced from processes running on the host with a 823 // special marker so that we can differentiate traffic sourced locally 824 // vs. traffic from the outside world that was masqueraded to appear 825 // like it's from the host. 
826 // 827 // Originally we set this mark only for traffic destined to the 828 // ifName device, to ensure that any traffic directly reaching 829 // to a Cilium-managed IP could be classified as from the host. 830 // 831 // However, there's another case where a local process attempts to 832 // reach a service IP which is backed by a Cilium-managed pod. The 833 // service implementation is outside of Cilium's control, for example, 834 // handled by kube-proxy. We can tag even this traffic with a magic 835 // mark, then when the service implementation proxies it back into 836 // Cilium the BPF will see this mark and understand that the packet 837 // originated from the host. 838 matchFromProxy := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIsProxy, linux_defaults.MagicMarkProxyMask) 839 markAsFromHost := fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkHost, linux_defaults.MagicMarkHostMask) 840 if err := runProg("iptables", append( 841 m.waitArgs, 842 "-t", "filter", 843 "-A", ciliumOutputChain, 844 "-m", "mark", "!", "--mark", matchFromIPSecDecrypt, // Don't match ipsec traffic 845 "-m", "mark", "!", "--mark", matchFromIPSecEncrypt, // Don't match ipsec traffic 846 "-m", "mark", "!", "--mark", matchFromProxy, // Don't match proxy traffic 847 "-m", "comment", "--comment", "cilium: host->any mark as from host", 848 "-j", "MARK", "--set-xmark", markAsFromHost), false); err != nil { 849 return err 850 } 851 852 if option.Config.Masquerade { 853 // Masquerade all egress traffic leaving the node 854 // 855 // This rule must be first as it has different exclusion criteria 856 // than the other rules in this table. 
857 // 858 // The following conditions must be met: 859 // * May not leave on a cilium_ interface, this excludes all 860 // tunnel traffic 861 // * Must originate from an IP in the local allocation range 862 // * Must not be reply if BPF NodePort is enabled 863 // * Tunnel mode: 864 // * May not be targeted to an IP in the local allocation 865 // range 866 // * Non-tunnel mode: 867 // * May not be targeted to an IP in the cluster range 868 if option.Config.EgressMasqueradeInterfaces != "" { 869 if err := runProg("iptables", append( 870 m.waitArgs, 871 "-t", "nat", 872 "-A", ciliumPostNatChain, 873 "!", "-d", m.remoteSnatDstAddrExclusion(), 874 "-o", option.Config.EgressMasqueradeInterfaces, 875 "-m", "comment", "--comment", "cilium masquerade non-cluster", 876 "-j", "MASQUERADE"), false); err != nil { 877 return err 878 } 879 } else { 880 if err := runProg("iptables", append( 881 m.waitArgs, 882 "-t", "nat", 883 "-A", ciliumPostNatChain, 884 "-s", node.GetIPv4AllocRange().String(), 885 "!", "-d", m.remoteSnatDstAddrExclusion(), 886 "!", "-o", "cilium_+", 887 "-m", "comment", "--comment", "cilium masquerade non-cluster", 888 "-j", "MASQUERADE"), false); err != nil { 889 return err 890 } 891 } 892 893 // The following rules exclude traffic from the remaining rules in this chain. 894 // If any of these rules match, none of the remaining rules in this chain 895 // are considered. 896 // Exclude traffic for other than interface from the masquarade rules. 897 // RETURN fro the chain as it is possible that other rules need to be matched. 
898 if err := runProg("iptables", append( 899 m.waitArgs, 900 "-t", "nat", 901 "-A", ciliumPostNatChain, 902 "!", "-o", localDeliveryInterface, 903 "-m", "comment", "--comment", "exclude non-"+ifName+" traffic from masquerade", 904 "-j", "RETURN"), false); err != nil { 905 return err 906 } 907 908 // Exclude proxy return traffic from the masquarade rules 909 if err := runProg("iptables", append( 910 m.waitArgs, 911 "-t", "nat", 912 "-A", ciliumPostNatChain, 913 "-m", "mark", "--mark", matchFromProxy, // Don't match proxy (return) traffic 914 "-m", "comment", "--comment", "exclude proxy return traffic from masquarade", 915 "-j", "ACCEPT"), false); err != nil { 916 return err 917 } 918 919 if option.Config.Tunnel != option.TunnelDisabled { 920 // Masquerade all traffic from the host into the ifName 921 // interface if the source is not the internal IP 922 // 923 // The following conditions must be met: 924 // * Must be targeted for the ifName interface 925 // * Must be targeted to an IP that is not local 926 // * Tunnel mode: 927 // * May not already be originating from the masquerade IP 928 // * Non-tunnel mode: 929 // * May not orignate from any IP inside of the cluster range 930 if err := runProg("iptables", append( 931 m.waitArgs, 932 "-t", "nat", 933 "-A", ciliumPostNatChain, 934 "!", "-s", node.GetHostMasqueradeIPv4().String(), 935 "!", "-d", node.GetIPv4AllocRange().String(), 936 "-o", "cilium_host", 937 "-m", "comment", "--comment", "cilium host->cluster masquerade", 938 "-j", "SNAT", "--to-source", node.GetHostMasqueradeIPv4().String()), false); err != nil { 939 return err 940 } 941 } 942 943 // Masquerade all traffic from the host into local 944 // endpoints if the source is 127.0.0.1. This is 945 // required to force replies out of the endpoint's 946 // network namespace. 
947 // 948 // The following conditions must be met: 949 // * Must be targeted for local endpoint 950 // * Must be from 127.0.0.1 951 if err := runProg("iptables", append( 952 m.waitArgs, 953 "-t", "nat", 954 "-A", ciliumPostNatChain, 955 "-s", "127.0.0.1", 956 "-o", localDeliveryInterface, 957 "-m", "comment", "--comment", "cilium host->cluster from 127.0.0.1 masquerade", 958 "-j", "SNAT", "--to-source", node.GetHostMasqueradeIPv4().String()), false); err != nil { 959 return err 960 } 961 962 // Masquerade all traffic that originated from a local 963 // pod and thus carries a security identity and that 964 // was also DNAT'ed. It must be masqueraded to ensure 965 // that reverse NAT can be performed. Otherwise the 966 // reply traffic would be sent directly to the pod 967 // without traversing the Linux stack again. 968 // 969 // This is only done if EnableEndpointRoutes is 970 // disabled, if EnableEndpointRoutes is enabled, then 971 // all traffic always passes through the stack anyway. 
972 // 973 // This is required for: 974 // - portmap/host if both source and destination are 975 // on the same node 976 // - kiam if source and server are on the same node 977 if !option.Config.EnableEndpointRoutes { 978 if err := runProg("iptables", append( 979 m.waitArgs, 980 "-t", "nat", 981 "-A", ciliumPostNatChain, 982 "-m", "mark", "--mark", fmt.Sprintf("%#08x/%#08x", linux_defaults.MagicMarkIdentity, linux_defaults.MagicMarkHostMask), 983 "-o", localDeliveryInterface, 984 "-m", "conntrack", "--ctstate", "DNAT", 985 "-m", "comment", "--comment", "hairpin traffic that originated from a local pod", 986 "-j", "SNAT", "--to-source", node.GetHostMasqueradeIPv4().String()), false); err != nil { 987 return err 988 } 989 } 990 } 991 } 992 993 if option.Config.EnableIPSec { 994 if err := m.addCiliumNoTrackXfrmRules(); err != nil { 995 return fmt.Errorf("cannot install xfrm rules: %s", err) 996 } 997 } 998 999 for _, c := range ciliumChains { 1000 if err := c.installFeeder(m.waitArgs); err != nil { 1001 return fmt.Errorf("cannot install feeder rule %s: %s", c.feederArgs, err) 1002 } 1003 } 1004 1005 return nil 1006 } 1007 1008 func (m *IptablesManager) ciliumNoTrackXfrmRules(prog, input string) error { 1009 matchFromIPSecEncrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask) 1010 matchFromIPSecDecrypt := fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask) 1011 1012 if err := runProg(prog, append( 1013 m.waitArgs, 1014 "-t", "raw", input, ciliumPreRawChain, 1015 "-m", "mark", "--mark", matchFromIPSecDecrypt, 1016 "-m", "comment", "--comment", xfrmDescription, 1017 "-j", "NOTRACK"), false); err != nil { 1018 return err 1019 } 1020 if err := runProg(prog, append( 1021 m.waitArgs, 1022 "-t", "raw", input, ciliumPreRawChain, 1023 "-m", "mark", "--mark", matchFromIPSecEncrypt, 1024 "-m", "comment", "--comment", xfrmDescription, 1025 "-j", "NOTRACK"), false); err != nil { 1026 return err 1027 
}
	return nil
}

// addCiliumAcceptXfrmRules excludes crypto traffic from the filter and nat
// table rules. This avoids encryption bits and keyID, 0x*d00 for decryption
// and 0x*e00 for encryption, colliding with existing rules. Needed for
// kube-proxy for example.
//
// It is a no-op when IPsec is disabled. Otherwise it appends, to each of the
// Cilium filter and nat chains listed below, one ACCEPT rule per xfrm mark.
// The first iptables failure aborts the installation and is returned as is.
func (m *IptablesManager) addCiliumAcceptXfrmRules() error {
	if !option.Config.EnableIPSec {
		return nil
	}

	// Marks are listed in the order their rules are appended: the decrypt
	// mark first, then the encrypt mark.
	xfrmMarks := []string{
		fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkDecrypt, linux_defaults.RouteMarkMask),
		fmt.Sprintf("%#08x/%#08x", linux_defaults.RouteMarkEncrypt, linux_defaults.RouteMarkMask),
	}

	// insertAcceptXfrm appends one ACCEPT rule per xfrm mark to the given
	// chain of the given table.
	insertAcceptXfrm := func(table, chain string) error {
		comment := "exclude xfrm marks from " + table + " " + chain + " chain"

		for _, mark := range xfrmMarks {
			if err := runProg("iptables", append(
				m.waitArgs,
				"-t", table,
				"-A", chain,
				"-m", "mark", "--mark", mark,
				"-m", "comment", "--comment", comment,
				"-j", "ACCEPT"), false); err != nil {
				return err
			}
		}
		return nil
	}

	// Chains are processed in this exact order; filter chains first, then nat.
	targets := []struct {
		table string
		chain string
	}{
		{"filter", ciliumInputChain},
		{"filter", ciliumOutputChain},
		{"filter", ciliumForwardChain},
		{"nat", ciliumPostNatChain},
		{"nat", ciliumPreNatChain},
		{"nat", ciliumOutputNatChain},
	}
	for _, target := range targets {
		if err := insertAcceptXfrm(target.table, target.chain); err != nil {
			return err
		}
	}
	return nil
}

// addCiliumNoTrackXfrmRules inserts ("-I") the xfrm NOTRACK rules through
// iptables when IPv4 is enabled, and does nothing otherwise. Only the
// "iptables" program is invoked here; any error from
// ciliumNoTrackXfrmRules is returned unchanged.
func (m *IptablesManager) addCiliumNoTrackXfrmRules() error {
	if !option.Config.EnableIPv4 {
		return nil
	}
	return m.ciliumNoTrackXfrmRules("iptables", "-I")
}