github.com/cilium/cilium@v1.16.2/pkg/datapath/loader/netlink.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package loader 5 6 import ( 7 "errors" 8 "fmt" 9 "net" 10 "os" 11 12 "github.com/cilium/ebpf" 13 "github.com/vishvananda/netlink" 14 "golang.org/x/sys/unix" 15 16 "github.com/cilium/cilium/pkg/bpf" 17 "github.com/cilium/cilium/pkg/datapath/linux/sysctl" 18 "github.com/cilium/cilium/pkg/datapath/tables" 19 "github.com/cilium/cilium/pkg/datapath/tunnel" 20 "github.com/cilium/cilium/pkg/defaults" 21 "github.com/cilium/cilium/pkg/mac" 22 "github.com/cilium/cilium/pkg/maps/policymap" 23 "github.com/cilium/cilium/pkg/option" 24 ) 25 26 const qdiscClsact = "clsact" 27 28 func directionToParent(dir string) uint32 { 29 switch dir { 30 case dirIngress: 31 return netlink.HANDLE_MIN_INGRESS 32 case dirEgress: 33 return netlink.HANDLE_MIN_EGRESS 34 } 35 return 0 36 } 37 38 // loadDatapath returns a Collection given the ELF obj, renames maps according 39 // to mapRenames and overrides the given constants. 40 // 41 // When successful, returns a function that commits pending map pins to the bpf 42 // file system, for maps that were found to be incompatible with their pinned 43 // counterparts, or for maps with certain flags that modify the default pinning 44 // behaviour. 45 // 46 // When attaching multiple programs from the same ELF in a loop, the returned 47 // function should only be run after all entrypoints have been attached. For 48 // example, attach both bpf_host.c:cil_to_netdev and cil_from_netdev before 49 // invoking the returned function, otherwise missing tail calls will occur. 50 func loadDatapath(spec *ebpf.CollectionSpec, mapRenames map[string]string, constants map[string]uint64) (*ebpf.Collection, func() error, error) { 51 spec, err := renameMaps(spec, mapRenames) 52 if err != nil { 53 return nil, nil, err 54 } 55 56 // Inserting a program into these maps will immediately cause other BPF 57 // programs to call into it, even if other maps like cilium_calls haven't been 58 // fully populated for the current ELF. Save their contents and avoid sending 59 // them to the ELF loader. 60 var policyProgs, egressPolicyProgs []ebpf.MapKV 61 if pm, ok := spec.Maps[policymap.PolicyCallMapName]; ok { 62 policyProgs = append(policyProgs, pm.Contents...) 63 pm.Contents = nil 64 } 65 if pm, ok := spec.Maps[policymap.PolicyEgressCallMapName]; ok { 66 egressPolicyProgs = append(egressPolicyProgs, pm.Contents...) 67 pm.Contents = nil 68 } 69 70 // Load the CollectionSpec into the kernel, picking up any pinned maps from 71 // bpffs in the process. 72 pinPath := bpf.TCGlobalsPath() 73 collOpts := bpf.CollectionOptions{ 74 CollectionOptions: ebpf.CollectionOptions{ 75 Maps: ebpf.MapOptions{PinPath: pinPath}, 76 }, 77 Constants: constants, 78 } 79 if err := bpf.MkdirBPF(pinPath); err != nil { 80 return nil, nil, fmt.Errorf("creating bpffs pin path: %w", err) 81 } 82 83 log.Debug("Loading Collection into kernel") 84 85 coll, commit, err := bpf.LoadCollection(spec, &collOpts) 86 var ve *ebpf.VerifierError 87 if errors.As(err, &ve) { 88 if _, err := fmt.Fprintf(os.Stderr, "Verifier error: %s\nVerifier log: %+v\n", err, ve); err != nil { 89 return nil, nil, fmt.Errorf("writing verifier log to stderr: %w", err) 90 } 91 } 92 if err != nil { 93 return nil, nil, fmt.Errorf("loading eBPF collection into the kernel: %w", err) 94 } 95 96 // If an ELF contains one of the policy call maps, resolve and insert the 97 // programs it refers to into the map. This always needs to happen _before_ 98 // attaching the ELF's entrypoint(s), but after the ELF's internal tail call 99 // map (cilium_calls) has been populated, as doing so means the ELF's programs 100 // become reachable through its policy programs, which hold references to the 101 // endpoint's cilium_calls. Therefore, inserting policy programs is considered 102 // an 'attachment', just not through the typical bpf hooks. 103 // 104 // For example, a packet can enter to-container, jump into the bpf_host policy 105 // program, which then jumps into the endpoint's policy program that are 106 // installed by the loops below. If we allow packets to enter the endpoint's 107 // bpf programs through its tc hook(s), _all_ this plumbing needs to be done 108 // first, or we risk missing tail calls. 109 if len(policyProgs) != 0 { 110 if err := resolveAndInsertCalls(coll, policymap.PolicyCallMapName, policyProgs); err != nil { 111 return nil, nil, fmt.Errorf("inserting policy programs: %w", err) 112 } 113 } 114 115 if len(egressPolicyProgs) != 0 { 116 if err := resolveAndInsertCalls(coll, policymap.PolicyEgressCallMapName, egressPolicyProgs); err != nil { 117 return nil, nil, fmt.Errorf("inserting egress policy programs: %w", err) 118 } 119 } 120 121 return coll, commit, nil 122 } 123 124 // resolveAndInsertCalls resolves a given slice of ebpf.MapKV containing u32 keys 125 // and string values (typical for a prog array) to the Programs they point to in 126 // the Collection. The Programs are then inserted into the Map with the given 127 // mapName contained within the Collection. 128 func resolveAndInsertCalls(coll *ebpf.Collection, mapName string, calls []ebpf.MapKV) error { 129 m, ok := coll.Maps[mapName] 130 if !ok { 131 return fmt.Errorf("call map %s not found in Collection", mapName) 132 } 133 134 for _, v := range calls { 135 name := v.Value.(string) 136 slot := v.Key.(uint32) 137 138 p, ok := coll.Programs[name] 139 if !ok { 140 return fmt.Errorf("program %s not found in Collection", name) 141 } 142 143 if err := m.Update(slot, p, ebpf.UpdateAny); err != nil { 144 return fmt.Errorf("inserting program %s into slot %d", name, slot) 145 } 146 147 log.Debugf("Inserted program %s into %s slot %d", name, mapName, slot) 148 } 149 150 return nil 151 } 152 153 // enableForwarding puts the given link into the up state and enables IP forwarding. 154 func enableForwarding(sysctl sysctl.Sysctl, link netlink.Link) error { 155 ifName := link.Attrs().Name 156 157 if err := netlink.LinkSetUp(link); err != nil { 158 log.WithError(err).WithField("device", ifName).Warn("Could not set up the link") 159 return err 160 } 161 162 sysSettings := make([]tables.Sysctl, 0, 5) 163 if option.Config.EnableIPv6 { 164 sysSettings = append(sysSettings, tables.Sysctl{ 165 Name: []string{"net", "ipv6", "conf", ifName, "forwarding"}, Val: "1", IgnoreErr: false}) 166 } 167 if option.Config.EnableIPv4 { 168 sysSettings = append(sysSettings, []tables.Sysctl{ 169 {Name: []string{"net", "ipv4", "conf", ifName, "forwarding"}, Val: "1", IgnoreErr: false}, 170 {Name: []string{"net", "ipv4", "conf", ifName, "rp_filter"}, Val: "0", IgnoreErr: false}, 171 {Name: []string{"net", "ipv4", "conf", ifName, "accept_local"}, Val: "1", IgnoreErr: false}, 172 {Name: []string{"net", "ipv4", "conf", ifName, "send_redirects"}, Val: "0", IgnoreErr: false}, 173 }...) 174 } 175 if err := sysctl.ApplySettings(sysSettings); err != nil { 176 return err 177 } 178 179 return nil 180 } 181 182 func setupVethPair(sysctl sysctl.Sysctl, name, peerName string) error { 183 // Create the veth pair if it doesn't exist. 184 if _, err := netlink.LinkByName(name); err != nil { 185 hostMac, err := mac.GenerateRandMAC() 186 if err != nil { 187 return err 188 } 189 peerMac, err := mac.GenerateRandMAC() 190 if err != nil { 191 return err 192 } 193 194 veth := &netlink.Veth{ 195 LinkAttrs: netlink.LinkAttrs{ 196 Name: name, 197 HardwareAddr: net.HardwareAddr(hostMac), 198 TxQLen: 1000, 199 }, 200 PeerName: peerName, 201 PeerHardwareAddr: net.HardwareAddr(peerMac), 202 } 203 if err := netlink.LinkAdd(veth); err != nil { 204 return err 205 } 206 } 207 208 veth, err := netlink.LinkByName(name) 209 if err != nil { 210 return err 211 } 212 if err := enableForwarding(sysctl, veth); err != nil { 213 return err 214 } 215 peer, err := netlink.LinkByName(peerName) 216 if err != nil { 217 return err 218 } 219 if err := enableForwarding(sysctl, peer); err != nil { 220 return err 221 } 222 223 return nil 224 } 225 226 // setupBaseDevice decides which and what kind of interfaces should be set up as 227 // the first step of datapath initialization, then performs the setup (and 228 // creation, if needed) of those interfaces. It returns two links and an error. 229 // By default, it sets up the veth pair - cilium_host and cilium_net. 230 func setupBaseDevice(sysctl sysctl.Sysctl, mtu int) (netlink.Link, netlink.Link, error) { 231 if err := setupVethPair(sysctl, defaults.HostDevice, defaults.SecondHostDevice); err != nil { 232 return nil, nil, err 233 } 234 235 linkHost, err := netlink.LinkByName(defaults.HostDevice) 236 if err != nil { 237 return nil, nil, err 238 } 239 linkNet, err := netlink.LinkByName(defaults.SecondHostDevice) 240 if err != nil { 241 return nil, nil, err 242 } 243 244 if err := netlink.LinkSetARPOff(linkHost); err != nil { 245 return nil, nil, err 246 } 247 if err := netlink.LinkSetARPOff(linkNet); err != nil { 248 return nil, nil, err 249 } 250 251 if err := netlink.LinkSetMTU(linkHost, mtu); err != nil { 252 return nil, nil, err 253 } 254 if err := netlink.LinkSetMTU(linkNet, mtu); err != nil { 255 return nil, nil, err 256 } 257 258 return linkHost, linkNet, nil 259 } 260 261 // addHostDeviceAddr add internal ipv4 and ipv6 addresses to the cilium_host device. 262 func addHostDeviceAddr(hostDev netlink.Link, ipv4, ipv6 net.IP) error { 263 if ipv4 != nil { 264 addr := netlink.Addr{ 265 IPNet: &net.IPNet{ 266 IP: ipv4, 267 Mask: net.CIDRMask(32, 32), // corresponds to /32 268 }, 269 } 270 271 if err := netlink.AddrReplace(hostDev, &addr); err != nil { 272 return err 273 } 274 } 275 if ipv6 != nil { 276 addr := netlink.Addr{ 277 IPNet: &net.IPNet{ 278 IP: ipv6, 279 Mask: net.CIDRMask(128, 128), // corresponds to /128 280 }, 281 } 282 283 if err := netlink.AddrReplace(hostDev, &addr); err != nil { 284 return err 285 } 286 } 287 return nil 288 } 289 290 // setupTunnelDevice ensures the cilium_{mode} device is created and 291 // unused leftover devices are cleaned up in case mode changes. 292 func setupTunnelDevice(sysctl sysctl.Sysctl, mode tunnel.Protocol, port uint16, mtu int) error { 293 switch mode { 294 case tunnel.Geneve: 295 if err := setupGeneveDevice(sysctl, port, mtu); err != nil { 296 return fmt.Errorf("setting up geneve device: %w", err) 297 } 298 if err := removeDevice(defaults.VxlanDevice); err != nil { 299 return fmt.Errorf("removing %s: %w", defaults.VxlanDevice, err) 300 } 301 302 case tunnel.VXLAN: 303 if err := setupVxlanDevice(sysctl, port, mtu); err != nil { 304 return fmt.Errorf("setting up vxlan device: %w", err) 305 } 306 if err := removeDevice(defaults.GeneveDevice); err != nil { 307 return fmt.Errorf("removing %s: %w", defaults.GeneveDevice, err) 308 } 309 310 default: 311 if err := removeDevice(defaults.VxlanDevice); err != nil { 312 return fmt.Errorf("removing %s: %w", defaults.VxlanDevice, err) 313 } 314 if err := removeDevice(defaults.GeneveDevice); err != nil { 315 return fmt.Errorf("removing %s: %w", defaults.GeneveDevice, err) 316 } 317 } 318 319 return nil 320 } 321 322 // setupGeneveDevice ensures the cilium_geneve device is created with the given 323 // destination port and mtu. 324 // 325 // Changing the destination port will recreate the device. Changing the MTU will 326 // modify the device without recreating it. 327 func setupGeneveDevice(sysctl sysctl.Sysctl, dport uint16, mtu int) error { 328 mac, err := mac.GenerateRandMAC() 329 if err != nil { 330 return err 331 } 332 333 dev := &netlink.Geneve{ 334 LinkAttrs: netlink.LinkAttrs{ 335 Name: defaults.GeneveDevice, 336 MTU: mtu, 337 HardwareAddr: net.HardwareAddr(mac), 338 }, 339 FlowBased: true, 340 Dport: dport, 341 } 342 343 l, err := ensureDevice(sysctl, dev) 344 if err != nil { 345 return fmt.Errorf("creating geneve device: %w", err) 346 } 347 348 // Recreate the device with the correct destination port. Modifying the device 349 // without recreating it is not supported. 350 geneve, _ := l.(*netlink.Geneve) 351 if geneve.Dport != dport { 352 if err := netlink.LinkDel(l); err != nil { 353 return fmt.Errorf("deleting outdated geneve device: %w", err) 354 } 355 if _, err := ensureDevice(sysctl, dev); err != nil { 356 return fmt.Errorf("recreating geneve device %s: %w", defaults.GeneveDevice, err) 357 } 358 } 359 360 return nil 361 } 362 363 // setupVxlanDevice ensures the cilium_vxlan device is created with the given 364 // port and mtu. 365 // 366 // Changing the port will recreate the device. Changing the MTU will modify the 367 // device without recreating it. 368 func setupVxlanDevice(sysctl sysctl.Sysctl, port uint16, mtu int) error { 369 mac, err := mac.GenerateRandMAC() 370 if err != nil { 371 return err 372 } 373 374 dev := &netlink.Vxlan{ 375 LinkAttrs: netlink.LinkAttrs{ 376 Name: defaults.VxlanDevice, 377 MTU: mtu, 378 HardwareAddr: net.HardwareAddr(mac), 379 }, 380 FlowBased: true, 381 Port: int(port), 382 } 383 384 l, err := ensureDevice(sysctl, dev) 385 if err != nil { 386 return fmt.Errorf("creating vxlan device: %w", err) 387 } 388 389 // Recreate the device with the correct destination port. Modifying the device 390 // without recreating it is not supported. 391 vxlan, _ := l.(*netlink.Vxlan) 392 if vxlan.Port != int(port) { 393 if err := netlink.LinkDel(l); err != nil { 394 return fmt.Errorf("deleting outdated vxlan device: %w", err) 395 } 396 if _, err := ensureDevice(sysctl, dev); err != nil { 397 return fmt.Errorf("recreating vxlan device %s: %w", defaults.VxlanDevice, err) 398 } 399 } 400 401 return nil 402 } 403 404 // setupIPIPDevices ensures the specified v4 and/or v6 devices are created and 405 // configured with their respective sysctls. 406 // 407 // Calling this function may result in tunl0 (v4) or ip6tnl0 (v6) fallback 408 // interfaces being created as a result of loading the ipip and ip6_tunnel 409 // kernel modules by creating cilium_ tunnel interfaces. These are catch-all 410 // interfaces for the ipip decapsulation stack. By default, these interfaces 411 // will be created in new network namespaces, but Cilium disables this behaviour 412 // by setting net.core.fb_tunnels_only_for_init_net = 2. 413 // 414 // In versions of Cilium prior to 1.15, the behaviour was as follows: 415 // - Repurpose the default tunl0 by setting it into collect_md mode and renaming 416 // it to cilium_ipip4. Use the interface for production traffic. 417 // - The same cannot be done for ip6tunl0, as collect_md cannot be enabled on 418 // this interface. Leave it unused. 419 // - Rename sit0 to cilium_sit, if present. This was potentially a mistake, 420 // as the sit module is not involved with ip6tnl interfaces. 421 // 422 // As of Cilium 1.15, if present, tunl0 is renamed to cilium_tunl and ip6tnl0 is 423 // renamed to cilium_ip6tnl. This is to communicate to the user that Cilium has 424 // taken control of the encapsulation stack on the node, as it currently doesn't 425 // explicitly support sharing it with other tools/CNIs. Fallback devices are left 426 // unused for production traffic. Only devices that were explicitly created are used. 427 func setupIPIPDevices(sysctl sysctl.Sysctl, ipv4, ipv6 bool) error { 428 // FlowBased sets IFLA_IPTUN_COLLECT_METADATA, the equivalent of 'ip link add 429 // ... type ipip/ip6tnl external'. This is needed so bpf programs can use 430 // bpf_skb_[gs]et_tunnel_key() on packets flowing through tunnels. 431 432 if ipv4 { 433 // Set up IPv4 tunnel device if requested. 434 if _, err := ensureDevice(sysctl, &netlink.Iptun{ 435 LinkAttrs: netlink.LinkAttrs{Name: defaults.IPIPv4Device}, 436 FlowBased: true, 437 }); err != nil { 438 return fmt.Errorf("creating %s: %w", defaults.IPIPv4Device, err) 439 } 440 441 // Rename fallback device created by potential kernel module load after 442 // creating tunnel interface. 443 if err := renameDevice("tunl0", "cilium_tunl"); err != nil { 444 return fmt.Errorf("renaming fallback device %s: %w", "tunl0", err) 445 } 446 } else { 447 if err := removeDevice(defaults.IPIPv4Device); err != nil { 448 return fmt.Errorf("removing %s: %w", defaults.IPIPv4Device, err) 449 } 450 } 451 452 if ipv6 { 453 // Set up IPv6 tunnel device if requested. 454 if _, err := ensureDevice(sysctl, &netlink.Ip6tnl{ 455 LinkAttrs: netlink.LinkAttrs{Name: defaults.IPIPv6Device}, 456 FlowBased: true, 457 }); err != nil { 458 return fmt.Errorf("creating %s: %w", defaults.IPIPv6Device, err) 459 } 460 461 // Rename fallback device created by potential kernel module load after 462 // creating tunnel interface. 463 if err := renameDevice("ip6tnl0", "cilium_ip6tnl"); err != nil { 464 return fmt.Errorf("renaming fallback device %s: %w", "tunl0", err) 465 } 466 } else { 467 if err := removeDevice(defaults.IPIPv6Device); err != nil { 468 return fmt.Errorf("removing %s: %w", defaults.IPIPv6Device, err) 469 } 470 } 471 472 return nil 473 } 474 475 // ensureDevice ensures a device with the given attrs is present on the system. 476 // If a device with the given name already exists, device creation is skipped and 477 // the existing device will be used as-is for the subsequent configuration steps. 478 // The device is never recreated. 479 // 480 // The device's state is set to 'up', L3 forwarding sysctls are applied, and MTU 481 // is set. 482 func ensureDevice(sysctl sysctl.Sysctl, attrs netlink.Link) (netlink.Link, error) { 483 name := attrs.Attrs().Name 484 485 // Reuse existing tunnel interface created by previous runs. 486 l, err := netlink.LinkByName(name) 487 if err != nil { 488 if err := netlink.LinkAdd(attrs); err != nil { 489 if errors.Is(err, unix.ENOTSUP) { 490 err = fmt.Errorf("%w, maybe kernel module for %s is not available?", err, attrs.Type()) 491 } 492 return nil, fmt.Errorf("creating device %s: %w", name, err) 493 } 494 495 // Fetch the link we've just created. 496 l, err = netlink.LinkByName(name) 497 if err != nil { 498 return nil, fmt.Errorf("retrieving created device %s: %w", name, err) 499 } 500 } 501 502 if err := enableForwarding(sysctl, l); err != nil { 503 return nil, fmt.Errorf("setting up device %s: %w", name, err) 504 } 505 506 // Update MTU on the link if necessary. 507 wantMTU, gotMTU := attrs.Attrs().MTU, l.Attrs().MTU 508 if wantMTU != 0 && wantMTU != gotMTU { 509 if err := netlink.LinkSetMTU(l, wantMTU); err != nil { 510 return nil, fmt.Errorf("setting MTU on %s: %w", name, err) 511 } 512 } 513 514 return l, nil 515 } 516 517 // removeDevice removes the device with the given name. Returns error if the 518 // device exists but was unable to be removed. 519 func removeDevice(name string) error { 520 link, err := netlink.LinkByName(name) 521 if err != nil { 522 return nil 523 } 524 525 if err := netlink.LinkDel(link); err != nil { 526 return fmt.Errorf("removing device %s: %w", name, err) 527 } 528 529 return nil 530 } 531 532 // renameDevice renames a network device from and to a given value. Returns nil 533 // if the device does not exist. 534 func renameDevice(from, to string) error { 535 link, err := netlink.LinkByName(from) 536 if err != nil { 537 return nil 538 } 539 540 if err := netlink.LinkSetName(link, to); err != nil { 541 return fmt.Errorf("renaming device %s to %s: %w", from, to, err) 542 } 543 544 return nil 545 } 546 547 // DeviceHasSKBProgramLoaded returns true if the given device has a tc(x) program 548 // attached. 549 // 550 // If checkEgress is true, returns true if there's both an ingress and 551 // egress program attached. 552 func DeviceHasSKBProgramLoaded(device string, checkEgress bool) (bool, error) { 553 link, err := netlink.LinkByName(device) 554 if err != nil { 555 return false, fmt.Errorf("retrieving device %s: %w", device, err) 556 } 557 558 itcx, err := hasCiliumTCXLinks(link, ebpf.AttachTCXIngress) 559 if err != nil { 560 return false, err 561 } 562 itc, err := hasCiliumTCFilters(link, netlink.HANDLE_MIN_INGRESS) 563 if err != nil { 564 return false, err 565 } 566 ink, err := hasCiliumNetkitLinks(link, ebpf.AttachNetkitPeer) 567 if err != nil { 568 return false, err 569 } 570 571 // Need ingress programs at minimum, bail out if these are already missing. 572 if !itc && !itcx && !ink { 573 return false, nil 574 } 575 576 if !checkEgress { 577 return true, nil 578 } 579 580 etcx, err := hasCiliumTCXLinks(link, ebpf.AttachTCXEgress) 581 if err != nil { 582 return false, err 583 } 584 etc, err := hasCiliumTCFilters(link, netlink.HANDLE_MIN_EGRESS) 585 if err != nil { 586 return false, err 587 } 588 enk, err := hasCiliumNetkitLinks(link, ebpf.AttachNetkitPrimary) 589 if err != nil { 590 return false, err 591 } 592 593 return etc || etcx || enk, nil 594 }