gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/sandbox/xdp.go (about) 1 // Copyright 2023 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package sandbox 16 17 import ( 18 "bytes" 19 "fmt" 20 "net" 21 "os" 22 "strings" 23 24 "github.com/cilium/ebpf" 25 "github.com/cilium/ebpf/link" 26 "github.com/vishvananda/netlink" 27 "golang.org/x/sys/unix" 28 "gvisor.dev/gvisor/pkg/log" 29 "gvisor.dev/gvisor/pkg/urpc" 30 "gvisor.dev/gvisor/pkg/xdp" 31 "gvisor.dev/gvisor/runsc/boot" 32 "gvisor.dev/gvisor/runsc/config" 33 "gvisor.dev/gvisor/runsc/sandbox/bpf" 34 xdpcmd "gvisor.dev/gvisor/tools/xdp/cmd" 35 ) 36 37 // createRedirectInterfacesAndRoutes initializes the network using an AF_XDP 38 // socket on a *host* device, not a device in the container netns. It: 39 // 40 // - scrapes the address, interface, and routes of the device and recreates 41 // them in the sandbox 42 // - does *not* remove them from the host device 43 // - creates an AF_XDP socket bound to the device 44 // 45 // In effect, this takes over the host device for the duration of the sentry's 46 // lifetime. This also means only one container can run at a time, as it 47 // monopolizes the device. 48 // 49 // TODO(b/240191988): Enable device sharing via XDP_SHARED_UMEM. 50 // TODO(b/240191988): IPv6 support. 51 // TODO(b/240191988): Merge redundant code with CreateLinksAndRoutes once 52 // features are finalized. 
53 func createRedirectInterfacesAndRoutes(conn *urpc.Client, conf *config.Config) error { 54 args, iface, err := prepareRedirectInterfaceArgs(boot.BindRunsc, conf) 55 if err != nil { 56 return fmt.Errorf("failed to generate redirect interface args: %w", err) 57 } 58 59 // Create an XDP socket. The sentry will mmap the rings. 60 xdpSockFD, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) 61 if err != nil { 62 return fmt.Errorf("unable to create AF_XDP socket: %w", err) 63 } 64 xdpSock := os.NewFile(uintptr(xdpSockFD), "xdp-sock-fd") 65 66 // Dup to ensure os.File doesn't close it prematurely. 67 if _, err := unix.Dup(xdpSockFD); err != nil { 68 return fmt.Errorf("failed to dup XDP sock: %w", err) 69 } 70 args.FilePayload.Files = append(args.FilePayload.Files, xdpSock) 71 72 if err := pcapAndNAT(&args, conf); err != nil { 73 return err 74 } 75 76 log.Infof("Setting up network, config: %+v", args) 77 if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { 78 return fmt.Errorf("creating links and routes: %w", err) 79 } 80 81 // Insert socket into eBPF map. Note that sockets are automatically 82 // removed from eBPF maps when released. See net/xdp/xsk.c:xsk_release 83 // and net/xdp/xsk.c:xsk_delete_from_maps. 84 mapPath := xdpcmd.RedirectMapPath(iface.Name) 85 pinnedMap, err := ebpf.LoadPinnedMap(mapPath, nil) 86 if err != nil { 87 return fmt.Errorf("failed to load pinned map %s: %w", mapPath, err) 88 } 89 // TODO(b/240191988): Updating of pinned maps should be sychronized and 90 // check for the existence of the key. 91 mapKey := uint32(0) 92 mapVal := uint32(xdpSockFD) 93 if err := pinnedMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { 94 return fmt.Errorf("failed to insert socket into map %s: %w", mapPath, err) 95 } 96 97 // Bind to the device. 98 // TODO(b/240191988): We can't assume there's only one queue, but this 99 // appears to be the case on gVNIC instances. 
100 if err := xdp.Bind(xdpSockFD, uint32(iface.Index), 0 /* queueID */, conf.AFXDPUseNeedWakeup); err != nil { 101 return fmt.Errorf("failed to bind to interface %q: %v", iface.Name, err) 102 } 103 104 return nil 105 } 106 107 // Collect addresses, routes, and neighbors from the interfaces. We only 108 // process two interfaces: the loopback and the interface we've been told to 109 // bind to. This all takes place in the netns where the runsc binary is run, 110 // *not* the netns passed to the container. 111 func prepareRedirectInterfaceArgs(bind boot.BindOpt, conf *config.Config) (boot.CreateLinksAndRoutesArgs, net.Interface, error) { 112 ifaces, err := net.Interfaces() 113 if err != nil { 114 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("querying interfaces: %w", err) 115 } 116 117 args := boot.CreateLinksAndRoutesArgs{ 118 DisconnectOk: conf.NetDisconnectOk, 119 } 120 var netIface net.Interface 121 for _, iface := range ifaces { 122 if iface.Flags&net.FlagUp == 0 { 123 log.Infof("Skipping down interface: %+v", iface) 124 continue 125 } 126 127 allAddrs, err := iface.Addrs() 128 if err != nil { 129 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) 130 } 131 132 // We build our own loopback device. 
133 if iface.Flags&net.FlagLoopback != 0 { 134 link, err := loopbackLink(conf, iface, allAddrs) 135 if err != nil { 136 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) 137 } 138 args.LoopbackLinks = append(args.LoopbackLinks, link) 139 continue 140 } 141 142 if iface.Name != conf.XDP.IfaceName { 143 log.Infof("Skipping interface %q", iface.Name) 144 continue 145 } 146 147 var ipAddrs []*net.IPNet 148 for _, ifaddr := range allAddrs { 149 ipNet, ok := ifaddr.(*net.IPNet) 150 if !ok { 151 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("address is not IPNet: %+v", ifaddr) 152 } 153 if ipNet.IP.To4() == nil { 154 log.Infof("Skipping non-IPv4 address %s", ipNet.IP) 155 continue 156 } 157 ipAddrs = append(ipAddrs, ipNet) 158 } 159 if len(ipAddrs) != 1 { 160 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("we only handle a single IPv4 address, but interface %q has %d: %v", iface.Name, len(ipAddrs), ipAddrs) 161 } 162 prefix, _ := ipAddrs[0].Mask.Size() 163 addr := boot.IPWithPrefix{Address: ipAddrs[0].IP, PrefixLen: prefix} 164 165 // Collect data from the ARP table. 166 dump, err := netlink.NeighList(iface.Index, 0) 167 if err != nil { 168 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err) 169 } 170 171 var neighbors []boot.Neighbor 172 for _, n := range dump { 173 // There are only two "good" states NUD_PERMANENT and NUD_REACHABLE, 174 // but NUD_REACHABLE is fully dynamic and will be re-probed anyway. 175 if n.State == netlink.NUD_PERMANENT { 176 log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr) 177 // No flags are copied because Stack.AddStaticNeighbor does not support flags right now. 178 neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr}) 179 } 180 } 181 182 // Scrape routes. 
183 routes, defv4, defv6, err := routesForIface(iface) 184 if err != nil { 185 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) 186 } 187 if defv4 != nil { 188 if !args.Defaultv4Gateway.Route.Empty() { 189 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) 190 } 191 args.Defaultv4Gateway.Route = *defv4 192 args.Defaultv4Gateway.Name = iface.Name 193 } 194 195 if defv6 != nil { 196 if !args.Defaultv6Gateway.Route.Empty() { 197 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) 198 } 199 args.Defaultv6Gateway.Route = *defv6 200 args.Defaultv6Gateway.Name = iface.Name 201 } 202 203 // Get the link address of the interface. 204 ifaceLink, err := netlink.LinkByName(iface.Name) 205 if err != nil { 206 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting link for interface %q: %w", iface.Name, err) 207 } 208 linkAddress := ifaceLink.Attrs().HardwareAddr 209 210 xdplink := boot.XDPLink{ 211 Name: iface.Name, 212 InterfaceIndex: iface.Index, 213 Routes: routes, 214 TXChecksumOffload: conf.TXChecksumOffload, 215 RXChecksumOffload: conf.RXChecksumOffload, 216 NumChannels: conf.NumNetworkChannels, 217 QDisc: conf.QDisc, 218 Neighbors: neighbors, 219 LinkAddress: linkAddress, 220 Addresses: []boot.IPWithPrefix{addr}, 221 GVisorGRO: conf.GVisorGRO, 222 Bind: bind, 223 } 224 args.XDPLinks = append(args.XDPLinks, xdplink) 225 netIface = iface 226 } 227 228 if len(args.XDPLinks) != 1 { 229 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("expected 1 XDP link, but found %d", len(args.XDPLinks)) 230 } 231 return args, netIface, nil 232 } 233 234 func createSocketXDP(iface net.Interface) 
([]*os.File, error) { 235 // Create an XDP socket. The sentry will mmap memory for the various 236 // rings and bind to the device. 237 fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) 238 if err != nil { 239 return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err) 240 } 241 242 // We also need to, before dropping privileges, attach a program to the 243 // device and insert our socket into its map. 244 245 // Load into the kernel. 246 spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram)) 247 if err != nil { 248 return nil, fmt.Errorf("failed to load spec: %v", err) 249 } 250 251 var objects struct { 252 Program *ebpf.Program `ebpf:"xdp_prog"` 253 SockMap *ebpf.Map `ebpf:"sock_map"` 254 } 255 if err := spec.LoadAndAssign(&objects, nil); err != nil { 256 return nil, fmt.Errorf("failed to load program: %v", err) 257 } 258 259 rawLink, err := link.AttachRawLink(link.RawLinkOptions{ 260 Program: objects.Program, 261 Attach: ebpf.AttachXDP, 262 Target: iface.Index, 263 // By not setting the Flag field, the kernel will choose the 264 // fastest mode. In order those are: 265 // - Offloaded onto the NIC. 266 // - Running directly in the driver. 267 // - Generic mode, which works with any NIC/driver but lacks 268 // much of the XDP performance boost. 269 }) 270 if err != nil { 271 return nil, fmt.Errorf("failed to attach BPF program: %v", err) 272 } 273 274 // Insert our AF_XDP socket into the BPF map that dictates where 275 // packets are redirected to. 276 // TODO(b/240191988): Updating of pinned maps should be sychronized and 277 // check for the existence of the key. 278 key := uint32(0) 279 val := uint32(fd) 280 if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { 281 return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err) 282 } 283 284 // We need to keep the Program, SockMap, and link FDs open until they 285 // can be passed to the sandbox process. 
286 progFD, err := unix.Dup(objects.Program.FD()) 287 if err != nil { 288 return nil, fmt.Errorf("failed to dup BPF program: %v", err) 289 } 290 sockMapFD, err := unix.Dup(objects.SockMap.FD()) 291 if err != nil { 292 return nil, fmt.Errorf("failed to dup BPF map: %v", err) 293 } 294 linkFD, err := unix.Dup(rawLink.FD()) 295 if err != nil { 296 return nil, fmt.Errorf("failed to dup BPF link: %v", err) 297 } 298 299 return []*os.File{ 300 os.NewFile(uintptr(fd), "xdp-fd"), // The socket. 301 os.NewFile(uintptr(progFD), "program-fd"), // The XDP program. 302 os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map. 303 os.NewFile(uintptr(linkFD), "link-fd"), // The XDP link. 304 }, nil 305 } 306 307 // TODO(b/240191988): Merge redundant code with CreateLinksAndRoutes once 308 // features are finalized. 309 // TODO(b/240191988): Cleanup / GC of pinned BPF objects. 310 func createXDPTunnel(conn *urpc.Client, nsPath string, conf *config.Config) error { 311 // Get the setup for the sentry nic. We need the host neighbors and routes. 312 args, hostIface, err := prepareRedirectInterfaceArgs(boot.BindSentry, conf) 313 if err != nil { 314 return fmt.Errorf("failed to generate tunnel interface args: %w", err) 315 } 316 317 // Setup the XDP socket on the gVisor nic. 318 files, err := func() ([]*os.File, error) { 319 // Join the network namespace that we will be copying. 320 restore, err := joinNetNS(nsPath) 321 if err != nil { 322 return nil, err 323 } 324 defer restore() 325 326 // Create an XDP socket. The sentry will mmap memory for the various 327 // rings and bind to the device. 328 fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) 329 if err != nil { 330 return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err) 331 } 332 333 // We also need to, before dropping privileges, attach a program to the 334 // device and insert our socket into its map. 335 336 // Load into the kernel. 
337 spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram)) 338 if err != nil { 339 return nil, fmt.Errorf("failed to load spec: %v", err) 340 } 341 342 var objects struct { 343 Program *ebpf.Program `ebpf:"xdp_prog"` 344 SockMap *ebpf.Map `ebpf:"sock_map"` 345 } 346 if err := spec.LoadAndAssign(&objects, nil); err != nil { 347 return nil, fmt.Errorf("failed to load program: %v", err) 348 } 349 350 // We assume there are two interfaces in the netns: a loopback and veth. 351 ifaces, err := net.Interfaces() 352 if err != nil { 353 return nil, fmt.Errorf("querying interfaces in ns: %w", err) 354 } 355 356 var iface *net.Interface 357 for _, netIface := range ifaces { 358 if netIface.Flags&net.FlagLoopback == 0 { 359 iface = &netIface 360 break 361 } 362 } 363 if iface == nil { 364 return nil, fmt.Errorf("unable to find non-loopback interface in the ns") 365 } 366 args.XDPLinks[0].InterfaceIndex = iface.Index 367 368 rawLink, err := link.AttachRawLink(link.RawLinkOptions{ 369 Program: objects.Program, 370 Attach: ebpf.AttachXDP, 371 Target: iface.Index, 372 // By not setting the Flag field, the kernel will choose the 373 // fastest mode. In order those are: 374 // - Offloaded onto the NIC. 375 // - Running directly in the driver. 376 // - Generic mode, which works with any NIC/driver but lacks 377 // much of the XDP performance boost. 378 }) 379 if err != nil { 380 return nil, fmt.Errorf("failed to attach BPF program to interface %q: %v", iface.Name, err) 381 } 382 383 // Insert our AF_XDP socket into the BPF map that dictates where 384 // packets are redirected to. 385 // TODO(b/240191988): Updating of pinned maps should be 386 // sychronized and check for the existence of the key. 
387 key := uint32(0) 388 val := uint32(fd) 389 if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { 390 return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err) 391 } 392 393 // We need to keep the Program, SockMap, and link FDs open until they 394 // can be passed to the sandbox process. 395 progFD, err := unix.Dup(objects.Program.FD()) 396 if err != nil { 397 return nil, fmt.Errorf("failed to dup BPF program: %v", err) 398 } 399 sockMapFD, err := unix.Dup(objects.SockMap.FD()) 400 if err != nil { 401 return nil, fmt.Errorf("failed to dup BPF map: %v", err) 402 } 403 linkFD, err := unix.Dup(rawLink.FD()) 404 if err != nil { 405 return nil, fmt.Errorf("failed to dup BPF link: %v", err) 406 } 407 408 return []*os.File{ 409 os.NewFile(uintptr(fd), "xdp-fd"), // The socket. 410 os.NewFile(uintptr(progFD), "program-fd"), // The XDP program. 411 os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map. 412 os.NewFile(uintptr(linkFD), "link-fd"), // The XDP link. 413 }, nil 414 }() 415 if err != nil { 416 return fmt.Errorf("failed to create AF_XDP socket for container: %w", err) 417 } 418 args.FilePayload.Files = append(args.FilePayload.Files, files...) 419 420 // We're back in the parent netns. Get all interfaces. 421 ifaces, err := net.Interfaces() 422 if err != nil { 423 return fmt.Errorf("querying interfaces: %w", err) 424 } 425 426 // TODO(b/240191988): Find a better way to identify the other end of the veth. 427 var vethIface *net.Interface 428 for _, iface := range ifaces { 429 if strings.HasPrefix(iface.Name, "veth") { 430 vethIface = &iface 431 break 432 } 433 } 434 if vethIface == nil { 435 return fmt.Errorf("unable to find veth interface") 436 } 437 438 // Insert veth into host eBPF map. 
439 hostMapPath := xdpcmd.TunnelHostMapPath(hostIface.Name) 440 pinnedHostMap, err := ebpf.LoadPinnedMap(hostMapPath, nil) 441 if err != nil { 442 return fmt.Errorf("failed to load pinned host map %s: %w", hostMapPath, err) 443 } 444 // TODO(b/240191988): Updating of pinned maps should be sychronized and 445 // check for the existence of the key. 446 mapKey := uint32(0) 447 mapVal := uint32(vethIface.Index) 448 if err := pinnedHostMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { 449 return fmt.Errorf("failed to insert veth into host map %s: %w", hostMapPath, err) 450 } 451 452 // Attach a program to the veth. 453 spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.TunnelVethProgram)) 454 if err != nil { 455 return fmt.Errorf("failed to load spec: %v", err) 456 } 457 458 var objects struct { 459 Program *ebpf.Program `ebpf:"xdp_veth_prog"` 460 DevMap *ebpf.Map `ebpf:"dev_map"` 461 } 462 if err := spec.LoadAndAssign(&objects, nil); err != nil { 463 return fmt.Errorf("failed to load program: %v", err) 464 } 465 defer func() { 466 if err := objects.Program.Close(); err != nil { 467 log.Infof("failed to close program: %v", err) 468 } 469 if err := objects.DevMap.Close(); err != nil { 470 log.Infof("failed to close sock map: %v", err) 471 } 472 }() 473 474 attached, err := link.AttachXDP(link.XDPOptions{ 475 Program: objects.Program, 476 Interface: vethIface.Index, 477 // By not setting the Flag field, the kernel will choose the 478 // fastest mode. In order those are: 479 // - Offloaded onto the NIC. 480 // - Running directly in the driver. 481 // - Generic mode, which works with any NIC/driver but lacks 482 // much of the XDP performance boost. 
483 }) 484 if err != nil { 485 return fmt.Errorf("failed to attach: %w", err) 486 } 487 488 var ( 489 vethPinDir = xdpcmd.RedirectPinDir(vethIface.Name) 490 vethMapPath = xdpcmd.TunnelVethMapPath(vethIface.Name) 491 vethProgramPath = xdpcmd.TunnelVethProgramPath(vethIface.Name) 492 vethLinkPath = xdpcmd.TunnelVethLinkPath(vethIface.Name) 493 ) 494 495 // Create directory /sys/fs/bpf/<device name>/. 496 if err := os.Mkdir(vethPinDir, 0700); err != nil && !os.IsExist(err) { 497 return fmt.Errorf("failed to create directory for pinning at %s: %v", vethPinDir, err) 498 } 499 500 // Pin the map at /sys/fs/bpf/<device name>/tunnel_host_map. 501 if err := objects.DevMap.Pin(vethMapPath); err != nil { 502 return fmt.Errorf("failed to pin map at %s", vethMapPath) 503 } 504 log.Infof("Pinned map at %s", vethMapPath) 505 506 // Pin the program at /sys/fs/bpf/<device name>/tunnel_host_program. 507 if err := objects.Program.Pin(vethProgramPath); err != nil { 508 return fmt.Errorf("failed to pin program at %s", vethProgramPath) 509 } 510 log.Infof("Pinned program at %s", vethProgramPath) 511 512 // Make everything persistent by pinning the link. Otherwise, the XDP 513 // program would detach when this process exits. 514 if err := attached.Pin(vethLinkPath); err != nil { 515 return fmt.Errorf("failed to pin link at %s", vethLinkPath) 516 } 517 log.Infof("Pinned link at %s", vethLinkPath) 518 519 // Insert host into veth eBPF map. 520 // TODO(b/240191988): We should be able to use the existing map instead 521 // of opening a pinned copy. 522 pinnedVethMap, err := ebpf.LoadPinnedMap(vethMapPath, nil) 523 if err != nil { 524 return fmt.Errorf("failed to load pinned veth map %s: %w", vethMapPath, err) 525 } 526 // TODO(b/240191988): Updating of pinned maps should be sychronized and 527 // check for the existence of the key. 
528 mapKey = uint32(0) 529 mapVal = uint32(hostIface.Index) 530 if err := pinnedVethMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { 531 return fmt.Errorf("failed to insert host into veth map %s: %w", vethMapPath, err) 532 } 533 534 if err := pcapAndNAT(&args, conf); err != nil { 535 return err 536 } 537 538 log.Debugf("Setting up network, config: %+v", args) 539 if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { 540 return fmt.Errorf("creating links and routes: %w", err) 541 } 542 return nil 543 }