github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/sandbox/xdp.go (about) 1 // Copyright 2023 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package sandbox 16 17 import ( 18 "bytes" 19 "fmt" 20 "net" 21 "os" 22 "strings" 23 24 "github.com/cilium/ebpf" 25 "github.com/cilium/ebpf/link" 26 "github.com/vishvananda/netlink" 27 "golang.org/x/sys/unix" 28 "github.com/metacubex/gvisor/pkg/log" 29 "github.com/metacubex/gvisor/pkg/urpc" 30 "github.com/metacubex/gvisor/pkg/xdp" 31 "github.com/metacubex/gvisor/runsc/boot" 32 "github.com/metacubex/gvisor/runsc/config" 33 "github.com/metacubex/gvisor/runsc/sandbox/bpf" 34 xdpcmd "github.com/metacubex/gvisor/tools/xdp/cmd" 35 ) 36 37 // createRedirectInterfacesAndRoutes initializes the network using an AF_XDP 38 // socket on a *host* device, not a device in the container netns. It: 39 // 40 // - scrapes the address, interface, and routes of the device and recreates 41 // them in the sandbox 42 // - does *not* remove them from the host device 43 // - creates an AF_XDP socket bound to the device 44 // 45 // In effect, this takes over the host device for the duration of the sentry's 46 // lifetime. This also means only one container can run at a time, as it 47 // monopolizes the device. 48 // 49 // TODO(b/240191988): Enbable device sharing via XDP_SHARED_UMEM. 50 // TODO(b/240191988): IPv6 support. 51 // TODO(b/240191988): Merge redundant code with CreateLinksAndRoutes once 52 // features are finalized. 53 func createRedirectInterfacesAndRoutes(conn *urpc.Client, conf *config.Config) error { 54 args, iface, err := prepareRedirectInterfaceArgs(boot.BindRunsc, conf) 55 if err != nil { 56 return fmt.Errorf("failed to generate redirect interface args: %w", err) 57 } 58 59 // Create an XDP socket. The sentry will mmap the rings. 60 xdpSockFD, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) 61 if err != nil { 62 return fmt.Errorf("unable to create AF_XDP socket: %w", err) 63 } 64 xdpSock := os.NewFile(uintptr(xdpSockFD), "xdp-sock-fd") 65 66 // Dup to ensure os.File doesn't close it prematurely. 67 if _, err := unix.Dup(xdpSockFD); err != nil { 68 return fmt.Errorf("failed to dup XDP sock: %w", err) 69 } 70 args.FilePayload.Files = append(args.FilePayload.Files, xdpSock) 71 72 if err := pcapAndNAT(&args, conf); err != nil { 73 return err 74 } 75 76 log.Infof("Setting up network, config: %+v", args) 77 if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { 78 return fmt.Errorf("creating links and routes: %w", err) 79 } 80 81 // Insert socket into eBPF map. Note that sockets are automatically 82 // removed from eBPF maps when released. See net/xdp/xsk.c:xsk_release 83 // and net/xdp/xsk.c:xsk_delete_from_maps. 84 mapPath := xdpcmd.RedirectMapPath(iface.Name) 85 pinnedMap, err := ebpf.LoadPinnedMap(mapPath, nil) 86 if err != nil { 87 return fmt.Errorf("failed to load pinned map %s: %w", mapPath, err) 88 } 89 // TODO(b/240191988): Updating of pinned maps should be sychronized and 90 // check for the existence of the key. 91 mapKey := uint32(0) 92 mapVal := uint32(xdpSockFD) 93 if err := pinnedMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { 94 return fmt.Errorf("failed to insert socket into map %s: %w", mapPath, err) 95 } 96 97 // Bind to the device. 98 // TODO(b/240191988): We can't assume there's only one queue, but this 99 // appears to be the case on gVNIC instances. 100 if err := xdp.Bind(xdpSockFD, uint32(iface.Index), 0 /* queueID */, conf.AFXDPUseNeedWakeup); err != nil { 101 return fmt.Errorf("failed to bind to interface %q: %v", iface.Name, err) 102 } 103 104 return nil 105 } 106 107 // Collect addresses, routes, and neighbors from the interfaces. We only 108 // process two interfaces: the loopback and the interface we've been told to 109 // bind to. This all takes place in the netns where the runsc binary is run, 110 // *not* the netns passed to the container. 111 func prepareRedirectInterfaceArgs(bind boot.BindOpt, conf *config.Config) (boot.CreateLinksAndRoutesArgs, net.Interface, error) { 112 ifaces, err := net.Interfaces() 113 if err != nil { 114 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("querying interfaces: %w", err) 115 } 116 117 var args boot.CreateLinksAndRoutesArgs 118 var netIface net.Interface 119 for _, iface := range ifaces { 120 if iface.Flags&net.FlagUp == 0 { 121 log.Infof("Skipping down interface: %+v", iface) 122 continue 123 } 124 125 allAddrs, err := iface.Addrs() 126 if err != nil { 127 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) 128 } 129 130 // We build our own loopback device. 131 if iface.Flags&net.FlagLoopback != 0 { 132 link, err := loopbackLink(conf, iface, allAddrs) 133 if err != nil { 134 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) 135 } 136 args.LoopbackLinks = append(args.LoopbackLinks, link) 137 continue 138 } 139 140 if iface.Name != conf.XDP.IfaceName { 141 log.Infof("Skipping interface %q", iface.Name) 142 continue 143 } 144 145 var ipAddrs []*net.IPNet 146 for _, ifaddr := range allAddrs { 147 ipNet, ok := ifaddr.(*net.IPNet) 148 if !ok { 149 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("address is not IPNet: %+v", ifaddr) 150 } 151 if ipNet.IP.To4() == nil { 152 log.Infof("Skipping non-IPv4 address %s", ipNet.IP) 153 continue 154 } 155 ipAddrs = append(ipAddrs, ipNet) 156 } 157 if len(ipAddrs) != 1 { 158 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("we only handle a single IPv4 address, but interface %q has %d: %v", iface.Name, len(ipAddrs), ipAddrs) 159 } 160 prefix, _ := ipAddrs[0].Mask.Size() 161 addr := boot.IPWithPrefix{Address: ipAddrs[0].IP, PrefixLen: prefix} 162 163 // Collect data from the ARP table. 164 dump, err := netlink.NeighList(iface.Index, 0) 165 if err != nil { 166 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err) 167 } 168 169 var neighbors []boot.Neighbor 170 for _, n := range dump { 171 // There are only two "good" states NUD_PERMANENT and NUD_REACHABLE, 172 // but NUD_REACHABLE is fully dynamic and will be re-probed anyway. 173 if n.State == netlink.NUD_PERMANENT { 174 log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr) 175 // No flags are copied because Stack.AddStaticNeighbor does not support flags right now. 176 neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr}) 177 } 178 } 179 180 // Scrape routes. 181 routes, defv4, defv6, err := routesForIface(iface) 182 if err != nil { 183 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) 184 } 185 if defv4 != nil { 186 if !args.Defaultv4Gateway.Route.Empty() { 187 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) 188 } 189 args.Defaultv4Gateway.Route = *defv4 190 args.Defaultv4Gateway.Name = iface.Name 191 } 192 193 if defv6 != nil { 194 if !args.Defaultv6Gateway.Route.Empty() { 195 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) 196 } 197 args.Defaultv6Gateway.Route = *defv6 198 args.Defaultv6Gateway.Name = iface.Name 199 } 200 201 // Get the link address of the interface. 202 ifaceLink, err := netlink.LinkByName(iface.Name) 203 if err != nil { 204 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting link for interface %q: %w", iface.Name, err) 205 } 206 linkAddress := ifaceLink.Attrs().HardwareAddr 207 208 xdplink := boot.XDPLink{ 209 Name: iface.Name, 210 InterfaceIndex: iface.Index, 211 Routes: routes, 212 TXChecksumOffload: conf.TXChecksumOffload, 213 RXChecksumOffload: conf.RXChecksumOffload, 214 NumChannels: conf.NumNetworkChannels, 215 QDisc: conf.QDisc, 216 Neighbors: neighbors, 217 LinkAddress: linkAddress, 218 Addresses: []boot.IPWithPrefix{addr}, 219 GvisorGROTimeout: conf.GvisorGROTimeout, 220 Bind: bind, 221 } 222 args.XDPLinks = append(args.XDPLinks, xdplink) 223 netIface = iface 224 } 225 226 if len(args.XDPLinks) != 1 { 227 return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("expected 1 XDP link, but found %d", len(args.XDPLinks)) 228 } 229 return args, netIface, nil 230 } 231 232 func createSocketXDP(iface net.Interface) ([]*os.File, error) { 233 // Create an XDP socket. The sentry will mmap memory for the various 234 // rings and bind to the device. 235 fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) 236 if err != nil { 237 return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err) 238 } 239 240 // We also need to, before dropping privileges, attach a program to the 241 // device and insert our socket into its map. 242 243 // Load into the kernel. 244 spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram)) 245 if err != nil { 246 return nil, fmt.Errorf("failed to load spec: %v", err) 247 } 248 249 var objects struct { 250 Program *ebpf.Program `ebpf:"xdp_prog"` 251 SockMap *ebpf.Map `ebpf:"sock_map"` 252 } 253 if err := spec.LoadAndAssign(&objects, nil); err != nil { 254 return nil, fmt.Errorf("failed to load program: %v", err) 255 } 256 257 rawLink, err := link.AttachRawLink(link.RawLinkOptions{ 258 Program: objects.Program, 259 Attach: ebpf.AttachXDP, 260 Target: iface.Index, 261 // By not setting the Flag field, the kernel will choose the 262 // fastest mode. In order those are: 263 // - Offloaded onto the NIC. 264 // - Running directly in the driver. 265 // - Generic mode, which works with any NIC/driver but lacks 266 // much of the XDP performance boost. 267 }) 268 if err != nil { 269 return nil, fmt.Errorf("failed to attach BPF program: %v", err) 270 } 271 272 // Insert our AF_XDP socket into the BPF map that dictates where 273 // packets are redirected to. 274 // TODO(b/240191988): Updating of pinned maps should be sychronized and 275 // check for the existence of the key. 276 key := uint32(0) 277 val := uint32(fd) 278 if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { 279 return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err) 280 } 281 282 // We need to keep the Program, SockMap, and link FDs open until they 283 // can be passed to the sandbox process. 284 progFD, err := unix.Dup(objects.Program.FD()) 285 if err != nil { 286 return nil, fmt.Errorf("failed to dup BPF program: %v", err) 287 } 288 sockMapFD, err := unix.Dup(objects.SockMap.FD()) 289 if err != nil { 290 return nil, fmt.Errorf("failed to dup BPF map: %v", err) 291 } 292 linkFD, err := unix.Dup(rawLink.FD()) 293 if err != nil { 294 return nil, fmt.Errorf("failed to dup BPF link: %v", err) 295 } 296 297 return []*os.File{ 298 os.NewFile(uintptr(fd), "xdp-fd"), // The socket. 299 os.NewFile(uintptr(progFD), "program-fd"), // The XDP program. 300 os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map. 301 os.NewFile(uintptr(linkFD), "link-fd"), // The XDP link. 302 }, nil 303 } 304 305 // TODO(b/240191988): Merge redundant code with CreateLinksAndRoutes once 306 // features are finalized. 307 // TODO(b/240191988): Cleanup / GC of pinned BPF objects. 308 func createXDPTunnel(conn *urpc.Client, nsPath string, conf *config.Config) error { 309 // Get the setup for the sentry nic. We need the host neighbors and routes. 310 args, hostIface, err := prepareRedirectInterfaceArgs(boot.BindSentry, conf) 311 if err != nil { 312 return fmt.Errorf("failed to generate tunnel interface args: %w", err) 313 } 314 315 // Setup the XDP socket on the gVisor nic. 316 files, err := func() ([]*os.File, error) { 317 // Join the network namespace that we will be copying. 318 restore, err := joinNetNS(nsPath) 319 if err != nil { 320 return nil, err 321 } 322 defer restore() 323 324 // Create an XDP socket. The sentry will mmap memory for the various 325 // rings and bind to the device. 326 fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) 327 if err != nil { 328 return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err) 329 } 330 331 // We also need to, before dropping privileges, attach a program to the 332 // device and insert our socket into its map. 333 334 // Load into the kernel. 335 spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram)) 336 if err != nil { 337 return nil, fmt.Errorf("failed to load spec: %v", err) 338 } 339 340 var objects struct { 341 Program *ebpf.Program `ebpf:"xdp_prog"` 342 SockMap *ebpf.Map `ebpf:"sock_map"` 343 } 344 if err := spec.LoadAndAssign(&objects, nil); err != nil { 345 return nil, fmt.Errorf("failed to load program: %v", err) 346 } 347 348 // We assume there are two interfaces in the netns: a loopback and veth. 349 ifaces, err := net.Interfaces() 350 if err != nil { 351 return nil, fmt.Errorf("querying interfaces in ns: %w", err) 352 } 353 354 var iface *net.Interface 355 for _, netIface := range ifaces { 356 if netIface.Flags&net.FlagLoopback == 0 { 357 iface = &netIface 358 break 359 } 360 } 361 if iface == nil { 362 return nil, fmt.Errorf("unable to find non-loopback interface in the ns") 363 } 364 args.XDPLinks[0].InterfaceIndex = iface.Index 365 366 rawLink, err := link.AttachRawLink(link.RawLinkOptions{ 367 Program: objects.Program, 368 Attach: ebpf.AttachXDP, 369 Target: iface.Index, 370 // By not setting the Flag field, the kernel will choose the 371 // fastest mode. In order those are: 372 // - Offloaded onto the NIC. 373 // - Running directly in the driver. 374 // - Generic mode, which works with any NIC/driver but lacks 375 // much of the XDP performance boost. 376 }) 377 if err != nil { 378 return nil, fmt.Errorf("failed to attach BPF program to interface %q: %v", iface.Name, err) 379 } 380 381 // Insert our AF_XDP socket into the BPF map that dictates where 382 // packets are redirected to. 383 // TODO(b/240191988): Updating of pinned maps should be 384 // sychronized and check for the existence of the key. 385 key := uint32(0) 386 val := uint32(fd) 387 if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { 388 return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err) 389 } 390 391 // We need to keep the Program, SockMap, and link FDs open until they 392 // can be passed to the sandbox process. 393 progFD, err := unix.Dup(objects.Program.FD()) 394 if err != nil { 395 return nil, fmt.Errorf("failed to dup BPF program: %v", err) 396 } 397 sockMapFD, err := unix.Dup(objects.SockMap.FD()) 398 if err != nil { 399 return nil, fmt.Errorf("failed to dup BPF map: %v", err) 400 } 401 linkFD, err := unix.Dup(rawLink.FD()) 402 if err != nil { 403 return nil, fmt.Errorf("failed to dup BPF link: %v", err) 404 } 405 406 return []*os.File{ 407 os.NewFile(uintptr(fd), "xdp-fd"), // The socket. 408 os.NewFile(uintptr(progFD), "program-fd"), // The XDP program. 409 os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map. 410 os.NewFile(uintptr(linkFD), "link-fd"), // The XDP link. 411 }, nil 412 }() 413 if err != nil { 414 return fmt.Errorf("failed to create AF_XDP socket for container: %w", err) 415 } 416 args.FilePayload.Files = append(args.FilePayload.Files, files...) 417 418 // We're back in the parent netns. Get all interfaces. 419 ifaces, err := net.Interfaces() 420 if err != nil { 421 return fmt.Errorf("querying interfaces: %w", err) 422 } 423 424 // TODO(b/240191988): Find a better way to identify the other end of the veth. 425 var vethIface *net.Interface 426 for _, iface := range ifaces { 427 if strings.HasPrefix(iface.Name, "veth") { 428 vethIface = &iface 429 break 430 } 431 } 432 if vethIface == nil { 433 return fmt.Errorf("unable to find veth interface") 434 } 435 436 // Insert veth into host eBPF map. 437 hostMapPath := xdpcmd.TunnelHostMapPath(hostIface.Name) 438 pinnedHostMap, err := ebpf.LoadPinnedMap(hostMapPath, nil) 439 if err != nil { 440 return fmt.Errorf("failed to load pinned host map %s: %w", hostMapPath, err) 441 } 442 // TODO(b/240191988): Updating of pinned maps should be sychronized and 443 // check for the existence of the key. 444 mapKey := uint32(0) 445 mapVal := uint32(vethIface.Index) 446 if err := pinnedHostMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { 447 return fmt.Errorf("failed to insert veth into host map %s: %w", hostMapPath, err) 448 } 449 450 // Attach a program to the veth. 451 spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.TunnelVethProgram)) 452 if err != nil { 453 return fmt.Errorf("failed to load spec: %v", err) 454 } 455 456 var objects struct { 457 Program *ebpf.Program `ebpf:"xdp_veth_prog"` 458 DevMap *ebpf.Map `ebpf:"dev_map"` 459 } 460 if err := spec.LoadAndAssign(&objects, nil); err != nil { 461 return fmt.Errorf("failed to load program: %v", err) 462 } 463 defer func() { 464 if err := objects.Program.Close(); err != nil { 465 log.Infof("failed to close program: %v", err) 466 } 467 if err := objects.DevMap.Close(); err != nil { 468 log.Infof("failed to close sock map: %v", err) 469 } 470 }() 471 472 attached, err := link.AttachXDP(link.XDPOptions{ 473 Program: objects.Program, 474 Interface: vethIface.Index, 475 // By not setting the Flag field, the kernel will choose the 476 // fastest mode. In order those are: 477 // - Offloaded onto the NIC. 478 // - Running directly in the driver. 479 // - Generic mode, which works with any NIC/driver but lacks 480 // much of the XDP performance boost. 481 }) 482 if err != nil { 483 return fmt.Errorf("failed to attach: %w", err) 484 } 485 486 var ( 487 vethPinDir = xdpcmd.RedirectPinDir(vethIface.Name) 488 vethMapPath = xdpcmd.TunnelVethMapPath(vethIface.Name) 489 vethProgramPath = xdpcmd.TunnelVethProgramPath(vethIface.Name) 490 vethLinkPath = xdpcmd.TunnelVethLinkPath(vethIface.Name) 491 ) 492 493 // Create directory /sys/fs/bpf/<device name>/. 494 if err := os.Mkdir(vethPinDir, 0700); err != nil && !os.IsExist(err) { 495 return fmt.Errorf("failed to create directory for pinning at %s: %v", vethPinDir, err) 496 } 497 498 // Pin the map at /sys/fs/bpf/<device name>/tunnel_host_map. 499 if err := objects.DevMap.Pin(vethMapPath); err != nil { 500 return fmt.Errorf("failed to pin map at %s", vethMapPath) 501 } 502 log.Infof("Pinned map at %s", vethMapPath) 503 504 // Pin the program at /sys/fs/bpf/<device name>/tunnel_host_program. 505 if err := objects.Program.Pin(vethProgramPath); err != nil { 506 return fmt.Errorf("failed to pin program at %s", vethProgramPath) 507 } 508 log.Infof("Pinned program at %s", vethProgramPath) 509 510 // Make everything persistent by pinning the link. Otherwise, the XDP 511 // program would detach when this process exits. 512 if err := attached.Pin(vethLinkPath); err != nil { 513 return fmt.Errorf("failed to pin link at %s", vethLinkPath) 514 } 515 log.Infof("Pinned link at %s", vethLinkPath) 516 517 // Insert host into veth eBPF map. 518 // TODO(b/240191988): We should be able to use the existing map instead 519 // of opening a pinned copy. 520 pinnedVethMap, err := ebpf.LoadPinnedMap(vethMapPath, nil) 521 if err != nil { 522 return fmt.Errorf("failed to load pinned veth map %s: %w", vethMapPath, err) 523 } 524 // TODO(b/240191988): Updating of pinned maps should be sychronized and 525 // check for the existence of the key. 526 mapKey = uint32(0) 527 mapVal = uint32(hostIface.Index) 528 if err := pinnedVethMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { 529 return fmt.Errorf("failed to insert host into veth map %s: %w", vethMapPath, err) 530 } 531 532 if err := pcapAndNAT(&args, conf); err != nil { 533 return err 534 } 535 536 log.Debugf("Setting up network, config: %+v", args) 537 if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { 538 return fmt.Errorf("creating links and routes: %w", err) 539 } 540 return nil 541 }