github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/sandbox/network.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package sandbox 16 17 import ( 18 "bytes" 19 "fmt" 20 "net" 21 "os" 22 "path/filepath" 23 "runtime" 24 "strconv" 25 26 "github.com/MerlinKodo/gvisor/pkg/log" 27 "github.com/MerlinKodo/gvisor/pkg/tcpip/header" 28 "github.com/MerlinKodo/gvisor/pkg/tcpip/stack" 29 "github.com/MerlinKodo/gvisor/pkg/urpc" 30 "github.com/MerlinKodo/gvisor/runsc/boot" 31 "github.com/MerlinKodo/gvisor/runsc/config" 32 "github.com/MerlinKodo/gvisor/runsc/sandbox/bpf" 33 "github.com/MerlinKodo/gvisor/runsc/specutils" 34 "github.com/cilium/ebpf" 35 "github.com/cilium/ebpf/link" 36 specs "github.com/opencontainers/runtime-spec/specs-go" 37 "github.com/vishvananda/netlink" 38 "golang.org/x/sys/unix" 39 ) 40 41 // setupNetwork configures the network stack to mimic the local network 42 // configuration. Docker uses network namespaces with vnets to configure the 43 // network for the container. The untrusted app expects to see the same network 44 // inside the sandbox. Routing and port mapping is handled directly by docker 45 // with most of network information not even available to the runtime. 46 // 47 // Netstack inside the sandbox speaks directly to the device using a raw socket. 48 // All IP addresses assigned to the NIC, are removed and passed on to netstack's 49 // device. 50 // 51 // If 'conf.Network' is NoNetwork, skips local configuration and creates a 52 // loopback interface only. 53 // 54 // Run the following container to test it: 55 // 56 // docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 57 func setupNetwork(conn *urpc.Client, pid int, conf *config.Config) error { 58 log.Infof("Setting up network") 59 60 switch conf.Network { 61 case config.NetworkNone: 62 log.Infof("Network is disabled, create loopback interface only") 63 if err := createDefaultLoopbackInterface(conf, conn); err != nil { 64 return fmt.Errorf("creating default loopback interface: %v", err) 65 } 66 case config.NetworkSandbox: 67 // Build the path to the net namespace of the sandbox process. 68 // This is what we will copy. 69 nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") 70 if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf); err != nil { 71 return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) 72 } 73 case config.NetworkHost: 74 // Nothing to do here. 75 default: 76 return fmt.Errorf("invalid network type: %v", conf.Network) 77 } 78 return nil 79 } 80 81 func createDefaultLoopbackInterface(conf *config.Config, conn *urpc.Client) error { 82 link := boot.DefaultLoopbackLink 83 link.GvisorGROTimeout = conf.GvisorGROTimeout 84 if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ 85 LoopbackLinks: []boot.LoopbackLink{link}, 86 }, nil); err != nil { 87 return fmt.Errorf("creating loopback link and routes: %v", err) 88 } 89 return nil 90 } 91 92 func joinNetNS(nsPath string) (func(), error) { 93 runtime.LockOSThread() 94 restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ 95 Type: specs.NetworkNamespace, 96 Path: nsPath, 97 }) 98 if err != nil { 99 runtime.UnlockOSThread() 100 return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err) 101 } 102 return func() { 103 restoreNS() 104 runtime.UnlockOSThread() 105 }, nil 106 } 107 108 // isRootNS determines whether we are running in the root net namespace. 109 // /proc/sys/net/core/rmem_default only exists in root network namespace. 110 func isRootNS() (bool, error) { 111 err := unix.Access("/proc/sys/net/core/rmem_default", unix.F_OK) 112 switch err { 113 case nil: 114 return true, nil 115 case unix.ENOENT: 116 return false, nil 117 default: 118 return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err) 119 } 120 } 121 122 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the 123 // net namespace with the given path, creates them in the sandbox, and removes 124 // them from the host. 125 func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *config.Config) error { 126 // Join the network namespace that we will be copying. 127 restore, err := joinNetNS(nsPath) 128 if err != nil { 129 return err 130 } 131 defer restore() 132 133 // Get all interfaces in the namespace. 134 ifaces, err := net.Interfaces() 135 if err != nil { 136 return fmt.Errorf("querying interfaces: %w", err) 137 } 138 139 isRoot, err := isRootNS() 140 if err != nil { 141 return err 142 } 143 if isRoot { 144 return fmt.Errorf("cannot run with network enabled in root network namespace") 145 } 146 147 // Collect addresses and routes from the interfaces. 148 var args boot.CreateLinksAndRoutesArgs 149 for _, iface := range ifaces { 150 if iface.Flags&net.FlagUp == 0 { 151 log.Infof("Skipping down interface: %+v", iface) 152 continue 153 } 154 155 allAddrs, err := iface.Addrs() 156 if err != nil { 157 return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) 158 } 159 160 // We build our own loopback device. 161 if iface.Flags&net.FlagLoopback != 0 { 162 link, err := loopbackLink(conf, iface, allAddrs) 163 if err != nil { 164 return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) 165 } 166 args.LoopbackLinks = append(args.LoopbackLinks, link) 167 continue 168 } 169 170 var ipAddrs []*net.IPNet 171 for _, ifaddr := range allAddrs { 172 ipNet, ok := ifaddr.(*net.IPNet) 173 if !ok { 174 return fmt.Errorf("address is not IPNet: %+v", ifaddr) 175 } 176 ipAddrs = append(ipAddrs, ipNet) 177 } 178 if len(ipAddrs) == 0 { 179 log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name) 180 continue 181 } 182 183 // Collect data from the ARP table. 184 dump, err := netlink.NeighList(iface.Index, 0) 185 if err != nil { 186 return fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err) 187 } 188 189 var neighbors []boot.Neighbor 190 for _, n := range dump { 191 // There are only two "good" states NUD_PERMANENT and NUD_REACHABLE, 192 // but NUD_REACHABLE is fully dynamic and will be re-probed anyway. 193 if n.State == netlink.NUD_PERMANENT { 194 log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr) 195 // No flags are copied because Stack.AddStaticNeighbor does not support flags right now. 196 neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr}) 197 } 198 } 199 200 // Scrape the routes before removing the address, since that 201 // will remove the routes as well. 202 routes, defv4, defv6, err := routesForIface(iface) 203 if err != nil { 204 return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) 205 } 206 if defv4 != nil { 207 if !args.Defaultv4Gateway.Route.Empty() { 208 return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) 209 } 210 args.Defaultv4Gateway.Route = *defv4 211 args.Defaultv4Gateway.Name = iface.Name 212 } 213 214 if defv6 != nil { 215 if !args.Defaultv6Gateway.Route.Empty() { 216 return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) 217 } 218 args.Defaultv6Gateway.Route = *defv6 219 args.Defaultv6Gateway.Name = iface.Name 220 } 221 222 // Get the link for the interface. 223 ifaceLink, err := netlink.LinkByName(iface.Name) 224 if err != nil { 225 return fmt.Errorf("getting link for interface %q: %w", iface.Name, err) 226 } 227 linkAddress := ifaceLink.Attrs().HardwareAddr 228 229 // Collect the addresses for the interface, enable forwarding, 230 // and remove them from the host. 231 var addresses []boot.IPWithPrefix 232 for _, addr := range ipAddrs { 233 prefix, _ := addr.Mask.Size() 234 addresses = append(addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix}) 235 236 // Steal IP address from NIC. 237 if err := removeAddress(ifaceLink, addr.String()); err != nil { 238 // If we encounter an error while deleting the ip, 239 // verify the ip is still present on the interface. 240 if present, err := isAddressOnInterface(iface.Name, addr); err != nil { 241 return fmt.Errorf("checking if address %v is on interface %q: %w", addr, iface.Name, err) 242 } else if !present { 243 continue 244 } 245 return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err) 246 } 247 } 248 249 if conf.AFXDP { 250 xdpSockFDs, err := createSocketXDP(iface) 251 if err != nil { 252 return fmt.Errorf("failed to create XDP socket: %v", err) 253 } 254 args.FilePayload.Files = append(args.FilePayload.Files, xdpSockFDs...) 255 args.XDPLinks = append(args.XDPLinks, boot.XDPLink{ 256 Name: iface.Name, 257 InterfaceIndex: iface.Index, 258 Routes: routes, 259 TXChecksumOffload: conf.TXChecksumOffload, 260 RXChecksumOffload: conf.RXChecksumOffload, 261 NumChannels: conf.NumNetworkChannels, 262 QDisc: conf.QDisc, 263 Neighbors: neighbors, 264 LinkAddress: linkAddress, 265 Addresses: addresses, 266 GvisorGROTimeout: conf.GvisorGROTimeout, 267 }) 268 } else { 269 link := boot.FDBasedLink{ 270 Name: iface.Name, 271 MTU: iface.MTU, 272 Routes: routes, 273 TXChecksumOffload: conf.TXChecksumOffload, 274 RXChecksumOffload: conf.RXChecksumOffload, 275 NumChannels: conf.NumNetworkChannels, 276 QDisc: conf.QDisc, 277 Neighbors: neighbors, 278 LinkAddress: linkAddress, 279 Addresses: addresses, 280 } 281 282 log.Debugf("Setting up network channels") 283 // Create the socket for the device. 284 for i := 0; i < link.NumChannels; i++ { 285 log.Debugf("Creating Channel %d", i) 286 socketEntry, err := createSocket(iface, ifaceLink, conf.HostGSO) 287 if err != nil { 288 return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err) 289 } 290 if i == 0 { 291 link.GSOMaxSize = socketEntry.gsoMaxSize 292 } else { 293 if link.GSOMaxSize != socketEntry.gsoMaxSize { 294 return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", 295 link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) 296 } 297 } 298 args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) 299 } 300 301 if link.GSOMaxSize == 0 && conf.GvisorGSO { 302 // Host GSO is disabled. Let's enable gVisor GSO. 303 link.GSOMaxSize = stack.GvisorGSOMaxSize 304 link.GvisorGSOEnabled = true 305 } 306 link.GvisorGROTimeout = conf.GvisorGROTimeout 307 308 args.FDBasedLinks = append(args.FDBasedLinks, link) 309 } 310 } 311 312 // Pass PCAP log file if present. 313 if conf.PCAP != "" { 314 args.PCAP = true 315 pcap, err := os.OpenFile(conf.PCAP, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0664) 316 if err != nil { 317 return fmt.Errorf("failed to open PCAP file %s: %v", conf.PCAP, err) 318 } 319 args.FilePayload.Files = append(args.FilePayload.Files, pcap) 320 } 321 322 log.Debugf("Setting up network, config: %+v", args) 323 if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { 324 return fmt.Errorf("creating links and routes: %w", err) 325 } 326 return nil 327 } 328 329 // isAddressOnInterface checks if an address is on an interface 330 func isAddressOnInterface(ifaceName string, addr *net.IPNet) (bool, error) { 331 iface, err := net.InterfaceByName(ifaceName) 332 if err != nil { 333 return false, fmt.Errorf("getting interface by name %q: %w", ifaceName, err) 334 } 335 ifaceAddrs, err := iface.Addrs() 336 if err != nil { 337 return false, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) 338 } 339 for _, ifaceAddr := range ifaceAddrs { 340 ipNet, ok := ifaceAddr.(*net.IPNet) 341 if !ok { 342 log.Warningf("Can't cast address to *net.IPNet, skipping: %+v", ifaceAddr) 343 continue 344 } 345 if ipNet.String() == addr.String() { 346 return true, nil 347 } 348 } 349 return false, nil 350 } 351 352 type socketEntry struct { 353 deviceFile *os.File 354 gsoMaxSize uint32 355 } 356 357 // createSocket creates an underlying AF_PACKET socket and configures it for 358 // use by the sentry and returns an *os.File that wraps the underlying socket 359 // fd. 360 func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) { 361 // Create the socket. 362 const protocol = 0x0300 // htons(ETH_P_ALL) 363 fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, 0) // pass protocol 0 to avoid slow bind() 364 if err != nil { 365 return nil, fmt.Errorf("unable to create raw socket: %v", err) 366 } 367 deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") 368 // Bind to the appropriate device. 369 ll := unix.SockaddrLinklayer{ 370 Protocol: protocol, 371 Ifindex: iface.Index, 372 } 373 if err := unix.Bind(fd, &ll); err != nil { 374 return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) 375 } 376 377 gsoMaxSize := uint32(0) 378 if enableGSO { 379 gso, err := isGSOEnabled(fd, iface.Name) 380 if err != nil { 381 return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) 382 } 383 if gso { 384 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { 385 return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) 386 } 387 gsoMaxSize = ifaceLink.Attrs().GSOMaxSize 388 } else { 389 log.Infof("GSO not available in host.") 390 } 391 } 392 393 // Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer 394 // for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max". 395 // wmem_max/rmem_max default to a unusually low value of 208KB. This is too 396 // low for gVisor to be able to receive packets at high throughputs without 397 // incurring packet drops. 398 const bufSize = 4 << 20 // 4MB. 399 400 if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, bufSize); err != nil { 401 _ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize) 402 sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF) 403 404 if sz < bufSize { 405 log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err) 406 } 407 } 408 409 if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, bufSize); err != nil { 410 _ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize) 411 sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF) 412 if sz < bufSize { 413 log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err) 414 } 415 } 416 417 return &socketEntry{deviceFile, gsoMaxSize}, nil 418 } 419 420 func createSocketXDP(iface net.Interface) ([]*os.File, error) { 421 // Create an XDP socket. The sentry will mmap memory for the various 422 // rings and bind to the device. 423 fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) 424 if err != nil { 425 return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err) 426 } 427 428 // We also need to, before dropping privileges, attach a program to the 429 // device and insert our socket into its map. 430 431 // Load into the kernel. 432 spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram)) 433 if err != nil { 434 return nil, fmt.Errorf("failed to load spec: %v", err) 435 } 436 437 var objects struct { 438 Program *ebpf.Program `ebpf:"xdp_prog"` 439 SockMap *ebpf.Map `ebpf:"sock_map"` 440 } 441 if err := spec.LoadAndAssign(&objects, nil); err != nil { 442 return nil, fmt.Errorf("failed to load program: %v", err) 443 } 444 445 rawLink, err := link.AttachRawLink(link.RawLinkOptions{ 446 Program: objects.Program, 447 Attach: ebpf.AttachXDP, 448 Target: iface.Index, 449 // By not setting the Flag field, the kernel will choose the 450 // fastest mode. In order those are: 451 // - Offloaded onto the NIC. 452 // - Running directly in the driver. 453 // - Generic mode, which works with any NIC/driver but lacks 454 // much of the XDP performance boost. 455 }) 456 if err != nil { 457 return nil, fmt.Errorf("failed to attach BPF program: %v", err) 458 } 459 460 // Insert our AF_XDP socket into the BPF map that dictates where 461 // packets are redirected to. 462 key := uint32(0) 463 val := uint32(fd) 464 if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { 465 return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err) 466 } 467 468 // We need to keep the Program, SockMap, and link FDs open until they 469 // can be passed to the sandbox process. 470 progFD, err := unix.Dup(objects.Program.FD()) 471 if err != nil { 472 return nil, fmt.Errorf("failed to dup BPF program: %v", err) 473 } 474 sockMapFD, err := unix.Dup(objects.SockMap.FD()) 475 if err != nil { 476 return nil, fmt.Errorf("failed to dup BPF map: %v", err) 477 } 478 linkFD, err := unix.Dup(rawLink.FD()) 479 if err != nil { 480 return nil, fmt.Errorf("failed to dup BPF link: %v", err) 481 } 482 483 return []*os.File{ 484 os.NewFile(uintptr(fd), "xdp-fd"), // The socket. 485 os.NewFile(uintptr(progFD), "program-fd"), // The XDP program. 486 os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map. 487 os.NewFile(uintptr(linkFD), "link-fd"), // The XDP link. 488 }, nil 489 } 490 491 // loopbackLink returns the link with addresses and routes for a loopback 492 // interface. 493 func loopbackLink(conf *config.Config, iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) { 494 link := boot.LoopbackLink{ 495 Name: iface.Name, 496 GvisorGROTimeout: conf.GvisorGROTimeout, 497 } 498 for _, addr := range addrs { 499 ipNet, ok := addr.(*net.IPNet) 500 if !ok { 501 return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr) 502 } 503 504 prefix, _ := ipNet.Mask.Size() 505 link.Addresses = append(link.Addresses, boot.IPWithPrefix{ 506 Address: ipNet.IP, 507 PrefixLen: prefix, 508 }) 509 510 dst := *ipNet 511 dst.IP = dst.IP.Mask(dst.Mask) 512 link.Routes = append(link.Routes, boot.Route{ 513 Destination: dst, 514 }) 515 } 516 return link, nil 517 } 518 519 // routesForIface iterates over all routes for the given interface and converts 520 // them to boot.Routes. It also returns the a default v4/v6 route if found. 521 func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) { 522 link, err := netlink.LinkByIndex(iface.Index) 523 if err != nil { 524 return nil, nil, nil, err 525 } 526 rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) 527 if err != nil { 528 return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) 529 } 530 531 var defv4, defv6 *boot.Route 532 var routes []boot.Route 533 for _, r := range rs { 534 // Is it a default route? 535 if r.Dst == nil { 536 if r.Gw == nil { 537 return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) 538 } 539 // Create a catch all route to the gateway. 540 switch len(r.Gw) { 541 case header.IPv4AddressSize: 542 if defv4 != nil { 543 return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r) 544 } 545 defv4 = &boot.Route{ 546 Destination: net.IPNet{ 547 IP: net.IPv4zero, 548 Mask: net.IPMask(net.IPv4zero), 549 }, 550 Gateway: r.Gw, 551 } 552 case header.IPv6AddressSize: 553 if defv6 != nil { 554 return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r) 555 } 556 557 defv6 = &boot.Route{ 558 Destination: net.IPNet{ 559 IP: net.IPv6zero, 560 Mask: net.IPMask(net.IPv6zero), 561 }, 562 Gateway: r.Gw, 563 } 564 default: 565 return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r) 566 } 567 continue 568 } 569 570 dst := *r.Dst 571 dst.IP = dst.IP.Mask(dst.Mask) 572 routes = append(routes, boot.Route{ 573 Destination: dst, 574 Gateway: r.Gw, 575 }) 576 } 577 return routes, defv4, defv6, nil 578 } 579 580 // removeAddress removes IP address from network device. It's equivalent to: 581 // 582 // ip addr del <ipAndMask> dev <name> 583 func removeAddress(source netlink.Link, ipAndMask string) error { 584 addr, err := netlink.ParseAddr(ipAndMask) 585 if err != nil { 586 return err 587 } 588 return netlink.AddrDel(source, addr) 589 }