github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/sandbox/network.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package sandbox
    16  
    17  import (
    18  	"fmt"
    19  	"net"
    20  	"os"
    21  	"path/filepath"
    22  	"runtime"
    23  	"strconv"
    24  
    25  	specs "github.com/opencontainers/runtime-spec/specs-go"
    26  	"github.com/vishvananda/netlink"
    27  	"golang.org/x/sys/unix"
    28  	"github.com/SagerNet/gvisor/pkg/log"
    29  	"github.com/SagerNet/gvisor/pkg/tcpip/header"
    30  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    31  	"github.com/SagerNet/gvisor/pkg/urpc"
    32  	"github.com/SagerNet/gvisor/runsc/boot"
    33  	"github.com/SagerNet/gvisor/runsc/config"
    34  	"github.com/SagerNet/gvisor/runsc/specutils"
    35  )
    36  
    37  // setupNetwork configures the network stack to mimic the local network
    38  // configuration. Docker uses network namespaces with vnets to configure the
    39  // network for the container. The untrusted app expects to see the same network
    40  // inside the sandbox. Routing and port mapping is handled directly by docker
    41  // with most of network information not even available to the runtime.
    42  //
    43  // Netstack inside the sandbox speaks directly to the device using a raw socket.
    44  // All IP addresses assigned to the NIC, are removed and passed on to netstack's
    45  // device.
    46  //
    47  // If 'conf.Network' is NoNetwork, skips local configuration and creates a
    48  // loopback interface only.
    49  //
    50  // Run the following container to test it:
    51  //  docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
    52  func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error {
    53  	log.Infof("Setting up network")
    54  
    55  	switch conf.Network {
    56  	case config.NetworkNone:
    57  		log.Infof("Network is disabled, create loopback interface only")
    58  		if err := createDefaultLoopbackInterface(conn); err != nil {
    59  			return fmt.Errorf("creating default loopback interface: %v", err)
    60  		}
    61  	case config.NetworkSandbox:
    62  		// Build the path to the net namespace of the sandbox process.
    63  		// This is what we will copy.
    64  		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
    65  		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil {
    66  			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
    67  		}
    68  	case config.NetworkHost:
    69  		// Nothing to do here.
    70  	default:
    71  		return fmt.Errorf("invalid network type: %v", conf.Network)
    72  	}
    73  	return nil
    74  }
    75  
    76  func createDefaultLoopbackInterface(conn *urpc.Client) error {
    77  	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
    78  		LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink},
    79  	}, nil); err != nil {
    80  		return fmt.Errorf("creating loopback link and routes: %v", err)
    81  	}
    82  	return nil
    83  }
    84  
    85  func joinNetNS(nsPath string) (func(), error) {
    86  	runtime.LockOSThread()
    87  	restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{
    88  		Type: specs.NetworkNamespace,
    89  		Path: nsPath,
    90  	})
    91  	if err != nil {
    92  		runtime.UnlockOSThread()
    93  		return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err)
    94  	}
    95  	return func() {
    96  		restoreNS()
    97  		runtime.UnlockOSThread()
    98  	}, nil
    99  }
   100  
   101  // isRootNS determines whether we are running in the root net namespace.
   102  // /proc/sys/net/core/rmem_default only exists in root network namespace.
   103  func isRootNS() (bool, error) {
   104  	err := unix.Access("/proc/sys/net/core/rmem_default", unix.F_OK)
   105  	switch err {
   106  	case nil:
   107  		return true, nil
   108  	case unix.ENOENT:
   109  		return false, nil
   110  	default:
   111  		return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err)
   112  	}
   113  }
   114  
   115  // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
   116  // net namespace with the given path, creates them in the sandbox, and removes
   117  // them from the host.
   118  func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc config.QueueingDiscipline) error {
   119  	// Join the network namespace that we will be copying.
   120  	restore, err := joinNetNS(nsPath)
   121  	if err != nil {
   122  		return err
   123  	}
   124  	defer restore()
   125  
   126  	// Get all interfaces in the namespace.
   127  	ifaces, err := net.Interfaces()
   128  	if err != nil {
   129  		return fmt.Errorf("querying interfaces: %w", err)
   130  	}
   131  
   132  	isRoot, err := isRootNS()
   133  	if err != nil {
   134  		return err
   135  	}
   136  	if isRoot {
   137  		return fmt.Errorf("cannot run with network enabled in root network namespace")
   138  	}
   139  
   140  	// Collect addresses and routes from the interfaces.
   141  	var args boot.CreateLinksAndRoutesArgs
   142  	for _, iface := range ifaces {
   143  		if iface.Flags&net.FlagUp == 0 {
   144  			log.Infof("Skipping down interface: %+v", iface)
   145  			continue
   146  		}
   147  
   148  		allAddrs, err := iface.Addrs()
   149  		if err != nil {
   150  			return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
   151  		}
   152  
   153  		// We build our own loopback device.
   154  		if iface.Flags&net.FlagLoopback != 0 {
   155  			link, err := loopbackLink(iface, allAddrs)
   156  			if err != nil {
   157  				return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err)
   158  			}
   159  			args.LoopbackLinks = append(args.LoopbackLinks, link)
   160  			continue
   161  		}
   162  
   163  		var ipAddrs []*net.IPNet
   164  		for _, ifaddr := range allAddrs {
   165  			ipNet, ok := ifaddr.(*net.IPNet)
   166  			if !ok {
   167  				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
   168  			}
   169  			ipAddrs = append(ipAddrs, ipNet)
   170  		}
   171  		if len(ipAddrs) == 0 {
   172  			log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
   173  			continue
   174  		}
   175  
   176  		// Scrape the routes before removing the address, since that
   177  		// will remove the routes as well.
   178  		routes, defv4, defv6, err := routesForIface(iface)
   179  		if err != nil {
   180  			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
   181  		}
   182  		if defv4 != nil {
   183  			if !args.Defaultv4Gateway.Route.Empty() {
   184  				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
   185  			}
   186  			args.Defaultv4Gateway.Route = *defv4
   187  			args.Defaultv4Gateway.Name = iface.Name
   188  		}
   189  
   190  		if defv6 != nil {
   191  			if !args.Defaultv6Gateway.Route.Empty() {
   192  				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
   193  			}
   194  			args.Defaultv6Gateway.Route = *defv6
   195  			args.Defaultv6Gateway.Name = iface.Name
   196  		}
   197  
   198  		link := boot.FDBasedLink{
   199  			Name:              iface.Name,
   200  			MTU:               iface.MTU,
   201  			Routes:            routes,
   202  			TXChecksumOffload: txChecksumOffload,
   203  			RXChecksumOffload: rxChecksumOffload,
   204  			NumChannels:       numNetworkChannels,
   205  			QDisc:             qDisc,
   206  		}
   207  
   208  		// Get the link for the interface.
   209  		ifaceLink, err := netlink.LinkByName(iface.Name)
   210  		if err != nil {
   211  			return fmt.Errorf("getting link for interface %q: %w", iface.Name, err)
   212  		}
   213  		link.LinkAddress = ifaceLink.Attrs().HardwareAddr
   214  
   215  		log.Debugf("Setting up network channels")
   216  		// Create the socket for the device.
   217  		for i := 0; i < link.NumChannels; i++ {
   218  			log.Debugf("Creating Channel %d", i)
   219  			socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
   220  			if err != nil {
   221  				return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err)
   222  			}
   223  			if i == 0 {
   224  				link.GSOMaxSize = socketEntry.gsoMaxSize
   225  			} else {
   226  				if link.GSOMaxSize != socketEntry.gsoMaxSize {
   227  					return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
   228  						link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
   229  				}
   230  			}
   231  			args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
   232  		}
   233  
   234  		if link.GSOMaxSize == 0 && softwareGSO {
   235  			// Hardware GSO is disabled. Let's enable software GSO.
   236  			link.GSOMaxSize = stack.SoftwareGSOMaxSize
   237  			link.SoftwareGSOEnabled = true
   238  		}
   239  
   240  		// Collect the addresses for the interface, enable forwarding,
   241  		// and remove them from the host.
   242  		for _, addr := range ipAddrs {
   243  			prefix, _ := addr.Mask.Size()
   244  			link.Addresses = append(link.Addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix})
   245  
   246  			// Steal IP address from NIC.
   247  			if err := removeAddress(ifaceLink, addr.String()); err != nil {
   248  				return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err)
   249  			}
   250  		}
   251  
   252  		args.FDBasedLinks = append(args.FDBasedLinks, link)
   253  	}
   254  
   255  	log.Debugf("Setting up network, config: %+v", args)
   256  	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
   257  		return fmt.Errorf("creating links and routes: %w", err)
   258  	}
   259  	return nil
   260  }
   261  
   262  type socketEntry struct {
   263  	deviceFile *os.File
   264  	gsoMaxSize uint32
   265  }
   266  
   267  // createSocket creates an underlying AF_PACKET socket and configures it for use by
   268  // the sentry and returns an *os.File that wraps the underlying socket fd.
   269  func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
   270  	// Create the socket.
   271  	const protocol = 0x0300 // htons(ETH_P_ALL)
   272  	fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, protocol)
   273  	if err != nil {
   274  		return nil, fmt.Errorf("unable to create raw socket: %v", err)
   275  	}
   276  	deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
   277  	// Bind to the appropriate device.
   278  	ll := unix.SockaddrLinklayer{
   279  		Protocol: protocol,
   280  		Ifindex:  iface.Index,
   281  	}
   282  	if err := unix.Bind(fd, &ll); err != nil {
   283  		return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
   284  	}
   285  
   286  	gsoMaxSize := uint32(0)
   287  	if enableGSO {
   288  		gso, err := isGSOEnabled(fd, iface.Name)
   289  		if err != nil {
   290  			return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
   291  		}
   292  		if gso {
   293  			if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
   294  				return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
   295  			}
   296  			gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
   297  		} else {
   298  			log.Infof("GSO not available in host.")
   299  		}
   300  	}
   301  
   302  	// Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer
   303  	// for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max".
   304  	// wmem_max/rmem_max default to a unusually low value of 208KB. This is too low
   305  	// for gVisor to be able to receive packets at high throughputs without
   306  	// incurring packet drops.
   307  	const bufSize = 4 << 20 // 4MB.
   308  
   309  	if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, bufSize); err != nil {
   310  		unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize)
   311  		sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF)
   312  
   313  		if sz < bufSize {
   314  			log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
   315  		}
   316  	}
   317  
   318  	if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, bufSize); err != nil {
   319  		unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize)
   320  		sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF)
   321  		if sz < bufSize {
   322  			log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err)
   323  		}
   324  	}
   325  
   326  	return &socketEntry{deviceFile, gsoMaxSize}, nil
   327  }
   328  
   329  // loopbackLink returns the link with addresses and routes for a loopback
   330  // interface.
   331  func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) {
   332  	link := boot.LoopbackLink{
   333  		Name: iface.Name,
   334  	}
   335  	for _, addr := range addrs {
   336  		ipNet, ok := addr.(*net.IPNet)
   337  		if !ok {
   338  			return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
   339  		}
   340  
   341  		prefix, _ := ipNet.Mask.Size()
   342  		link.Addresses = append(link.Addresses, boot.IPWithPrefix{
   343  			Address:   ipNet.IP,
   344  			PrefixLen: prefix,
   345  		})
   346  
   347  		dst := *ipNet
   348  		dst.IP = dst.IP.Mask(dst.Mask)
   349  		link.Routes = append(link.Routes, boot.Route{
   350  			Destination: dst,
   351  		})
   352  	}
   353  	return link, nil
   354  }
   355  
   356  // routesForIface iterates over all routes for the given interface and converts
   357  // them to boot.Routes. It also returns the a default v4/v6 route if found.
   358  func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
   359  	link, err := netlink.LinkByIndex(iface.Index)
   360  	if err != nil {
   361  		return nil, nil, nil, err
   362  	}
   363  	rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
   364  	if err != nil {
   365  		return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
   366  	}
   367  
   368  	var defv4, defv6 *boot.Route
   369  	var routes []boot.Route
   370  	for _, r := range rs {
   371  		// Is it a default route?
   372  		if r.Dst == nil {
   373  			if r.Gw == nil {
   374  				return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
   375  			}
   376  			// Create a catch all route to the gateway.
   377  			switch len(r.Gw) {
   378  			case header.IPv4AddressSize:
   379  				if defv4 != nil {
   380  					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
   381  				}
   382  				defv4 = &boot.Route{
   383  					Destination: net.IPNet{
   384  						IP:   net.IPv4zero,
   385  						Mask: net.IPMask(net.IPv4zero),
   386  					},
   387  					Gateway: r.Gw,
   388  				}
   389  			case header.IPv6AddressSize:
   390  				if defv6 != nil {
   391  					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
   392  				}
   393  
   394  				defv6 = &boot.Route{
   395  					Destination: net.IPNet{
   396  						IP:   net.IPv6zero,
   397  						Mask: net.IPMask(net.IPv6zero),
   398  					},
   399  					Gateway: r.Gw,
   400  				}
   401  			default:
   402  				return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
   403  			}
   404  			continue
   405  		}
   406  
   407  		dst := *r.Dst
   408  		dst.IP = dst.IP.Mask(dst.Mask)
   409  		routes = append(routes, boot.Route{
   410  			Destination: dst,
   411  			Gateway:     r.Gw,
   412  		})
   413  	}
   414  	return routes, defv4, defv6, nil
   415  }
   416  
   417  // removeAddress removes IP address from network device. It's equivalent to:
   418  //   ip addr del <ipAndMask> dev <name>
   419  func removeAddress(source netlink.Link, ipAndMask string) error {
   420  	addr, err := netlink.ParseAddr(ipAndMask)
   421  	if err != nil {
   422  		return err
   423  	}
   424  	return netlink.AddrDel(source, addr)
   425  }