github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/sandbox/network.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package sandbox
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"net"
    21  	"os"
    22  	"path/filepath"
    23  	"runtime"
    24  	"strconv"
    25  
    26  	"github.com/MerlinKodo/gvisor/pkg/log"
    27  	"github.com/MerlinKodo/gvisor/pkg/tcpip/header"
    28  	"github.com/MerlinKodo/gvisor/pkg/tcpip/stack"
    29  	"github.com/MerlinKodo/gvisor/pkg/urpc"
    30  	"github.com/MerlinKodo/gvisor/runsc/boot"
    31  	"github.com/MerlinKodo/gvisor/runsc/config"
    32  	"github.com/MerlinKodo/gvisor/runsc/sandbox/bpf"
    33  	"github.com/MerlinKodo/gvisor/runsc/specutils"
    34  	"github.com/cilium/ebpf"
    35  	"github.com/cilium/ebpf/link"
    36  	specs "github.com/opencontainers/runtime-spec/specs-go"
    37  	"github.com/vishvananda/netlink"
    38  	"golang.org/x/sys/unix"
    39  )
    40  
    41  // setupNetwork configures the network stack to mimic the local network
    42  // configuration. Docker uses network namespaces with vnets to configure the
    43  // network for the container. The untrusted app expects to see the same network
    44  // inside the sandbox. Routing and port mapping is handled directly by docker
    45  // with most of network information not even available to the runtime.
    46  //
    47  // Netstack inside the sandbox speaks directly to the device using a raw socket.
    48  // All IP addresses assigned to the NIC, are removed and passed on to netstack's
    49  // device.
    50  //
    51  // If 'conf.Network' is NoNetwork, skips local configuration and creates a
    52  // loopback interface only.
    53  //
    54  // Run the following container to test it:
    55  //
    56  //	docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
    57  func setupNetwork(conn *urpc.Client, pid int, conf *config.Config) error {
    58  	log.Infof("Setting up network")
    59  
    60  	switch conf.Network {
    61  	case config.NetworkNone:
    62  		log.Infof("Network is disabled, create loopback interface only")
    63  		if err := createDefaultLoopbackInterface(conf, conn); err != nil {
    64  			return fmt.Errorf("creating default loopback interface: %v", err)
    65  		}
    66  	case config.NetworkSandbox:
    67  		// Build the path to the net namespace of the sandbox process.
    68  		// This is what we will copy.
    69  		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
    70  		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf); err != nil {
    71  			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
    72  		}
    73  	case config.NetworkHost:
    74  		// Nothing to do here.
    75  	default:
    76  		return fmt.Errorf("invalid network type: %v", conf.Network)
    77  	}
    78  	return nil
    79  }
    80  
    81  func createDefaultLoopbackInterface(conf *config.Config, conn *urpc.Client) error {
    82  	link := boot.DefaultLoopbackLink
    83  	link.GvisorGROTimeout = conf.GvisorGROTimeout
    84  	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
    85  		LoopbackLinks: []boot.LoopbackLink{link},
    86  	}, nil); err != nil {
    87  		return fmt.Errorf("creating loopback link and routes: %v", err)
    88  	}
    89  	return nil
    90  }
    91  
    92  func joinNetNS(nsPath string) (func(), error) {
    93  	runtime.LockOSThread()
    94  	restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{
    95  		Type: specs.NetworkNamespace,
    96  		Path: nsPath,
    97  	})
    98  	if err != nil {
    99  		runtime.UnlockOSThread()
   100  		return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err)
   101  	}
   102  	return func() {
   103  		restoreNS()
   104  		runtime.UnlockOSThread()
   105  	}, nil
   106  }
   107  
   108  // isRootNS determines whether we are running in the root net namespace.
   109  // /proc/sys/net/core/rmem_default only exists in root network namespace.
   110  func isRootNS() (bool, error) {
   111  	err := unix.Access("/proc/sys/net/core/rmem_default", unix.F_OK)
   112  	switch err {
   113  	case nil:
   114  		return true, nil
   115  	case unix.ENOENT:
   116  		return false, nil
   117  	default:
   118  		return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err)
   119  	}
   120  }
   121  
   122  // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
   123  // net namespace with the given path, creates them in the sandbox, and removes
   124  // them from the host.
   125  func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *config.Config) error {
   126  	// Join the network namespace that we will be copying.
   127  	restore, err := joinNetNS(nsPath)
   128  	if err != nil {
   129  		return err
   130  	}
   131  	defer restore()
   132  
   133  	// Get all interfaces in the namespace.
   134  	ifaces, err := net.Interfaces()
   135  	if err != nil {
   136  		return fmt.Errorf("querying interfaces: %w", err)
   137  	}
   138  
   139  	isRoot, err := isRootNS()
   140  	if err != nil {
   141  		return err
   142  	}
   143  	if isRoot {
   144  		return fmt.Errorf("cannot run with network enabled in root network namespace")
   145  	}
   146  
   147  	// Collect addresses and routes from the interfaces.
   148  	var args boot.CreateLinksAndRoutesArgs
   149  	for _, iface := range ifaces {
   150  		if iface.Flags&net.FlagUp == 0 {
   151  			log.Infof("Skipping down interface: %+v", iface)
   152  			continue
   153  		}
   154  
   155  		allAddrs, err := iface.Addrs()
   156  		if err != nil {
   157  			return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
   158  		}
   159  
   160  		// We build our own loopback device.
   161  		if iface.Flags&net.FlagLoopback != 0 {
   162  			link, err := loopbackLink(conf, iface, allAddrs)
   163  			if err != nil {
   164  				return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err)
   165  			}
   166  			args.LoopbackLinks = append(args.LoopbackLinks, link)
   167  			continue
   168  		}
   169  
   170  		var ipAddrs []*net.IPNet
   171  		for _, ifaddr := range allAddrs {
   172  			ipNet, ok := ifaddr.(*net.IPNet)
   173  			if !ok {
   174  				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
   175  			}
   176  			ipAddrs = append(ipAddrs, ipNet)
   177  		}
   178  		if len(ipAddrs) == 0 {
   179  			log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
   180  			continue
   181  		}
   182  
   183  		// Collect data from the ARP table.
   184  		dump, err := netlink.NeighList(iface.Index, 0)
   185  		if err != nil {
   186  			return fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err)
   187  		}
   188  
   189  		var neighbors []boot.Neighbor
   190  		for _, n := range dump {
   191  			// There are only two "good" states NUD_PERMANENT and NUD_REACHABLE,
   192  			// but NUD_REACHABLE is fully dynamic and will be re-probed anyway.
   193  			if n.State == netlink.NUD_PERMANENT {
   194  				log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr)
   195  				// No flags are copied because Stack.AddStaticNeighbor does not support flags right now.
   196  				neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr})
   197  			}
   198  		}
   199  
   200  		// Scrape the routes before removing the address, since that
   201  		// will remove the routes as well.
   202  		routes, defv4, defv6, err := routesForIface(iface)
   203  		if err != nil {
   204  			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
   205  		}
   206  		if defv4 != nil {
   207  			if !args.Defaultv4Gateway.Route.Empty() {
   208  				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
   209  			}
   210  			args.Defaultv4Gateway.Route = *defv4
   211  			args.Defaultv4Gateway.Name = iface.Name
   212  		}
   213  
   214  		if defv6 != nil {
   215  			if !args.Defaultv6Gateway.Route.Empty() {
   216  				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
   217  			}
   218  			args.Defaultv6Gateway.Route = *defv6
   219  			args.Defaultv6Gateway.Name = iface.Name
   220  		}
   221  
   222  		// Get the link for the interface.
   223  		ifaceLink, err := netlink.LinkByName(iface.Name)
   224  		if err != nil {
   225  			return fmt.Errorf("getting link for interface %q: %w", iface.Name, err)
   226  		}
   227  		linkAddress := ifaceLink.Attrs().HardwareAddr
   228  
   229  		// Collect the addresses for the interface, enable forwarding,
   230  		// and remove them from the host.
   231  		var addresses []boot.IPWithPrefix
   232  		for _, addr := range ipAddrs {
   233  			prefix, _ := addr.Mask.Size()
   234  			addresses = append(addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix})
   235  
   236  			// Steal IP address from NIC.
   237  			if err := removeAddress(ifaceLink, addr.String()); err != nil {
   238  				// If we encounter an error while deleting the ip,
   239  				// verify the ip is still present on the interface.
   240  				if present, err := isAddressOnInterface(iface.Name, addr); err != nil {
   241  					return fmt.Errorf("checking if address %v is on interface %q: %w", addr, iface.Name, err)
   242  				} else if !present {
   243  					continue
   244  				}
   245  				return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err)
   246  			}
   247  		}
   248  
   249  		if conf.AFXDP {
   250  			xdpSockFDs, err := createSocketXDP(iface)
   251  			if err != nil {
   252  				return fmt.Errorf("failed to create XDP socket: %v", err)
   253  			}
   254  			args.FilePayload.Files = append(args.FilePayload.Files, xdpSockFDs...)
   255  			args.XDPLinks = append(args.XDPLinks, boot.XDPLink{
   256  				Name:              iface.Name,
   257  				InterfaceIndex:    iface.Index,
   258  				Routes:            routes,
   259  				TXChecksumOffload: conf.TXChecksumOffload,
   260  				RXChecksumOffload: conf.RXChecksumOffload,
   261  				NumChannels:       conf.NumNetworkChannels,
   262  				QDisc:             conf.QDisc,
   263  				Neighbors:         neighbors,
   264  				LinkAddress:       linkAddress,
   265  				Addresses:         addresses,
   266  				GvisorGROTimeout:  conf.GvisorGROTimeout,
   267  			})
   268  		} else {
   269  			link := boot.FDBasedLink{
   270  				Name:              iface.Name,
   271  				MTU:               iface.MTU,
   272  				Routes:            routes,
   273  				TXChecksumOffload: conf.TXChecksumOffload,
   274  				RXChecksumOffload: conf.RXChecksumOffload,
   275  				NumChannels:       conf.NumNetworkChannels,
   276  				QDisc:             conf.QDisc,
   277  				Neighbors:         neighbors,
   278  				LinkAddress:       linkAddress,
   279  				Addresses:         addresses,
   280  			}
   281  
   282  			log.Debugf("Setting up network channels")
   283  			// Create the socket for the device.
   284  			for i := 0; i < link.NumChannels; i++ {
   285  				log.Debugf("Creating Channel %d", i)
   286  				socketEntry, err := createSocket(iface, ifaceLink, conf.HostGSO)
   287  				if err != nil {
   288  					return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err)
   289  				}
   290  				if i == 0 {
   291  					link.GSOMaxSize = socketEntry.gsoMaxSize
   292  				} else {
   293  					if link.GSOMaxSize != socketEntry.gsoMaxSize {
   294  						return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
   295  							link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
   296  					}
   297  				}
   298  				args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
   299  			}
   300  
   301  			if link.GSOMaxSize == 0 && conf.GvisorGSO {
   302  				// Host GSO is disabled. Let's enable gVisor GSO.
   303  				link.GSOMaxSize = stack.GvisorGSOMaxSize
   304  				link.GvisorGSOEnabled = true
   305  			}
   306  			link.GvisorGROTimeout = conf.GvisorGROTimeout
   307  
   308  			args.FDBasedLinks = append(args.FDBasedLinks, link)
   309  		}
   310  	}
   311  
   312  	// Pass PCAP log file if present.
   313  	if conf.PCAP != "" {
   314  		args.PCAP = true
   315  		pcap, err := os.OpenFile(conf.PCAP, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0664)
   316  		if err != nil {
   317  			return fmt.Errorf("failed to open PCAP file %s: %v", conf.PCAP, err)
   318  		}
   319  		args.FilePayload.Files = append(args.FilePayload.Files, pcap)
   320  	}
   321  
   322  	log.Debugf("Setting up network, config: %+v", args)
   323  	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
   324  		return fmt.Errorf("creating links and routes: %w", err)
   325  	}
   326  	return nil
   327  }
   328  
   329  // isAddressOnInterface checks if an address is on an interface
   330  func isAddressOnInterface(ifaceName string, addr *net.IPNet) (bool, error) {
   331  	iface, err := net.InterfaceByName(ifaceName)
   332  	if err != nil {
   333  		return false, fmt.Errorf("getting interface by name %q: %w", ifaceName, err)
   334  	}
   335  	ifaceAddrs, err := iface.Addrs()
   336  	if err != nil {
   337  		return false, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
   338  	}
   339  	for _, ifaceAddr := range ifaceAddrs {
   340  		ipNet, ok := ifaceAddr.(*net.IPNet)
   341  		if !ok {
   342  			log.Warningf("Can't cast address to *net.IPNet, skipping: %+v", ifaceAddr)
   343  			continue
   344  		}
   345  		if ipNet.String() == addr.String() {
   346  			return true, nil
   347  		}
   348  	}
   349  	return false, nil
   350  }
   351  
// socketEntry is the result of createSocket: the file wrapping the device
// socket together with the GSO maximum size determined for it.
//
// NOTE: createSocket builds this struct with an unkeyed literal, so the field
// order must not change.
type socketEntry struct {
	// deviceFile wraps the socket's file descriptor.
	deviceFile *os.File
	// gsoMaxSize is the link's GSOMaxSize when host GSO was enabled on the
	// socket, and 0 otherwise.
	gsoMaxSize uint32
}
   356  
   357  // createSocket creates an underlying AF_PACKET socket and configures it for
   358  // use by the sentry and returns an *os.File that wraps the underlying socket
   359  // fd.
   360  func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
   361  	// Create the socket.
   362  	const protocol = 0x0300                                  // htons(ETH_P_ALL)
   363  	fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, 0) // pass protocol 0 to avoid slow bind()
   364  	if err != nil {
   365  		return nil, fmt.Errorf("unable to create raw socket: %v", err)
   366  	}
   367  	deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
   368  	// Bind to the appropriate device.
   369  	ll := unix.SockaddrLinklayer{
   370  		Protocol: protocol,
   371  		Ifindex:  iface.Index,
   372  	}
   373  	if err := unix.Bind(fd, &ll); err != nil {
   374  		return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
   375  	}
   376  
   377  	gsoMaxSize := uint32(0)
   378  	if enableGSO {
   379  		gso, err := isGSOEnabled(fd, iface.Name)
   380  		if err != nil {
   381  			return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
   382  		}
   383  		if gso {
   384  			if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
   385  				return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
   386  			}
   387  			gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
   388  		} else {
   389  			log.Infof("GSO not available in host.")
   390  		}
   391  	}
   392  
   393  	// Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer
   394  	// for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max".
   395  	// wmem_max/rmem_max default to a unusually low value of 208KB. This is too
   396  	// low for gVisor to be able to receive packets at high throughputs without
   397  	// incurring packet drops.
   398  	const bufSize = 4 << 20 // 4MB.
   399  
   400  	if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, bufSize); err != nil {
   401  		_ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize)
   402  		sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF)
   403  
   404  		if sz < bufSize {
   405  			log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
   406  		}
   407  	}
   408  
   409  	if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, bufSize); err != nil {
   410  		_ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize)
   411  		sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF)
   412  		if sz < bufSize {
   413  			log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
   414  		}
   415  	}
   416  
   417  	return &socketEntry{deviceFile, gsoMaxSize}, nil
   418  }
   419  
   420  func createSocketXDP(iface net.Interface) ([]*os.File, error) {
   421  	// Create an XDP socket. The sentry will mmap memory for the various
   422  	// rings and bind to the device.
   423  	fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0)
   424  	if err != nil {
   425  		return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err)
   426  	}
   427  
   428  	// We also need to, before dropping privileges, attach a program to the
   429  	// device and insert our socket into its map.
   430  
   431  	// Load into the kernel.
   432  	spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram))
   433  	if err != nil {
   434  		return nil, fmt.Errorf("failed to load spec: %v", err)
   435  	}
   436  
   437  	var objects struct {
   438  		Program *ebpf.Program `ebpf:"xdp_prog"`
   439  		SockMap *ebpf.Map     `ebpf:"sock_map"`
   440  	}
   441  	if err := spec.LoadAndAssign(&objects, nil); err != nil {
   442  		return nil, fmt.Errorf("failed to load program: %v", err)
   443  	}
   444  
   445  	rawLink, err := link.AttachRawLink(link.RawLinkOptions{
   446  		Program: objects.Program,
   447  		Attach:  ebpf.AttachXDP,
   448  		Target:  iface.Index,
   449  		// By not setting the Flag field, the kernel will choose the
   450  		// fastest mode. In order those are:
   451  		// - Offloaded onto the NIC.
   452  		// - Running directly in the driver.
   453  		// - Generic mode, which works with any NIC/driver but lacks
   454  		//   much of the XDP performance boost.
   455  	})
   456  	if err != nil {
   457  		return nil, fmt.Errorf("failed to attach BPF program: %v", err)
   458  	}
   459  
   460  	// Insert our AF_XDP socket into the BPF map that dictates where
   461  	// packets are redirected to.
   462  	key := uint32(0)
   463  	val := uint32(fd)
   464  	if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil {
   465  		return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err)
   466  	}
   467  
   468  	// We need to keep the Program, SockMap, and link FDs open until they
   469  	// can be passed to the sandbox process.
   470  	progFD, err := unix.Dup(objects.Program.FD())
   471  	if err != nil {
   472  		return nil, fmt.Errorf("failed to dup BPF program: %v", err)
   473  	}
   474  	sockMapFD, err := unix.Dup(objects.SockMap.FD())
   475  	if err != nil {
   476  		return nil, fmt.Errorf("failed to dup BPF map: %v", err)
   477  	}
   478  	linkFD, err := unix.Dup(rawLink.FD())
   479  	if err != nil {
   480  		return nil, fmt.Errorf("failed to dup BPF link: %v", err)
   481  	}
   482  
   483  	return []*os.File{
   484  		os.NewFile(uintptr(fd), "xdp-fd"),            // The socket.
   485  		os.NewFile(uintptr(progFD), "program-fd"),    // The XDP program.
   486  		os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map.
   487  		os.NewFile(uintptr(linkFD), "link-fd"),       // The XDP link.
   488  	}, nil
   489  }
   490  
   491  // loopbackLink returns the link with addresses and routes for a loopback
   492  // interface.
   493  func loopbackLink(conf *config.Config, iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) {
   494  	link := boot.LoopbackLink{
   495  		Name:             iface.Name,
   496  		GvisorGROTimeout: conf.GvisorGROTimeout,
   497  	}
   498  	for _, addr := range addrs {
   499  		ipNet, ok := addr.(*net.IPNet)
   500  		if !ok {
   501  			return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
   502  		}
   503  
   504  		prefix, _ := ipNet.Mask.Size()
   505  		link.Addresses = append(link.Addresses, boot.IPWithPrefix{
   506  			Address:   ipNet.IP,
   507  			PrefixLen: prefix,
   508  		})
   509  
   510  		dst := *ipNet
   511  		dst.IP = dst.IP.Mask(dst.Mask)
   512  		link.Routes = append(link.Routes, boot.Route{
   513  			Destination: dst,
   514  		})
   515  	}
   516  	return link, nil
   517  }
   518  
   519  // routesForIface iterates over all routes for the given interface and converts
   520  // them to boot.Routes. It also returns the a default v4/v6 route if found.
   521  func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
   522  	link, err := netlink.LinkByIndex(iface.Index)
   523  	if err != nil {
   524  		return nil, nil, nil, err
   525  	}
   526  	rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
   527  	if err != nil {
   528  		return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
   529  	}
   530  
   531  	var defv4, defv6 *boot.Route
   532  	var routes []boot.Route
   533  	for _, r := range rs {
   534  		// Is it a default route?
   535  		if r.Dst == nil {
   536  			if r.Gw == nil {
   537  				return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
   538  			}
   539  			// Create a catch all route to the gateway.
   540  			switch len(r.Gw) {
   541  			case header.IPv4AddressSize:
   542  				if defv4 != nil {
   543  					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
   544  				}
   545  				defv4 = &boot.Route{
   546  					Destination: net.IPNet{
   547  						IP:   net.IPv4zero,
   548  						Mask: net.IPMask(net.IPv4zero),
   549  					},
   550  					Gateway: r.Gw,
   551  				}
   552  			case header.IPv6AddressSize:
   553  				if defv6 != nil {
   554  					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
   555  				}
   556  
   557  				defv6 = &boot.Route{
   558  					Destination: net.IPNet{
   559  						IP:   net.IPv6zero,
   560  						Mask: net.IPMask(net.IPv6zero),
   561  					},
   562  					Gateway: r.Gw,
   563  				}
   564  			default:
   565  				return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
   566  			}
   567  			continue
   568  		}
   569  
   570  		dst := *r.Dst
   571  		dst.IP = dst.IP.Mask(dst.Mask)
   572  		routes = append(routes, boot.Route{
   573  			Destination: dst,
   574  			Gateway:     r.Gw,
   575  		})
   576  	}
   577  	return routes, defv4, defv6, nil
   578  }
   579  
   580  // removeAddress removes IP address from network device. It's equivalent to:
   581  //
   582  //	ip addr del <ipAndMask> dev <name>
   583  func removeAddress(source netlink.Link, ipAndMask string) error {
   584  	addr, err := netlink.ParseAddr(ipAndMask)
   585  	if err != nil {
   586  		return err
   587  	}
   588  	return netlink.AddrDel(source, addr)
   589  }