gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/sandbox/network.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package sandbox
    16  
    17  import (
    18  	"fmt"
    19  	"net"
    20  	"os"
    21  	"os/exec"
    22  	"path/filepath"
    23  	"runtime"
    24  	"strconv"
    25  
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  	"github.com/vishvananda/netlink"
    28  	"golang.org/x/sys/unix"
    29  	"gvisor.dev/gvisor/pkg/log"
    30  	"gvisor.dev/gvisor/pkg/tcpip/header"
    31  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    32  	"gvisor.dev/gvisor/pkg/urpc"
    33  	"gvisor.dev/gvisor/runsc/boot"
    34  	"gvisor.dev/gvisor/runsc/config"
    35  	"gvisor.dev/gvisor/runsc/specutils"
    36  )
    37  
    38  // setupNetwork configures the network stack to mimic the local network
    39  // configuration. Docker uses network namespaces with vnets to configure the
    40  // network for the container. The untrusted app expects to see the same network
    41  // inside the sandbox. Routing and port mapping is handled directly by docker
    42  // with most of network information not even available to the runtime.
    43  //
    44  // Netstack inside the sandbox speaks directly to the device using a raw socket.
    45  // All IP addresses assigned to the NIC, are removed and passed on to netstack's
    46  // device.
    47  //
    48  // If 'conf.Network' is NoNetwork, skips local configuration and creates a
    49  // loopback interface only.
    50  //
    51  // Run the following container to test it:
    52  //
    53  //	docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
    54  func setupNetwork(conn *urpc.Client, pid int, conf *config.Config) error {
    55  	log.Infof("Setting up network")
    56  
    57  	switch conf.Network {
    58  	case config.NetworkNone:
    59  		log.Infof("Network is disabled, create loopback interface only")
    60  		if err := createDefaultLoopbackInterface(conf, conn); err != nil {
    61  			return fmt.Errorf("creating default loopback interface: %v", err)
    62  		}
    63  	case config.NetworkSandbox:
    64  		// Build the path to the net namespace of the sandbox process.
    65  		// This is what we will copy.
    66  		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
    67  		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf); err != nil {
    68  			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
    69  		}
    70  	case config.NetworkHost:
    71  		// Nothing to do here.
    72  	default:
    73  		return fmt.Errorf("invalid network type: %v", conf.Network)
    74  	}
    75  	return nil
    76  }
    77  
    78  func createDefaultLoopbackInterface(conf *config.Config, conn *urpc.Client) error {
    79  	link := boot.DefaultLoopbackLink
    80  	link.GVisorGRO = conf.GVisorGRO
    81  	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
    82  		LoopbackLinks: []boot.LoopbackLink{link},
    83  		DisconnectOk:  conf.NetDisconnectOk,
    84  	}, nil); err != nil {
    85  		return fmt.Errorf("creating loopback link and routes: %v", err)
    86  	}
    87  	return nil
    88  }
    89  
    90  func joinNetNS(nsPath string) (func(), error) {
    91  	runtime.LockOSThread()
    92  	restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{
    93  		Type: specs.NetworkNamespace,
    94  		Path: nsPath,
    95  	})
    96  	if err != nil {
    97  		runtime.UnlockOSThread()
    98  		return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err)
    99  	}
   100  	return func() {
   101  		restoreNS()
   102  		runtime.UnlockOSThread()
   103  	}, nil
   104  }
   105  
   106  // isRootNS determines whether we are running in the root net namespace.
   107  // /proc/sys/net/core/rmem_default only exists in root network namespace.
   108  func isRootNS() (bool, error) {
   109  	err := unix.Access("/proc/sys/net/core/rmem_default", unix.F_OK)
   110  	switch err {
   111  	case nil:
   112  		return true, nil
   113  	case unix.ENOENT:
   114  		return false, nil
   115  	default:
   116  		return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err)
   117  	}
   118  }
   119  
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
//
// When conf.XDP.Mode is XDPModeRedirect or XDPModeTunnel the work is delegated
// entirely to createRedirectInterfacesAndRoutes / createXDPTunnel and this
// function returns their result. For XDPModeOff and XDPModeNS it joins the
// namespace at nsPath and, for every interface that is up:
//
//   - loopback interfaces are turned into boot.LoopbackLink entries;
//   - other interfaces have their addresses, permanent neighbor entries,
//     routes and hardware address collected, their IP addresses deleted from
//     the host device, and either an XDP socket (XDPModeNS) or
//     conf.NumNetworkChannels AF_PACKET sockets created for them.
//
// The collected configuration and the socket files are then sent to the
// sandbox over conn with boot.NetworkCreateLinksAndRoutes.
//
// It is an error to run this in the root network namespace, or for more than
// one default route per IP family to exist across the interfaces.
func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *config.Config) error {
	switch conf.XDP.Mode {
	case config.XDPModeOff:
		// Handled by the generic path below.
	case config.XDPModeNS:
		// Handled by the generic path below, using XDP sockets.
	case config.XDPModeRedirect:
		if err := createRedirectInterfacesAndRoutes(conn, conf); err != nil {
			return fmt.Errorf("failed to create XDP redirect interface: %w", err)
		}
		return nil
	case config.XDPModeTunnel:
		if err := createXDPTunnel(conn, nsPath, conf); err != nil {
			return fmt.Errorf("failed to create XDP tunnel: %w", err)
		}
		return nil
	default:
		return fmt.Errorf("unknown XDP mode: %v", conf.XDP.Mode)
	}

	// Join the network namespace that we will be copying.
	restore, err := joinNetNS(nsPath)
	if err != nil {
		return err
	}
	defer restore()

	// Get all interfaces in the namespace.
	ifaces, err := net.Interfaces()
	if err != nil {
		return fmt.Errorf("querying interfaces: %w", err)
	}

	// Refuse to operate on the root namespace: the loop below deletes IP
	// addresses from the interfaces it finds.
	isRoot, err := isRootNS()
	if err != nil {
		return err
	}
	if isRoot {
		return fmt.Errorf("cannot run with network enabled in root network namespace")
	}

	// Collect addresses and routes from the interfaces.
	args := boot.CreateLinksAndRoutesArgs{
		DisconnectOk: conf.NetDisconnectOk,
	}
	for _, iface := range ifaces {
		if iface.Flags&net.FlagUp == 0 {
			log.Infof("Skipping down interface: %+v", iface)
			continue
		}

		allAddrs, err := iface.Addrs()
		if err != nil {
			return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
		}

		// We build our own loopback device.
		if iface.Flags&net.FlagLoopback != 0 {
			link, err := loopbackLink(conf, iface, allAddrs)
			if err != nil {
				return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err)
			}
			args.LoopbackLinks = append(args.LoopbackLinks, link)
			continue
		}

		// Every address on a non-loopback interface must be an *net.IPNet.
		var ipAddrs []*net.IPNet
		for _, ifaddr := range allAddrs {
			ipNet, ok := ifaddr.(*net.IPNet)
			if !ok {
				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
			}
			ipAddrs = append(ipAddrs, ipNet)
		}
		if len(ipAddrs) == 0 {
			log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
			continue
		}

		// Collect data from the ARP table.
		dump, err := netlink.NeighList(iface.Index, 0)
		if err != nil {
			return fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err)
		}

		var neighbors []boot.Neighbor
		for _, n := range dump {
			// There are only two "good" states NUD_PERMANENT and NUD_REACHABLE,
			// but NUD_REACHABLE is fully dynamic and will be re-probed anyway.
			if n.State == netlink.NUD_PERMANENT {
				log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr)
				// No flags are copied because Stack.AddStaticNeighbor does not support flags right now.
				neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr})
			}
		}

		// Scrape the routes before removing the address, since that
		// will remove the routes as well.
		routes, defv4, defv6, err := routesForIface(iface)
		if err != nil {
			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
		}
		// Only one default IPv4 gateway may exist across all interfaces.
		if defv4 != nil {
			if !args.Defaultv4Gateway.Route.Empty() {
				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
			}
			args.Defaultv4Gateway.Route = *defv4
			args.Defaultv4Gateway.Name = iface.Name
		}

		// Likewise for the default IPv6 gateway.
		if defv6 != nil {
			if !args.Defaultv6Gateway.Route.Empty() {
				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
			}
			args.Defaultv6Gateway.Route = *defv6
			args.Defaultv6Gateway.Name = iface.Name
		}

		// Get the link for the interface.
		ifaceLink, err := netlink.LinkByName(iface.Name)
		if err != nil {
			return fmt.Errorf("getting link for interface %q: %w", iface.Name, err)
		}
		linkAddress := ifaceLink.Attrs().HardwareAddr

		// Collect the addresses for the interface and remove them from
		// the host.
		var addresses []boot.IPWithPrefix
		for _, addr := range ipAddrs {
			prefix, _ := addr.Mask.Size()
			addresses = append(addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix})

			// Steal IP address from NIC.
			if err := removeAddress(ifaceLink, addr.String()); err != nil {
				// If we encounter an error while deleting the ip,
				// verify the ip is still present on the interface.
				if present, err := isAddressOnInterface(iface.Name, addr); err != nil {
					return fmt.Errorf("checking if address %v is on interface %q: %w", addr, iface.Name, err)
				} else if !present {
					// The address is already gone, so the failed
					// removal is harmless.
					continue
				}
				// Here err is the removeAddress error again; the err
				// declared by the inner if statement is out of scope.
				return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err)
			}
		}

		if conf.XDP.Mode == config.XDPModeNS {
			// XDP path: the socket files are appended to the payload and
			// the interface is described by a boot.XDPLink.
			xdpSockFDs, err := createSocketXDP(iface)
			if err != nil {
				return fmt.Errorf("failed to create XDP socket: %v", err)
			}
			args.FilePayload.Files = append(args.FilePayload.Files, xdpSockFDs...)
			args.XDPLinks = append(args.XDPLinks, boot.XDPLink{
				Name:              iface.Name,
				InterfaceIndex:    iface.Index,
				Routes:            routes,
				TXChecksumOffload: conf.TXChecksumOffload,
				RXChecksumOffload: conf.RXChecksumOffload,
				NumChannels:       conf.NumNetworkChannels,
				QDisc:             conf.QDisc,
				Neighbors:         neighbors,
				LinkAddress:       linkAddress,
				Addresses:         addresses,
				GVisorGRO:         conf.GVisorGRO,
			})
		} else {
			// AF_PACKET path: one socket per channel, described by a
			// boot.FDBasedLink.
			link := boot.FDBasedLink{
				Name:                 iface.Name,
				MTU:                  iface.MTU,
				Routes:               routes,
				TXChecksumOffload:    conf.TXChecksumOffload,
				RXChecksumOffload:    conf.RXChecksumOffload,
				NumChannels:          conf.NumNetworkChannels,
				ProcessorsPerChannel: conf.NetworkProcessorsPerChannel,
				QDisc:                conf.QDisc,
				Neighbors:            neighbors,
				LinkAddress:          linkAddress,
				Addresses:            addresses,
			}

			log.Debugf("Setting up network channels")
			// Create the socket for the device.
			for i := 0; i < link.NumChannels; i++ {
				log.Debugf("Creating Channel %d", i)
				socketEntry, err := createSocket(iface, ifaceLink, conf.HostGSO)
				if err != nil {
					return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err)
				}
				// All channels of one interface must agree on the GSO
				// maximum size; the first channel sets the reference.
				if i == 0 {
					link.GSOMaxSize = socketEntry.gsoMaxSize
				} else {
					if link.GSOMaxSize != socketEntry.gsoMaxSize {
						return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
							link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
					}
				}
				args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
			}

			if link.GSOMaxSize == 0 && conf.GVisorGSO {
				// Host GSO is disabled. Let's enable gVisor GSO.
				link.GSOMaxSize = stack.GVisorGSOMaxSize
				link.GVisorGSOEnabled = true
			}
			link.GVisorGRO = conf.GVisorGRO

			args.FDBasedLinks = append(args.FDBasedLinks, link)
		}
	}

	// Add packet logging, the PCAP file and the NAT blob, as configured.
	if err := pcapAndNAT(&args, conf); err != nil {
		return err
	}

	log.Debugf("Setting up network, config: %+v", args)
	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
		return fmt.Errorf("creating links and routes: %w", err)
	}
	return nil
}
   340  
   341  // isAddressOnInterface checks if an address is on an interface
   342  func isAddressOnInterface(ifaceName string, addr *net.IPNet) (bool, error) {
   343  	iface, err := net.InterfaceByName(ifaceName)
   344  	if err != nil {
   345  		return false, fmt.Errorf("getting interface by name %q: %w", ifaceName, err)
   346  	}
   347  	ifaceAddrs, err := iface.Addrs()
   348  	if err != nil {
   349  		return false, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
   350  	}
   351  	for _, ifaceAddr := range ifaceAddrs {
   352  		ipNet, ok := ifaceAddr.(*net.IPNet)
   353  		if !ok {
   354  			log.Warningf("Can't cast address to *net.IPNet, skipping: %+v", ifaceAddr)
   355  			continue
   356  		}
   357  		if ipNet.String() == addr.String() {
   358  			return true, nil
   359  		}
   360  	}
   361  	return false, nil
   362  }
   363  
// socketEntry is the result of createSocket: the device socket plus the GSO
// limit that applies to it.
type socketEntry struct {
	// deviceFile wraps the AF_PACKET socket fd bound to the interface.
	deviceFile *os.File
	// gsoMaxSize is the interface's GSO maximum size when host GSO was
	// enabled on the socket, and 0 otherwise.
	gsoMaxSize uint32
}
   368  
   369  // createSocket creates an underlying AF_PACKET socket and configures it for
   370  // use by the sentry and returns an *os.File that wraps the underlying socket
   371  // fd.
   372  func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
   373  	// Create the socket.
   374  	const protocol = 0x0300                                  // htons(ETH_P_ALL)
   375  	fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, 0) // pass protocol 0 to avoid slow bind()
   376  	if err != nil {
   377  		return nil, fmt.Errorf("unable to create raw socket: %v", err)
   378  	}
   379  	deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
   380  	// Bind to the appropriate device.
   381  	ll := unix.SockaddrLinklayer{
   382  		Protocol: protocol,
   383  		Ifindex:  iface.Index,
   384  	}
   385  	if err := unix.Bind(fd, &ll); err != nil {
   386  		return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
   387  	}
   388  
   389  	gsoMaxSize := uint32(0)
   390  	if enableGSO {
   391  		gso, err := isGSOEnabled(fd, iface.Name)
   392  		if err != nil {
   393  			return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
   394  		}
   395  		if gso {
   396  			if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
   397  				return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
   398  			}
   399  			gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
   400  		} else {
   401  			log.Infof("GSO not available in host.")
   402  		}
   403  	}
   404  
   405  	// Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer
   406  	// for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max".
   407  	// wmem_max/rmem_max default to a unusually low value of 208KB. This is too
   408  	// low for gVisor to be able to receive packets at high throughputs without
   409  	// incurring packet drops.
   410  	const bufSize = 4 << 20 // 4MB.
   411  
   412  	if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, bufSize); err != nil {
   413  		_ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize)
   414  		sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF)
   415  
   416  		if sz < bufSize {
   417  			log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
   418  		}
   419  	}
   420  
   421  	if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, bufSize); err != nil {
   422  		_ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize)
   423  		sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF)
   424  		if sz < bufSize {
   425  			log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
   426  		}
   427  	}
   428  
   429  	return &socketEntry{deviceFile, gsoMaxSize}, nil
   430  }
   431  
   432  // loopbackLink returns the link with addresses and routes for a loopback
   433  // interface.
   434  func loopbackLink(conf *config.Config, iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) {
   435  	link := boot.LoopbackLink{
   436  		Name:      iface.Name,
   437  		GVisorGRO: conf.GVisorGRO,
   438  	}
   439  	for _, addr := range addrs {
   440  		ipNet, ok := addr.(*net.IPNet)
   441  		if !ok {
   442  			return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
   443  		}
   444  
   445  		prefix, _ := ipNet.Mask.Size()
   446  		link.Addresses = append(link.Addresses, boot.IPWithPrefix{
   447  			Address:   ipNet.IP,
   448  			PrefixLen: prefix,
   449  		})
   450  
   451  		dst := *ipNet
   452  		dst.IP = dst.IP.Mask(dst.Mask)
   453  		link.Routes = append(link.Routes, boot.Route{
   454  			Destination: dst,
   455  		})
   456  	}
   457  	return link, nil
   458  }
   459  
   460  // routesForIface iterates over all routes for the given interface and converts
   461  // them to boot.Routes. It also returns the a default v4/v6 route if found.
   462  func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
   463  	link, err := netlink.LinkByIndex(iface.Index)
   464  	if err != nil {
   465  		return nil, nil, nil, err
   466  	}
   467  	rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
   468  	if err != nil {
   469  		return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
   470  	}
   471  
   472  	var defv4, defv6 *boot.Route
   473  	var routes []boot.Route
   474  	for _, r := range rs {
   475  		// Is it a default route?
   476  		if r.Dst == nil {
   477  			if r.Gw == nil {
   478  				return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
   479  			}
   480  			// Create a catch all route to the gateway.
   481  			switch len(r.Gw) {
   482  			case header.IPv4AddressSize:
   483  				if defv4 != nil {
   484  					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
   485  				}
   486  				defv4 = &boot.Route{
   487  					Destination: net.IPNet{
   488  						IP:   net.IPv4zero,
   489  						Mask: net.IPMask(net.IPv4zero),
   490  					},
   491  					Gateway: r.Gw,
   492  				}
   493  			case header.IPv6AddressSize:
   494  				if defv6 != nil {
   495  					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
   496  				}
   497  
   498  				defv6 = &boot.Route{
   499  					Destination: net.IPNet{
   500  						IP:   net.IPv6zero,
   501  						Mask: net.IPMask(net.IPv6zero),
   502  					},
   503  					Gateway: r.Gw,
   504  				}
   505  			default:
   506  				return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
   507  			}
   508  			continue
   509  		}
   510  
   511  		dst := *r.Dst
   512  		dst.IP = dst.IP.Mask(dst.Mask)
   513  		routes = append(routes, boot.Route{
   514  			Destination: dst,
   515  			Gateway:     r.Gw,
   516  		})
   517  	}
   518  	return routes, defv4, defv6, nil
   519  }
   520  
   521  // removeAddress removes IP address from network device. It's equivalent to:
   522  //
   523  //	ip addr del <ipAndMask> dev <name>
   524  func removeAddress(source netlink.Link, ipAndMask string) error {
   525  	addr, err := netlink.ParseAddr(ipAndMask)
   526  	if err != nil {
   527  		return err
   528  	}
   529  	return netlink.AddrDel(source, addr)
   530  }
   531  
   532  func pcapAndNAT(args *boot.CreateLinksAndRoutesArgs, conf *config.Config) error {
   533  	// Possibly enable packet logging.
   534  	args.LogPackets = conf.LogPackets
   535  
   536  	// Pass PCAP log file if present.
   537  	if conf.PCAP != "" {
   538  		args.PCAP = true
   539  		pcap, err := os.OpenFile(conf.PCAP, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0664)
   540  		if err != nil {
   541  			return fmt.Errorf("failed to open PCAP file %s: %v", conf.PCAP, err)
   542  		}
   543  		args.FilePayload.Files = append(args.FilePayload.Files, pcap)
   544  	}
   545  
   546  	// Pass the host's NAT table if requested.
   547  	if conf.ReproduceNftables || conf.ReproduceNAT {
   548  		var f *os.File
   549  		var err error
   550  		if conf.ReproduceNftables {
   551  			log.Infof("reproing nftables")
   552  			f, err = checkNftables()
   553  		} else if conf.ReproduceNAT {
   554  			log.Infof("reproing legacy tables")
   555  			f, err = writeNATBlob()
   556  		}
   557  		if err != nil {
   558  			return fmt.Errorf("failed to write NAT blob: %v", err)
   559  		}
   560  		args.NATBlob = true
   561  		args.FilePayload.Files = append(args.FilePayload.Files, f)
   562  	}
   563  
   564  	return nil
   565  }
   566  
   567  // The below is a work around to generate iptables-legacy rules on machines
   568  // that use iptables-nftables. The logic goes something like this:
   569  //
   570  //             start
   571  //               |
   572  //               v               no
   573  //     are legacy tables empty? -----> scrape rules -----> done <----+
   574  //               |                                          ^        |
   575  //               | yes                                      |        |
   576  //               v                        yes               |        |
   577  //     are nft tables empty? -------------------------------+        |
   578  //               |                                                   |
   579  //               | no                                                |
   580  //               v                                                   |
   581  //     pipe iptables-nft-save -t nat to iptables-legacy-restore      |
   582  //     scrape rules                                                  |
   583  //     delete iptables-legacy rules                                  |
   584  //               |                                                   |
   585  //               +---------------------------------------------------+
   586  //
   587  // If we fail at some point (e.g. to find a binary), we just try to scrape the
   588  // legacy rules.
   589  
// emptyNatRules is the text that the output of `iptables-legacy -t nat -S`
// and `iptables-nft -t nat -S` is compared against to decide that a NAT table
// is empty: only the ACCEPT policies of the four chains listed here.
const emptyNatRules = `-P PREROUTING ACCEPT
-P INPUT ACCEPT
-P OUTPUT ACCEPT
-P POSTROUTING ACCEPT
`
   595  
   596  func checkNftables() (*os.File, error) {
   597  	// Use iptables (not iptables-save) to test table emptiness because it
   598  	// gives predictable results: no counters and no comments.
   599  
   600  	// Is the legacy table empty?
   601  	if out, err := exec.Command("iptables-legacy", "-t", "nat", "-S").Output(); err != nil || string(out) != emptyNatRules {
   602  		return writeNATBlob()
   603  	}
   604  
   605  	// Is the nftables table empty?
   606  	if out, err := exec.Command("iptables-nft", "-t", "nat", "-S").Output(); err != nil || string(out) == emptyNatRules {
   607  		return nil, fmt.Errorf("no rules to scrape: %v", err)
   608  	}
   609  
   610  	// Get the current (empty) legacy rules.
   611  	currLegacy, err := exec.Command("iptables-legacy-save", "-t", "nat").Output()
   612  	if err != nil {
   613  		return nil, fmt.Errorf("failed to save existing rules with error (%v) and output: %s", err, currLegacy)
   614  	}
   615  
   616  	// Restore empty legacy rules.
   617  	defer func() {
   618  		cmd := exec.Command("iptables-legacy-restore")
   619  		stdin, err := cmd.StdinPipe()
   620  		if err != nil {
   621  			log.Warningf("failed to get stdin pipe: %v", err)
   622  			return
   623  		}
   624  
   625  		go func() {
   626  			defer stdin.Close()
   627  			stdin.Write(currLegacy)
   628  		}()
   629  
   630  		if out, err := cmd.CombinedOutput(); err != nil {
   631  			log.Warningf("failed to restore iptables error (%v) with output: %s", err, out)
   632  		}
   633  	}()
   634  
   635  	// Pipe the output of iptables-nft-save to iptables-legacy-restore.
   636  	nftOut, err := exec.Command("iptables-nft-save", "-t", "nat").Output()
   637  	if err != nil {
   638  		return nil, fmt.Errorf("failed to run iptables-nft-save: %v", err)
   639  	}
   640  
   641  	cmd := exec.Command("iptables-legacy-restore")
   642  	stdin, err := cmd.StdinPipe()
   643  	if err != nil {
   644  		return nil, fmt.Errorf("failed to get stdin pipe: %v", err)
   645  	}
   646  
   647  	go func() {
   648  		defer stdin.Close()
   649  		stdin.Write(nftOut)
   650  	}()
   651  
   652  	if out, err := cmd.CombinedOutput(); err != nil {
   653  		return nil, fmt.Errorf("failed to restore iptables error (%v) with output: %s", err, out)
   654  	}
   655  
   656  	return writeNATBlob()
   657  }