github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/sandbox/network.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package sandbox
    16  
    17  import (
    18  	"fmt"
    19  	"net"
    20  	"os"
    21  	"os/exec"
    22  	"path/filepath"
    23  	"runtime"
    24  	"strconv"
    25  
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  	"github.com/vishvananda/netlink"
    28  	"golang.org/x/sys/unix"
    29  	"github.com/metacubex/gvisor/pkg/log"
    30  	"github.com/metacubex/gvisor/pkg/tcpip/header"
    31  	"github.com/metacubex/gvisor/pkg/tcpip/stack"
    32  	"github.com/metacubex/gvisor/pkg/urpc"
    33  	"github.com/metacubex/gvisor/runsc/boot"
    34  	"github.com/metacubex/gvisor/runsc/config"
    35  	"github.com/metacubex/gvisor/runsc/specutils"
    36  )
    37  
    38  // setupNetwork configures the network stack to mimic the local network
    39  // configuration. Docker uses network namespaces with vnets to configure the
    40  // network for the container. The untrusted app expects to see the same network
    41  // inside the sandbox. Routing and port mapping is handled directly by docker
    42  // with most of network information not even available to the runtime.
    43  //
    44  // Netstack inside the sandbox speaks directly to the device using a raw socket.
    45  // All IP addresses assigned to the NIC, are removed and passed on to netstack's
    46  // device.
    47  //
    48  // If 'conf.Network' is NoNetwork, skips local configuration and creates a
    49  // loopback interface only.
    50  //
    51  // Run the following container to test it:
    52  //
    53  //	docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
    54  func setupNetwork(conn *urpc.Client, pid int, conf *config.Config) error {
    55  	log.Infof("Setting up network")
    56  
    57  	switch conf.Network {
    58  	case config.NetworkNone:
    59  		log.Infof("Network is disabled, create loopback interface only")
    60  		if err := createDefaultLoopbackInterface(conf, conn); err != nil {
    61  			return fmt.Errorf("creating default loopback interface: %v", err)
    62  		}
    63  	case config.NetworkSandbox:
    64  		// Build the path to the net namespace of the sandbox process.
    65  		// This is what we will copy.
    66  		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
    67  		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf); err != nil {
    68  			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
    69  		}
    70  	case config.NetworkHost:
    71  		// Nothing to do here.
    72  	default:
    73  		return fmt.Errorf("invalid network type: %v", conf.Network)
    74  	}
    75  	return nil
    76  }
    77  
    78  func createDefaultLoopbackInterface(conf *config.Config, conn *urpc.Client) error {
    79  	link := boot.DefaultLoopbackLink
    80  	link.GvisorGROTimeout = conf.GvisorGROTimeout
    81  	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
    82  		LoopbackLinks: []boot.LoopbackLink{link},
    83  	}, nil); err != nil {
    84  		return fmt.Errorf("creating loopback link and routes: %v", err)
    85  	}
    86  	return nil
    87  }
    88  
    89  func joinNetNS(nsPath string) (func(), error) {
    90  	runtime.LockOSThread()
    91  	restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{
    92  		Type: specs.NetworkNamespace,
    93  		Path: nsPath,
    94  	})
    95  	if err != nil {
    96  		runtime.UnlockOSThread()
    97  		return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err)
    98  	}
    99  	return func() {
   100  		restoreNS()
   101  		runtime.UnlockOSThread()
   102  	}, nil
   103  }
   104  
   105  // isRootNS determines whether we are running in the root net namespace.
   106  // /proc/sys/net/core/rmem_default only exists in root network namespace.
   107  func isRootNS() (bool, error) {
   108  	err := unix.Access("/proc/sys/net/core/rmem_default", unix.F_OK)
   109  	switch err {
   110  	case nil:
   111  		return true, nil
   112  	case unix.ENOENT:
   113  		return false, nil
   114  	default:
   115  		return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err)
   116  	}
   117  }
   118  
   119  // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
   120  // net namespace with the given path, creates them in the sandbox, and removes
   121  // them from the host.
   122  func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *config.Config) error {
   123  	switch conf.XDP.Mode {
   124  	case config.XDPModeOff:
   125  	case config.XDPModeNS:
   126  	case config.XDPModeRedirect:
   127  		if err := createRedirectInterfacesAndRoutes(conn, conf); err != nil {
   128  			return fmt.Errorf("failed to create XDP redirect interface: %w", err)
   129  		}
   130  		return nil
   131  	case config.XDPModeTunnel:
   132  		if err := createXDPTunnel(conn, nsPath, conf); err != nil {
   133  			return fmt.Errorf("failed to create XDP tunnel: %w", err)
   134  		}
   135  		return nil
   136  	default:
   137  		return fmt.Errorf("unknown XDP mode: %v", conf.XDP.Mode)
   138  	}
   139  
   140  	// Join the network namespace that we will be copying.
   141  	restore, err := joinNetNS(nsPath)
   142  	if err != nil {
   143  		return err
   144  	}
   145  	defer restore()
   146  
   147  	// Get all interfaces in the namespace.
   148  	ifaces, err := net.Interfaces()
   149  	if err != nil {
   150  		return fmt.Errorf("querying interfaces: %w", err)
   151  	}
   152  
   153  	isRoot, err := isRootNS()
   154  	if err != nil {
   155  		return err
   156  	}
   157  	if isRoot {
   158  		return fmt.Errorf("cannot run with network enabled in root network namespace")
   159  	}
   160  
   161  	// Collect addresses and routes from the interfaces.
   162  	var args boot.CreateLinksAndRoutesArgs
   163  	for _, iface := range ifaces {
   164  		if iface.Flags&net.FlagUp == 0 {
   165  			log.Infof("Skipping down interface: %+v", iface)
   166  			continue
   167  		}
   168  
   169  		allAddrs, err := iface.Addrs()
   170  		if err != nil {
   171  			return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
   172  		}
   173  
   174  		// We build our own loopback device.
   175  		if iface.Flags&net.FlagLoopback != 0 {
   176  			link, err := loopbackLink(conf, iface, allAddrs)
   177  			if err != nil {
   178  				return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err)
   179  			}
   180  			args.LoopbackLinks = append(args.LoopbackLinks, link)
   181  			continue
   182  		}
   183  
   184  		var ipAddrs []*net.IPNet
   185  		for _, ifaddr := range allAddrs {
   186  			ipNet, ok := ifaddr.(*net.IPNet)
   187  			if !ok {
   188  				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
   189  			}
   190  			ipAddrs = append(ipAddrs, ipNet)
   191  		}
   192  		if len(ipAddrs) == 0 {
   193  			log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
   194  			continue
   195  		}
   196  
   197  		// Collect data from the ARP table.
   198  		dump, err := netlink.NeighList(iface.Index, 0)
   199  		if err != nil {
   200  			return fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err)
   201  		}
   202  
   203  		var neighbors []boot.Neighbor
   204  		for _, n := range dump {
   205  			// There are only two "good" states NUD_PERMANENT and NUD_REACHABLE,
   206  			// but NUD_REACHABLE is fully dynamic and will be re-probed anyway.
   207  			if n.State == netlink.NUD_PERMANENT {
   208  				log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr)
   209  				// No flags are copied because Stack.AddStaticNeighbor does not support flags right now.
   210  				neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr})
   211  			}
   212  		}
   213  
   214  		// Scrape the routes before removing the address, since that
   215  		// will remove the routes as well.
   216  		routes, defv4, defv6, err := routesForIface(iface)
   217  		if err != nil {
   218  			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
   219  		}
   220  		if defv4 != nil {
   221  			if !args.Defaultv4Gateway.Route.Empty() {
   222  				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
   223  			}
   224  			args.Defaultv4Gateway.Route = *defv4
   225  			args.Defaultv4Gateway.Name = iface.Name
   226  		}
   227  
   228  		if defv6 != nil {
   229  			if !args.Defaultv6Gateway.Route.Empty() {
   230  				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
   231  			}
   232  			args.Defaultv6Gateway.Route = *defv6
   233  			args.Defaultv6Gateway.Name = iface.Name
   234  		}
   235  
   236  		// Get the link for the interface.
   237  		ifaceLink, err := netlink.LinkByName(iface.Name)
   238  		if err != nil {
   239  			return fmt.Errorf("getting link for interface %q: %w", iface.Name, err)
   240  		}
   241  		linkAddress := ifaceLink.Attrs().HardwareAddr
   242  
   243  		// Collect the addresses for the interface, enable forwarding,
   244  		// and remove them from the host.
   245  		var addresses []boot.IPWithPrefix
   246  		for _, addr := range ipAddrs {
   247  			prefix, _ := addr.Mask.Size()
   248  			addresses = append(addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix})
   249  
   250  			// Steal IP address from NIC.
   251  			if err := removeAddress(ifaceLink, addr.String()); err != nil {
   252  				// If we encounter an error while deleting the ip,
   253  				// verify the ip is still present on the interface.
   254  				if present, err := isAddressOnInterface(iface.Name, addr); err != nil {
   255  					return fmt.Errorf("checking if address %v is on interface %q: %w", addr, iface.Name, err)
   256  				} else if !present {
   257  					continue
   258  				}
   259  				return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err)
   260  			}
   261  		}
   262  
   263  		if conf.XDP.Mode == config.XDPModeNS {
   264  			xdpSockFDs, err := createSocketXDP(iface)
   265  			if err != nil {
   266  				return fmt.Errorf("failed to create XDP socket: %v", err)
   267  			}
   268  			args.FilePayload.Files = append(args.FilePayload.Files, xdpSockFDs...)
   269  			args.XDPLinks = append(args.XDPLinks, boot.XDPLink{
   270  				Name:              iface.Name,
   271  				InterfaceIndex:    iface.Index,
   272  				Routes:            routes,
   273  				TXChecksumOffload: conf.TXChecksumOffload,
   274  				RXChecksumOffload: conf.RXChecksumOffload,
   275  				NumChannels:       conf.NumNetworkChannels,
   276  				QDisc:             conf.QDisc,
   277  				Neighbors:         neighbors,
   278  				LinkAddress:       linkAddress,
   279  				Addresses:         addresses,
   280  				GvisorGROTimeout:  conf.GvisorGROTimeout,
   281  			})
   282  		} else {
   283  			link := boot.FDBasedLink{
   284  				Name:              iface.Name,
   285  				MTU:               iface.MTU,
   286  				Routes:            routes,
   287  				TXChecksumOffload: conf.TXChecksumOffload,
   288  				RXChecksumOffload: conf.RXChecksumOffload,
   289  				NumChannels:       conf.NumNetworkChannels,
   290  				QDisc:             conf.QDisc,
   291  				Neighbors:         neighbors,
   292  				LinkAddress:       linkAddress,
   293  				Addresses:         addresses,
   294  			}
   295  
   296  			log.Debugf("Setting up network channels")
   297  			// Create the socket for the device.
   298  			for i := 0; i < link.NumChannels; i++ {
   299  				log.Debugf("Creating Channel %d", i)
   300  				socketEntry, err := createSocket(iface, ifaceLink, conf.HostGSO)
   301  				if err != nil {
   302  					return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err)
   303  				}
   304  				if i == 0 {
   305  					link.GSOMaxSize = socketEntry.gsoMaxSize
   306  				} else {
   307  					if link.GSOMaxSize != socketEntry.gsoMaxSize {
   308  						return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
   309  							link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
   310  					}
   311  				}
   312  				args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
   313  			}
   314  
   315  			if link.GSOMaxSize == 0 && conf.GvisorGSO {
   316  				// Host GSO is disabled. Let's enable gVisor GSO.
   317  				link.GSOMaxSize = stack.GvisorGSOMaxSize
   318  				link.GvisorGSOEnabled = true
   319  			}
   320  			link.GvisorGROTimeout = conf.GvisorGROTimeout
   321  
   322  			args.FDBasedLinks = append(args.FDBasedLinks, link)
   323  		}
   324  	}
   325  
   326  	if err := pcapAndNAT(&args, conf); err != nil {
   327  		return err
   328  	}
   329  
   330  	log.Debugf("Setting up network, config: %+v", args)
   331  	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
   332  		return fmt.Errorf("creating links and routes: %w", err)
   333  	}
   334  	return nil
   335  }
   336  
   337  // isAddressOnInterface checks if an address is on an interface
   338  func isAddressOnInterface(ifaceName string, addr *net.IPNet) (bool, error) {
   339  	iface, err := net.InterfaceByName(ifaceName)
   340  	if err != nil {
   341  		return false, fmt.Errorf("getting interface by name %q: %w", ifaceName, err)
   342  	}
   343  	ifaceAddrs, err := iface.Addrs()
   344  	if err != nil {
   345  		return false, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
   346  	}
   347  	for _, ifaceAddr := range ifaceAddrs {
   348  		ipNet, ok := ifaceAddr.(*net.IPNet)
   349  		if !ok {
   350  			log.Warningf("Can't cast address to *net.IPNet, skipping: %+v", ifaceAddr)
   351  			continue
   352  		}
   353  		if ipNet.String() == addr.String() {
   354  			return true, nil
   355  		}
   356  	}
   357  	return false, nil
   358  }
   359  
   360  type socketEntry struct {
   361  	deviceFile *os.File
   362  	gsoMaxSize uint32
   363  }
   364  
   365  // createSocket creates an underlying AF_PACKET socket and configures it for
   366  // use by the sentry and returns an *os.File that wraps the underlying socket
   367  // fd.
   368  func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
   369  	// Create the socket.
   370  	const protocol = 0x0300                                  // htons(ETH_P_ALL)
   371  	fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, 0) // pass protocol 0 to avoid slow bind()
   372  	if err != nil {
   373  		return nil, fmt.Errorf("unable to create raw socket: %v", err)
   374  	}
   375  	deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
   376  	// Bind to the appropriate device.
   377  	ll := unix.SockaddrLinklayer{
   378  		Protocol: protocol,
   379  		Ifindex:  iface.Index,
   380  	}
   381  	if err := unix.Bind(fd, &ll); err != nil {
   382  		return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
   383  	}
   384  
   385  	gsoMaxSize := uint32(0)
   386  	if enableGSO {
   387  		gso, err := isGSOEnabled(fd, iface.Name)
   388  		if err != nil {
   389  			return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
   390  		}
   391  		if gso {
   392  			if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
   393  				return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
   394  			}
   395  			gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
   396  		} else {
   397  			log.Infof("GSO not available in host.")
   398  		}
   399  	}
   400  
   401  	// Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer
   402  	// for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max".
   403  	// wmem_max/rmem_max default to a unusually low value of 208KB. This is too
   404  	// low for gVisor to be able to receive packets at high throughputs without
   405  	// incurring packet drops.
   406  	const bufSize = 4 << 20 // 4MB.
   407  
   408  	if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, bufSize); err != nil {
   409  		_ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize)
   410  		sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF)
   411  
   412  		if sz < bufSize {
   413  			log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
   414  		}
   415  	}
   416  
   417  	if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, bufSize); err != nil {
   418  		_ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize)
   419  		sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF)
   420  		if sz < bufSize {
   421  			log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
   422  		}
   423  	}
   424  
   425  	return &socketEntry{deviceFile, gsoMaxSize}, nil
   426  }
   427  
   428  // loopbackLink returns the link with addresses and routes for a loopback
   429  // interface.
   430  func loopbackLink(conf *config.Config, iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) {
   431  	link := boot.LoopbackLink{
   432  		Name:             iface.Name,
   433  		GvisorGROTimeout: conf.GvisorGROTimeout,
   434  	}
   435  	for _, addr := range addrs {
   436  		ipNet, ok := addr.(*net.IPNet)
   437  		if !ok {
   438  			return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
   439  		}
   440  
   441  		prefix, _ := ipNet.Mask.Size()
   442  		link.Addresses = append(link.Addresses, boot.IPWithPrefix{
   443  			Address:   ipNet.IP,
   444  			PrefixLen: prefix,
   445  		})
   446  
   447  		dst := *ipNet
   448  		dst.IP = dst.IP.Mask(dst.Mask)
   449  		link.Routes = append(link.Routes, boot.Route{
   450  			Destination: dst,
   451  		})
   452  	}
   453  	return link, nil
   454  }
   455  
   456  // routesForIface iterates over all routes for the given interface and converts
   457  // them to boot.Routes. It also returns the a default v4/v6 route if found.
   458  func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
   459  	link, err := netlink.LinkByIndex(iface.Index)
   460  	if err != nil {
   461  		return nil, nil, nil, err
   462  	}
   463  	rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
   464  	if err != nil {
   465  		return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
   466  	}
   467  
   468  	var defv4, defv6 *boot.Route
   469  	var routes []boot.Route
   470  	for _, r := range rs {
   471  		// Is it a default route?
   472  		if r.Dst == nil {
   473  			if r.Gw == nil {
   474  				return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
   475  			}
   476  			// Create a catch all route to the gateway.
   477  			switch len(r.Gw) {
   478  			case header.IPv4AddressSize:
   479  				if defv4 != nil {
   480  					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
   481  				}
   482  				defv4 = &boot.Route{
   483  					Destination: net.IPNet{
   484  						IP:   net.IPv4zero,
   485  						Mask: net.IPMask(net.IPv4zero),
   486  					},
   487  					Gateway: r.Gw,
   488  				}
   489  			case header.IPv6AddressSize:
   490  				if defv6 != nil {
   491  					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
   492  				}
   493  
   494  				defv6 = &boot.Route{
   495  					Destination: net.IPNet{
   496  						IP:   net.IPv6zero,
   497  						Mask: net.IPMask(net.IPv6zero),
   498  					},
   499  					Gateway: r.Gw,
   500  				}
   501  			default:
   502  				return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
   503  			}
   504  			continue
   505  		}
   506  
   507  		dst := *r.Dst
   508  		dst.IP = dst.IP.Mask(dst.Mask)
   509  		routes = append(routes, boot.Route{
   510  			Destination: dst,
   511  			Gateway:     r.Gw,
   512  		})
   513  	}
   514  	return routes, defv4, defv6, nil
   515  }
   516  
   517  // removeAddress removes IP address from network device. It's equivalent to:
   518  //
   519  //	ip addr del <ipAndMask> dev <name>
   520  func removeAddress(source netlink.Link, ipAndMask string) error {
   521  	addr, err := netlink.ParseAddr(ipAndMask)
   522  	if err != nil {
   523  		return err
   524  	}
   525  	return netlink.AddrDel(source, addr)
   526  }
   527  
   528  func pcapAndNAT(args *boot.CreateLinksAndRoutesArgs, conf *config.Config) error {
   529  	// Possibly enable packet logging.
   530  	args.LogPackets = conf.LogPackets
   531  
   532  	// Pass PCAP log file if present.
   533  	if conf.PCAP != "" {
   534  		args.PCAP = true
   535  		pcap, err := os.OpenFile(conf.PCAP, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0664)
   536  		if err != nil {
   537  			return fmt.Errorf("failed to open PCAP file %s: %v", conf.PCAP, err)
   538  		}
   539  		args.FilePayload.Files = append(args.FilePayload.Files, pcap)
   540  	}
   541  
   542  	// Pass the host's NAT table if requested.
   543  	if conf.ReproduceNftables || conf.ReproduceNAT {
   544  		var f *os.File
   545  		var err error
   546  		if conf.ReproduceNftables {
   547  			log.Infof("reproing nftables")
   548  			f, err = checkNftables()
   549  		} else if conf.ReproduceNAT {
   550  			log.Infof("reproing legacy tables")
   551  			f, err = writeNATBlob()
   552  		}
   553  		if err != nil {
   554  			return fmt.Errorf("failed to write NAT blob: %v", err)
   555  		}
   556  		args.NATBlob = true
   557  		args.FilePayload.Files = append(args.FilePayload.Files, f)
   558  	}
   559  
   560  	return nil
   561  }
   562  
   563  // The below is a work around to generate iptables-legacy rules on machines
   564  // that use iptables-nftables. The logic goes something like this:
   565  //
   566  //             start
   567  //               |
   568  //               v               no
   569  //     are legacy tables empty? -----> scrape rules -----> done <----+
   570  //               |                                          ^        |
   571  //               | yes                                      |        |
   572  //               v                        yes               |        |
   573  //     are nft tables empty? -------------------------------+        |
   574  //               |                                                   |
   575  //               | no                                                |
   576  //               v                                                   |
   577  //     pipe iptables-nft-save -t nat to iptables-legacy-restore      |
   578  //     scrape rules                                                  |
   579  //     delete iptables-legacy rules                                  |
   580  //               |                                                   |
   581  //               +---------------------------------------------------+
   582  //
   583  // If we fail at some point (e.g. to find a binary), we just try to scrape the
   584  // legacy rules.
   585  
   586  const emptyNatRules = `-P PREROUTING ACCEPT
   587  -P INPUT ACCEPT
   588  -P OUTPUT ACCEPT
   589  -P POSTROUTING ACCEPT
   590  `
   591  
   592  func checkNftables() (*os.File, error) {
   593  	// Use iptables (not iptables-save) to test table emptiness because it
   594  	// gives predictable results: no counters and no comments.
   595  
   596  	// Is the legacy table empty?
   597  	if out, err := exec.Command("iptables-legacy", "-t", "nat", "-S").Output(); err != nil || string(out) != emptyNatRules {
   598  		return writeNATBlob()
   599  	}
   600  
   601  	// Is the nftables table empty?
   602  	if out, err := exec.Command("iptables-nft", "-t", "nat", "-S").Output(); err != nil || string(out) == emptyNatRules {
   603  		return nil, fmt.Errorf("no rules to scrape: %v", err)
   604  	}
   605  
   606  	// Get the current (empty) legacy rules.
   607  	currLegacy, err := exec.Command("iptables-legacy-save", "-t", "nat").Output()
   608  	if err != nil {
   609  		return nil, fmt.Errorf("failed to save existing rules with error (%v) and output: %s", err, currLegacy)
   610  	}
   611  
   612  	// Restore empty legacy rules.
   613  	defer func() {
   614  		cmd := exec.Command("iptables-legacy-restore")
   615  		stdin, err := cmd.StdinPipe()
   616  		if err != nil {
   617  			log.Warningf("failed to get stdin pipe: %v", err)
   618  			return
   619  		}
   620  
   621  		go func() {
   622  			defer stdin.Close()
   623  			stdin.Write(currLegacy)
   624  		}()
   625  
   626  		if out, err := cmd.CombinedOutput(); err != nil {
   627  			log.Warningf("failed to restore iptables error (%v) with output: %s", err, out)
   628  		}
   629  	}()
   630  
   631  	// Pipe the output of iptables-nft-save to iptables-legacy-restore.
   632  	nftOut, err := exec.Command("iptables-nft-save", "-t", "nat").Output()
   633  	if err != nil {
   634  		return nil, fmt.Errorf("failed to run iptables-nft-save: %v", err)
   635  	}
   636  
   637  	cmd := exec.Command("iptables-legacy-restore")
   638  	stdin, err := cmd.StdinPipe()
   639  	if err != nil {
   640  		return nil, fmt.Errorf("failed to get stdin pipe: %v", err)
   641  	}
   642  
   643  	go func() {
   644  		defer stdin.Close()
   645  		stdin.Write(nftOut)
   646  	}()
   647  
   648  	if out, err := cmd.CombinedOutput(); err != nil {
   649  		return nil, fmt.Errorf("failed to restore iptables error (%v) with output: %s", err, out)
   650  	}
   651  
   652  	return writeNATBlob()
   653  }