github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/boot/network.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  	"net"
    21  	"os"
    22  	"runtime"
    23  	"strings"
    24  	"time"
    25  
    26  	"golang.org/x/sys/unix"
    27  	"github.com/metacubex/gvisor/pkg/hostos"
    28  	"github.com/metacubex/gvisor/pkg/log"
    29  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    30  	"github.com/metacubex/gvisor/pkg/sentry/socket/netfilter"
    31  	"github.com/metacubex/gvisor/pkg/tcpip"
    32  	"github.com/metacubex/gvisor/pkg/tcpip/link/ethernet"
    33  	"github.com/metacubex/gvisor/pkg/tcpip/link/fdbased"
    34  	"github.com/metacubex/gvisor/pkg/tcpip/link/loopback"
    35  	"github.com/metacubex/gvisor/pkg/tcpip/link/qdisc/fifo"
    36  	"github.com/metacubex/gvisor/pkg/tcpip/link/sniffer"
    37  	"github.com/metacubex/gvisor/pkg/tcpip/link/xdp"
    38  	"github.com/metacubex/gvisor/pkg/tcpip/network/ipv4"
    39  	"github.com/metacubex/gvisor/pkg/tcpip/network/ipv6"
    40  	"github.com/metacubex/gvisor/pkg/tcpip/stack"
    41  	"github.com/metacubex/gvisor/pkg/urpc"
    42  	"github.com/metacubex/gvisor/runsc/config"
    43  )
    44  
    45  var (
    46  	// DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and
    47  	// "::1/8" on "lo" interface.
    48  	DefaultLoopbackLink = LoopbackLink{
    49  		Name: "lo",
    50  		Addresses: []IPWithPrefix{
    51  			{Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8},
    52  			{Address: net.IPv6loopback, PrefixLen: 128},
    53  		},
    54  		Routes: []Route{
    55  			{
    56  				Destination: net.IPNet{
    57  					IP:   net.IPv4(0x7f, 0, 0, 0),
    58  					Mask: net.IPv4Mask(0xff, 0, 0, 0),
    59  				},
    60  			},
    61  			{
    62  				Destination: net.IPNet{
    63  					IP:   net.IPv6loopback,
    64  					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
    65  				},
    66  			},
    67  		},
    68  	}
    69  )
    70  
// Network exposes methods that can be used to configure a network stack.
//
// Stack is the network stack that CreateLinksAndRoutes populates with NICs,
// addresses, static neighbors and the route table. Kernel is consulted for
// its root user namespace when an iptables NAT ruleset is installed.
type Network struct {
	Stack  *stack.Stack
	Kernel *kernel.Kernel
}
    76  
// Route represents a route in the network stack.
//
// Destination is the network (IP and mask) the route matches; Gateway is the
// address copied into the Gateway field of the resulting tcpip.Route and may
// be left nil. A Route whose IP, mask and gateway are all nil is considered
// unset (see Empty).
type Route struct {
	Destination net.IPNet
	Gateway     net.IP
}
    82  
// DefaultRoute represents a catch all route to the default gateway.
//
// Name is the name of the interface the route is bound to. It must match the
// Name of one of the links passed in the same CreateLinksAndRoutes call,
// otherwise that call fails with an "invalid interface name" error.
type DefaultRoute struct {
	Route Route
	Name  string
}
    88  
// Neighbor is a static neighbor entry for a link: it associates the network
// address IP with the link-layer address HardwareAddr. CreateLinksAndRoutes
// installs each entry on the link's NIC via Stack.AddStaticNeighbor.
type Neighbor struct {
	IP           net.IP
	HardwareAddr net.HardwareAddr
}
    93  
// FDBasedLink configures an fd-based link.
//
// How CreateLinksAndRoutes uses the fields:
//   - Name is the NIC name and the key used to resolve default routes.
//   - MTU, GSOMaxSize, GvisorGSOEnabled, TXChecksumOffload and
//     RXChecksumOffload are forwarded to the fdbased endpoint options.
//   - LinkAddress is the endpoint's link address; the endpoint is configured
//     with an ethernet header only when it is non-empty.
//   - GvisorGROTimeout is forwarded to the NIC options.
//   - QDisc selects the queueing discipline; config.QDiscFIFO installs a
//     FIFO qdisc, config.QDiscNone installs none.
//   - Addresses, Routes and Neighbors are added to the NIC, the route table
//     and the static neighbor table respectively.
//
// InterfaceIndex is not read by CreateLinksAndRoutes for fd-based links.
type FDBasedLink struct {
	Name              string
	InterfaceIndex    int
	MTU               int
	Addresses         []IPWithPrefix
	Routes            []Route
	GSOMaxSize        uint32
	GvisorGSOEnabled  bool
	GvisorGROTimeout  time.Duration
	TXChecksumOffload bool
	RXChecksumOffload bool
	LinkAddress       net.HardwareAddr
	QDisc             config.QueueingDiscipline
	Neighbors         []Neighbor

	// NumChannels controls how many underlying FDs are to be used to
	// create this endpoint. CreateLinksAndRoutes takes exactly this many
	// files from the FilePayload for the link.
	NumChannels int
}
   114  
// BindOpt indicates whether the sentry or runsc process is responsible for
// binding the AF_XDP socket. The choice also determines how many FDs
// CreateLinksAndRoutes expects for an XDP link (see the constants below).
type BindOpt int
   118  
const (
	// BindSentry indicates the sentry process must call bind. It is the
	// zero value of BindOpt. An XDP link using it contributes four FDs to
	// the FilePayload: the AF_XDP socket plus three FDs that are only kept
	// open (program, sockmap and link).
	BindSentry BindOpt = iota

	// BindRunsc indicates the runsc process must call bind. An XDP link
	// using it contributes a single FD, the AF_XDP socket itself.
	BindRunsc
)
   126  
// XDPLink configures an XDP link.
//
// How CreateLinksAndRoutes uses the fields:
//   - Name is the NIC name and the key used to resolve default routes.
//   - InterfaceIndex, LinkAddress, TXChecksumOffload and RXChecksumOffload
//     are forwarded to the xdp endpoint options.
//   - Bind selects who binds the AF_XDP socket; the endpoint is asked to bind
//     only when it equals BindSentry.
//   - GvisorGROTimeout is forwarded to the NIC options.
//   - QDisc selects the queueing discipline; config.QDiscFIFO installs a
//     FIFO qdisc, config.QDiscNone installs none.
//   - Addresses, Routes and Neighbors are added to the NIC, the route table
//     and the static neighbor table respectively.
//
// MTU is not read by CreateLinksAndRoutes for XDP links.
type XDPLink struct {
	Name              string
	InterfaceIndex    int
	MTU               int
	Addresses         []IPWithPrefix
	Routes            []Route
	TXChecksumOffload bool
	RXChecksumOffload bool
	LinkAddress       net.HardwareAddr
	QDisc             config.QueueingDiscipline
	Neighbors         []Neighbor
	GvisorGROTimeout  time.Duration
	Bind              BindOpt

	// NumChannels controls how many underlying FDs are to be used to
	// create this endpoint.
	//
	// NOTE(review): CreateLinksAndRoutes only logs this value for XDP
	// links; the number of FDs it consumes is derived from Bind instead.
	NumChannels int
}
   146  
// LoopbackLink configures a loopback link.
//
// Name is the NIC name, Addresses are assigned to the NIC, Routes are added
// to the stack's route table, and GvisorGROTimeout is forwarded to the NIC
// options. Loopback links do not consume any FD from the FilePayload.
type LoopbackLink struct {
	Name             string
	Addresses        []IPWithPrefix
	Routes           []Route
	GvisorGROTimeout time.Duration
}
   154  
// CreateLinksAndRoutesArgs are arguments to CreateLinksAndRoutes.
type CreateLinksAndRoutesArgs struct {
	// FilePayload contains the fds associated with the links. The number
	// of fd's must equal the sum of the NumChannels field of the
	// FDBasedLink entries below, plus four (BindSentry) or one (BindRunsc)
	// per XDPLink entry, plus one if PCAP is set, plus one if NATBlob is
	// set; CreateLinksAndRoutes rejects any other count.
	urpc.FilePayload

	// LoopbackLinks are created first. FDBasedLinks and XDPLinks are
	// mutually exclusive: setting both is an error.
	LoopbackLinks []LoopbackLink
	FDBasedLinks  []FDBasedLink
	XDPLinks      []XDPLink

	// Defaultv4Gateway and Defaultv6Gateway are appended to the route
	// table after the per-link routes, unless their Route is empty.
	Defaultv4Gateway DefaultRoute
	Defaultv6Gateway DefaultRoute

	// PCAP indicates that FilePayload also contains a PCAP log file.
	PCAP bool

	// LogPackets indicates that packets should be logged. It is only
	// consulted when PCAP is false.
	LogPackets bool

	// NATBlob indicates whether FilePayload also contains an iptables NAT
	// ruleset.
	NATBlob bool
}
   179  
// IPWithPrefix is an address with its subnet prefix length. Its String
// method renders it as "address/prefixlen".
type IPWithPrefix struct {
	// Address is a network address. Both IPv4 and IPv6 addresses are
	// accepted; the protocol is derived from the address when it is added
	// to a NIC.
	Address net.IP

	// PrefixLen is the subnet prefix length, in bits.
	PrefixLen int
}
   188  
   189  func (ip IPWithPrefix) String() string {
   190  	return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen)
   191  }
   192  
   193  // Empty returns true if route hasn't been set.
   194  func (r *Route) Empty() bool {
   195  	return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil
   196  }
   197  
   198  func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) {
   199  	subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask))
   200  	if err != nil {
   201  		return tcpip.Route{}, err
   202  	}
   203  	return tcpip.Route{
   204  		Destination: subnet,
   205  		Gateway:     ipToAddress(r.Gateway),
   206  		NIC:         id,
   207  	}, nil
   208  }
   209  
   210  // CreateLinksAndRoutes creates links and routes in a network stack.  It should
   211  // only be called once.
   212  func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
   213  	if len(args.FDBasedLinks) > 0 && len(args.XDPLinks) > 0 {
   214  		return fmt.Errorf("received both fdbased and XDP links, but only one can be used at a time")
   215  	}
   216  	wantFDs := 0
   217  	for _, l := range args.FDBasedLinks {
   218  		wantFDs += l.NumChannels
   219  	}
   220  	for _, link := range args.XDPLinks {
   221  		// We have to keep several FDs alive when the sentry is
   222  		// responsible for binding, but when runsc binds we only expect
   223  		// the AF_XDP socket itself.
   224  		switch v := link.Bind; v {
   225  		case BindSentry:
   226  			wantFDs += 4
   227  		case BindRunsc:
   228  			wantFDs++
   229  		default:
   230  			return fmt.Errorf("unknown bind value: %d", v)
   231  		}
   232  	}
   233  	if args.PCAP {
   234  		wantFDs++
   235  	}
   236  	if args.NATBlob {
   237  		wantFDs++
   238  	}
   239  	if got := len(args.FilePayload.Files); got != wantFDs {
   240  		return fmt.Errorf("args.FilePayload.Files has %d FDs but we need %d entries based on FDBasedLinks, XDPLinks, and PCAP", got, wantFDs)
   241  	}
   242  
   243  	var nicID tcpip.NICID
   244  	nicids := make(map[string]tcpip.NICID)
   245  
   246  	// Collect routes from all links.
   247  	var routes []tcpip.Route
   248  
   249  	// Loopback normally appear before other interfaces.
   250  	for _, link := range args.LoopbackLinks {
   251  		nicID++
   252  		nicids[link.Name] = nicID
   253  
   254  		linkEP := ethernet.New(loopback.New())
   255  
   256  		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
   257  		opts := stack.NICOptions{
   258  			Name:               link.Name,
   259  			GROTimeout:         link.GvisorGROTimeout,
   260  			DeliverLinkPackets: true,
   261  		}
   262  		if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
   263  			return err
   264  		}
   265  
   266  		// Collect the routes from this link.
   267  		for _, r := range link.Routes {
   268  			route, err := r.toTcpipRoute(nicID)
   269  			if err != nil {
   270  				return err
   271  			}
   272  			routes = append(routes, route)
   273  		}
   274  	}
   275  
   276  	// Setup fdbased or XDP links.
   277  	fdOffset := 0
   278  	if len(args.FDBasedLinks) > 0 {
   279  		// Choose a dispatch mode.
   280  		dispatchMode := fdbased.RecvMMsg
   281  		version, err := hostos.KernelVersion()
   282  		if err != nil {
   283  			return err
   284  		}
   285  		if version.AtLeast(5, 6) {
   286  			dispatchMode = fdbased.PacketMMap
   287  		} else {
   288  			log.Infof("Host kernel version < 5.6, falling back to RecvMMsg dispatch")
   289  		}
   290  
   291  		for _, link := range args.FDBasedLinks {
   292  			nicID++
   293  			nicids[link.Name] = nicID
   294  
   295  			FDs := make([]int, 0, link.NumChannels)
   296  			for j := 0; j < link.NumChannels; j++ {
   297  				// Copy the underlying FD.
   298  				oldFD := args.FilePayload.Files[fdOffset].Fd()
   299  				newFD, err := unix.Dup(int(oldFD))
   300  				if err != nil {
   301  					return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
   302  				}
   303  				FDs = append(FDs, newFD)
   304  				fdOffset++
   305  			}
   306  
   307  			mac := tcpip.LinkAddress(link.LinkAddress)
   308  			log.Infof("gso max size is: %d", link.GSOMaxSize)
   309  
   310  			linkEP, err := fdbased.New(&fdbased.Options{
   311  				FDs:                FDs,
   312  				MTU:                uint32(link.MTU),
   313  				EthernetHeader:     mac != "",
   314  				Address:            mac,
   315  				PacketDispatchMode: dispatchMode,
   316  				GSOMaxSize:         link.GSOMaxSize,
   317  				GvisorGSOEnabled:   link.GvisorGSOEnabled,
   318  				TXChecksumOffload:  link.TXChecksumOffload,
   319  				RXChecksumOffload:  link.RXChecksumOffload,
   320  			})
   321  			if err != nil {
   322  				return err
   323  			}
   324  
   325  			// Setup packet logging if requested.
   326  			if args.PCAP {
   327  				newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd()))
   328  				if err != nil {
   329  					return fmt.Errorf("failed to dup pcap FD: %v", err)
   330  				}
   331  				const packetTruncateSize = 4096
   332  				linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize)
   333  				if err != nil {
   334  					return fmt.Errorf("failed to create PCAP logger: %v", err)
   335  				}
   336  				fdOffset++
   337  			} else if args.LogPackets {
   338  				linkEP = sniffer.New(linkEP)
   339  			}
   340  
   341  			var qDisc stack.QueueingDiscipline
   342  			switch link.QDisc {
   343  			case config.QDiscNone:
   344  			case config.QDiscFIFO:
   345  				log.Infof("Enabling FIFO QDisc on %q", link.Name)
   346  				qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
   347  			}
   348  
   349  			log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
   350  			opts := stack.NICOptions{
   351  				Name:               link.Name,
   352  				QDisc:              qDisc,
   353  				GROTimeout:         link.GvisorGROTimeout,
   354  				DeliverLinkPackets: true,
   355  			}
   356  			if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
   357  				return err
   358  			}
   359  
   360  			// Collect the routes from this link.
   361  			for _, r := range link.Routes {
   362  				route, err := r.toTcpipRoute(nicID)
   363  				if err != nil {
   364  					return err
   365  				}
   366  				routes = append(routes, route)
   367  			}
   368  
   369  			for _, neigh := range link.Neighbors {
   370  				proto, tcpipAddr := ipToAddressAndProto(neigh.IP)
   371  				n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr))
   372  			}
   373  		}
   374  	} else if len(args.XDPLinks) > 0 {
   375  		if nlinks := len(args.XDPLinks); nlinks > 1 {
   376  			return fmt.Errorf("XDP only supports one link device, but got %d", nlinks)
   377  		}
   378  		link := args.XDPLinks[0]
   379  		nicID++
   380  		nicids[link.Name] = nicID
   381  
   382  		// Get the AF_XDP socket.
   383  		oldFD := args.FilePayload.Files[fdOffset].Fd()
   384  		fd, err := unix.Dup(int(oldFD))
   385  		if err != nil {
   386  			return fmt.Errorf("failed to dup AF_XDP fd %v: %v", oldFD, err)
   387  		}
   388  		fdOffset++
   389  
   390  		// When the sentry is responsible for binding, the runsc
   391  		// process sends several other FDs in order to keep them open
   392  		// and alive. These are for BPF programs and maps that, if
   393  		// closed, will break the dispatcher.
   394  		if link.Bind == BindSentry {
   395  			for _, fdName := range []string{"program-fd", "sockmap-fd", "link-fd"} {
   396  				oldFD := args.FilePayload.Files[fdOffset].Fd()
   397  				if _, err := unix.Dup(int(oldFD)); err != nil {
   398  					return fmt.Errorf("failed to dup %s with FD %d: %v", fdName, oldFD, err)
   399  				}
   400  				fdOffset++
   401  			}
   402  		}
   403  
   404  		// Setup packet logging if requested.
   405  		mac := tcpip.LinkAddress(link.LinkAddress)
   406  		linkEP, err := xdp.New(&xdp.Options{
   407  			FD:                fd,
   408  			Address:           mac,
   409  			TXChecksumOffload: link.TXChecksumOffload,
   410  			RXChecksumOffload: link.RXChecksumOffload,
   411  			InterfaceIndex:    link.InterfaceIndex,
   412  			Bind:              link.Bind == BindSentry,
   413  		})
   414  		if err != nil {
   415  			return err
   416  		}
   417  
   418  		if args.PCAP {
   419  			newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd()))
   420  			if err != nil {
   421  				return fmt.Errorf("failed to dup pcap FD: %v", err)
   422  			}
   423  			const packetTruncateSize = 4096
   424  			linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize)
   425  			if err != nil {
   426  				return fmt.Errorf("failed to create PCAP logger: %v", err)
   427  			}
   428  			fdOffset++
   429  		} else if args.LogPackets {
   430  			linkEP = sniffer.New(linkEP)
   431  		}
   432  
   433  		var qDisc stack.QueueingDiscipline
   434  		switch link.QDisc {
   435  		case config.QDiscNone:
   436  		case config.QDiscFIFO:
   437  			log.Infof("Enabling FIFO QDisc on %q", link.Name)
   438  			qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
   439  		}
   440  
   441  		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
   442  		opts := stack.NICOptions{
   443  			Name:               link.Name,
   444  			QDisc:              qDisc,
   445  			GROTimeout:         link.GvisorGROTimeout,
   446  			DeliverLinkPackets: true,
   447  		}
   448  		if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
   449  			return err
   450  		}
   451  
   452  		// Collect the routes from this link.
   453  		for _, r := range link.Routes {
   454  			route, err := r.toTcpipRoute(nicID)
   455  			if err != nil {
   456  				return err
   457  			}
   458  			routes = append(routes, route)
   459  		}
   460  
   461  		for _, neigh := range link.Neighbors {
   462  			proto, tcpipAddr := ipToAddressAndProto(neigh.IP)
   463  			n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr))
   464  		}
   465  	}
   466  
   467  	if !args.Defaultv4Gateway.Route.Empty() {
   468  		nicID, ok := nicids[args.Defaultv4Gateway.Name]
   469  		if !ok {
   470  			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
   471  		}
   472  		route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
   473  		if err != nil {
   474  			return err
   475  		}
   476  		routes = append(routes, route)
   477  	}
   478  
   479  	if !args.Defaultv6Gateway.Route.Empty() {
   480  		nicID, ok := nicids[args.Defaultv6Gateway.Name]
   481  		if !ok {
   482  			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
   483  		}
   484  		route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
   485  		if err != nil {
   486  			return err
   487  		}
   488  		routes = append(routes, route)
   489  	}
   490  
   491  	log.Infof("Setting routes %+v", routes)
   492  	n.Stack.SetRouteTable(routes)
   493  
   494  	// Set NAT table rules if necessary.
   495  	if args.NATBlob {
   496  		log.Infof("Replacing NAT table")
   497  		iptReplaceBlob, err := io.ReadAll(args.FilePayload.Files[fdOffset])
   498  		if err != nil {
   499  			return fmt.Errorf("failed to read iptables blob: %v", err)
   500  		}
   501  		fdOffset++
   502  		if err := netfilter.SetEntries(n.Kernel.RootUserNamespace(), n.Stack, iptReplaceBlob, false); err != nil {
   503  			return fmt.Errorf("failed to SetEntries: %v", err)
   504  		}
   505  	}
   506  
   507  	return nil
   508  }
   509  
   510  // createNICWithAddrs creates a NIC in the network stack and adds the given
   511  // addresses.
   512  func (n *Network) createNICWithAddrs(id tcpip.NICID, ep stack.LinkEndpoint, opts stack.NICOptions, addrs []IPWithPrefix) error {
   513  	if err := n.Stack.CreateNICWithOptions(id, ep, opts); err != nil {
   514  		return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err)
   515  	}
   516  
   517  	for _, addr := range addrs {
   518  		proto, tcpipAddr := ipToAddressAndProto(addr.Address)
   519  		protocolAddr := tcpip.ProtocolAddress{
   520  			Protocol: proto,
   521  			AddressWithPrefix: tcpip.AddressWithPrefix{
   522  				Address:   tcpipAddr,
   523  				PrefixLen: addr.PrefixLen,
   524  			},
   525  		}
   526  		if err := n.Stack.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil {
   527  			return fmt.Errorf("AddProtocolAddress(%d, %+v, {}) failed: %s", id, protocolAddr, err)
   528  		}
   529  	}
   530  	return nil
   531  }
   532  
   533  // ipToAddressAndProto converts IP to tcpip.Address and a protocol number.
   534  //
   535  // Note: don't use 'len(ip)' to determine IP version because length is always 16.
   536  func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) {
   537  	if i4 := ip.To4(); i4 != nil {
   538  		return ipv4.ProtocolNumber, tcpip.AddrFromSlice(i4)
   539  	}
   540  	return ipv6.ProtocolNumber, tcpip.AddrFromSlice(ip)
   541  }
   542  
   543  // ipToAddress converts IP to tcpip.Address, ignoring the protocol.
   544  func ipToAddress(ip net.IP) tcpip.Address {
   545  	_, addr := ipToAddressAndProto(ip)
   546  	return addr
   547  }
   548  
   549  // ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the
   550  // protocol.
   551  func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask {
   552  	addr := ipToAddress(net.IP(ipMask))
   553  	return tcpip.MaskFromBytes(addr.AsSlice())
   554  }