gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/network.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  	"net"
    21  	"os"
    22  	"runtime"
    23  	"strings"
    24  
    25  	"golang.org/x/sys/unix"
    26  	"gvisor.dev/gvisor/pkg/hostos"
    27  	"gvisor.dev/gvisor/pkg/log"
    28  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    29  	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
    30  	"gvisor.dev/gvisor/pkg/tcpip"
    31  	"gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
    32  	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
    33  	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
    34  	"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
    35  	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
    36  	"gvisor.dev/gvisor/pkg/tcpip/link/xdp"
    37  	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
    38  	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
    39  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    40  	"gvisor.dev/gvisor/pkg/urpc"
    41  	"gvisor.dev/gvisor/runsc/config"
    42  )
    43  
var (
	// DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and
	// "::1/128" on "lo" interface.
	DefaultLoopbackLink = LoopbackLink{
		Name: "lo",
		Addresses: []IPWithPrefix{
			// IPv4 loopback address, spelled out in its 4-byte form.
			{Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8},
			{Address: net.IPv6loopback, PrefixLen: 128},
		},
		Routes: []Route{
			// 127.0.0.0/8. No gateway is set.
			{
				Destination: net.IPNet{
					IP:   net.IPv4(0x7f, 0, 0, 0),
					Mask: net.IPv4Mask(0xff, 0, 0, 0),
				},
			},
			// ::1/128: the mask is net.IPv6len bytes of 0xff. No gateway is
			// set.
			{
				Destination: net.IPNet{
					IP:   net.IPv6loopback,
					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
				},
			},
		},
	}
)
    69  
// Network exposes methods that can be used to configure a network stack.
//
// Stack is the network stack on which NICs, addresses, static neighbors and
// the route table are installed. Kernel supplies the root user namespace that
// is passed to netfilter when the NAT table is replaced.
type Network struct {
	Stack  *stack.Stack
	Kernel *kernel.Kernel
}
    75  
// Route represents a route in the network stack.
//
// Destination is the subnet the route matches and Gateway is the next hop.
// Gateway may be left nil, as the routes of DefaultLoopbackLink do. A Route
// whose Destination.IP, Destination.Mask and Gateway are all nil is treated
// as unset (see Empty).
type Route struct {
	Destination net.IPNet
	Gateway     net.IP
}
    81  
// DefaultRoute represents a catch all route to the default gateway.
//
// Name is the name of the link the route is attached to. It must match the
// Name of one of the links passed to CreateLinksAndRoutes, otherwise that
// call fails. A DefaultRoute whose Route is empty is ignored.
type DefaultRoute struct {
	Route Route
	Name  string
}
    87  
// Neighbor is a static neighbor entry for a link: it associates the network
// address IP with the link-layer address HardwareAddr. CreateLinksAndRoutes
// installs each entry on the link's NIC via Stack.AddStaticNeighbor.
type Neighbor struct {
	IP           net.IP
	HardwareAddr net.HardwareAddr
}
    92  
// FDBasedLink configures an fd-based link.
//
// Name becomes the NIC name and is the key default gateways refer to.
// Addresses, Routes and Neighbors are installed on the NIC created for the
// link. MTU, GSOMaxSize, GVisorGSOEnabled, GVisorGRO, TXChecksumOffload,
// RXChecksumOffload and ProcessorsPerChannel are forwarded to the fdbased
// endpoint options. LinkAddress is the endpoint's link-layer address; the
// Ethernet header is enabled only when it is non-empty. QDisc selects the
// queueing discipline placed in front of the endpoint.
//
// NOTE(review): InterfaceIndex is not read by CreateLinksAndRoutes for
// fd-based links in this file; presumably it is the host interface index.
type FDBasedLink struct {
	Name              string
	InterfaceIndex    int
	MTU               int
	Addresses         []IPWithPrefix
	Routes            []Route
	GSOMaxSize        uint32
	GVisorGSOEnabled  bool
	GVisorGRO         bool
	TXChecksumOffload bool
	RXChecksumOffload bool
	LinkAddress       net.HardwareAddr
	QDisc             config.QueueingDiscipline
	Neighbors         []Neighbor

	// NumChannels controls how many underlying FDs are to be used to
	// create this endpoint. CreateLinksAndRoutes takes this many entries
	// from the file payload for the link.
	NumChannels int

	// ProcessorsPerChannel controls how many goroutines are used to handle
	// packets on each channel.
	ProcessorsPerChannel int
}
   117  
// BindOpt indicates whether the sentry or runsc process is responsible for
// binding the AF_XDP socket.
//
// The choice also determines how many FDs CreateLinksAndRoutes expects for
// an XDP link: four with BindSentry (the AF_XDP socket plus three FDs that
// are only kept open), one with BindRunsc (the AF_XDP socket alone).
type BindOpt int

const (
	// BindSentry indicates the sentry process must call bind. It is the
	// zero value of BindOpt.
	BindSentry BindOpt = iota

	// BindRunsc indicates the runsc process must call bind.
	BindRunsc
)
   129  
// XDPLink configures an XDP link.
//
// CreateLinksAndRoutes accepts at most one XDPLink. Name becomes the NIC name
// and is the key default gateways refer to. Addresses, Routes and Neighbors
// are installed on the NIC created for the link. LinkAddress,
// TXChecksumOffload, RXChecksumOffload, InterfaceIndex and GVisorGRO are
// forwarded to the xdp endpoint options, and Bind decides whether the
// endpoint is asked to bind the socket. QDisc selects the queueing discipline
// placed in front of the endpoint.
//
// NOTE(review): in this file MTU is not read for XDP links and NumChannels is
// only logged; the number of FDs consumed depends on Bind.
type XDPLink struct {
	Name              string
	InterfaceIndex    int
	MTU               int
	Addresses         []IPWithPrefix
	Routes            []Route
	TXChecksumOffload bool
	RXChecksumOffload bool
	LinkAddress       net.HardwareAddr
	QDisc             config.QueueingDiscipline
	Neighbors         []Neighbor
	GVisorGRO         bool
	Bind              BindOpt

	// NumChannels controls how many underlying FDs are to be used to
	// create this endpoint.
	NumChannels int
}
   149  
// LoopbackLink configures a loopback link.
//
// Name becomes the NIC name. Addresses are added to the NIC and Routes are
// added to the stack's route table.
//
// NOTE(review): GVisorGRO is not read by CreateLinksAndRoutes for loopback
// links in this file.
type LoopbackLink struct {
	Name      string
	Addresses []IPWithPrefix
	Routes    []Route
	GVisorGRO bool
}
   157  
// CreateLinksAndRoutesArgs are arguments to CreateLinksAndRoutes.
type CreateLinksAndRoutesArgs struct {
	// FilePayload contains the fds associated with the links, followed by
	// the optional PCAP file and the optional NAT blob, in that order. The
	// number of fd's must equal the sum of the NumChannels field of the
	// FDBasedLink entries below, plus four (BindSentry) or one (BindRunsc)
	// per XDPLink entry, plus one if PCAP is set, plus one if NATBlob is
	// set.
	urpc.FilePayload

	// LoopbackLinks are created first, then either FDBasedLinks or
	// XDPLinks. Setting both FDBasedLinks and XDPLinks is an error.
	LoopbackLinks []LoopbackLink
	FDBasedLinks  []FDBasedLink
	XDPLinks      []XDPLink

	// Defaultv4Gateway and Defaultv6Gateway are appended to the route
	// table after the routes of all links. An empty route is skipped.
	Defaultv4Gateway DefaultRoute
	Defaultv6Gateway DefaultRoute

	// PCAP indicates that FilePayload also contains a PCAP log file.
	PCAP bool

	// LogPackets indicates that packets should be logged. It is only
	// consulted when PCAP is not set.
	LogPackets bool

	// NATBlob indicates whether FilePayload also contains an iptables NAT
	// ruleset.
	NATBlob bool

	// DisconnectOk indicates that link endpoints should have the capability
	// CapabilityDisconnectOk set.
	DisconnectOk bool
}
   186  
   187  // IPWithPrefix is an address with its subnet prefix length.
   188  type IPWithPrefix struct {
   189  	// Address is a network address.
   190  	Address net.IP
   191  
   192  	// PrefixLen is the subnet prefix length.
   193  	PrefixLen int
   194  }
   195  
   196  func (ip IPWithPrefix) String() string {
   197  	return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen)
   198  }
   199  
   200  // Empty returns true if route hasn't been set.
   201  func (r *Route) Empty() bool {
   202  	return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil
   203  }
   204  
   205  func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) {
   206  	subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask))
   207  	if err != nil {
   208  		return tcpip.Route{}, err
   209  	}
   210  	return tcpip.Route{
   211  		Destination: subnet,
   212  		Gateway:     ipToAddress(r.Gateway),
   213  		NIC:         id,
   214  	}, nil
   215  }
   216  
   217  // CreateLinksAndRoutes creates links and routes in a network stack.  It should
   218  // only be called once.
   219  func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
   220  	if len(args.FDBasedLinks) > 0 && len(args.XDPLinks) > 0 {
   221  		return fmt.Errorf("received both fdbased and XDP links, but only one can be used at a time")
   222  	}
   223  	wantFDs := 0
   224  	for _, l := range args.FDBasedLinks {
   225  		wantFDs += l.NumChannels
   226  	}
   227  	for _, link := range args.XDPLinks {
   228  		// We have to keep several FDs alive when the sentry is
   229  		// responsible for binding, but when runsc binds we only expect
   230  		// the AF_XDP socket itself.
   231  		switch v := link.Bind; v {
   232  		case BindSentry:
   233  			wantFDs += 4
   234  		case BindRunsc:
   235  			wantFDs++
   236  		default:
   237  			return fmt.Errorf("unknown bind value: %d", v)
   238  		}
   239  	}
   240  	if args.PCAP {
   241  		wantFDs++
   242  	}
   243  	if args.NATBlob {
   244  		wantFDs++
   245  	}
   246  	if got := len(args.FilePayload.Files); got != wantFDs {
   247  		return fmt.Errorf("args.FilePayload.Files has %d FDs but we need %d entries based on FDBasedLinks, XDPLinks, and PCAP", got, wantFDs)
   248  	}
   249  
   250  	var nicID tcpip.NICID
   251  	nicids := make(map[string]tcpip.NICID)
   252  
   253  	// Collect routes from all links.
   254  	var routes []tcpip.Route
   255  
   256  	// Loopback normally appear before other interfaces.
   257  	for _, link := range args.LoopbackLinks {
   258  		nicID++
   259  		nicids[link.Name] = nicID
   260  
   261  		linkEP := ethernet.New(loopback.New())
   262  
   263  		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
   264  		opts := stack.NICOptions{
   265  			Name:               link.Name,
   266  			DeliverLinkPackets: true,
   267  		}
   268  		if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
   269  			return err
   270  		}
   271  
   272  		// Collect the routes from this link.
   273  		for _, r := range link.Routes {
   274  			route, err := r.toTcpipRoute(nicID)
   275  			if err != nil {
   276  				return err
   277  			}
   278  			routes = append(routes, route)
   279  		}
   280  	}
   281  
   282  	// Setup fdbased or XDP links.
   283  	fdOffset := 0
   284  	if len(args.FDBasedLinks) > 0 {
   285  		// Choose a dispatch mode.
   286  		dispatchMode := fdbased.RecvMMsg
   287  		version, err := hostos.KernelVersion()
   288  		if err != nil {
   289  			return err
   290  		}
   291  		if version.AtLeast(5, 6) {
   292  			// TODO(b/333120887): Switch back to using the packet mmap dispatcher when
   293  			// we have the performance data to justify it.
   294  			// dispatchMode = fdbased.PacketMMap
   295  			// log.Infof("Host kernel version >= 5.6, using to packet mmap to dispatch")
   296  		} else {
   297  			log.Infof("Host kernel version < 5.6, using to RecvMMsg to dispatch")
   298  		}
   299  
   300  		for _, link := range args.FDBasedLinks {
   301  			nicID++
   302  			nicids[link.Name] = nicID
   303  
   304  			FDs := make([]int, 0, link.NumChannels)
   305  			for j := 0; j < link.NumChannels; j++ {
   306  				// Copy the underlying FD.
   307  				oldFD := args.FilePayload.Files[fdOffset].Fd()
   308  				newFD, err := unix.Dup(int(oldFD))
   309  				if err != nil {
   310  					return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
   311  				}
   312  				FDs = append(FDs, newFD)
   313  				fdOffset++
   314  			}
   315  
   316  			mac := tcpip.LinkAddress(link.LinkAddress)
   317  			log.Infof("gso max size is: %d", link.GSOMaxSize)
   318  
   319  			linkEP, err := fdbased.New(&fdbased.Options{
   320  				FDs:                  FDs,
   321  				MTU:                  uint32(link.MTU),
   322  				EthernetHeader:       mac != "",
   323  				Address:              mac,
   324  				PacketDispatchMode:   dispatchMode,
   325  				GSOMaxSize:           link.GSOMaxSize,
   326  				GVisorGSOEnabled:     link.GVisorGSOEnabled,
   327  				TXChecksumOffload:    link.TXChecksumOffload,
   328  				RXChecksumOffload:    link.RXChecksumOffload,
   329  				GRO:                  link.GVisorGRO,
   330  				ProcessorsPerChannel: link.ProcessorsPerChannel,
   331  				DisconnectOk:         args.DisconnectOk,
   332  			})
   333  			if err != nil {
   334  				return err
   335  			}
   336  
   337  			// Setup packet logging if requested.
   338  			if args.PCAP {
   339  				newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd()))
   340  				if err != nil {
   341  					return fmt.Errorf("failed to dup pcap FD: %v", err)
   342  				}
   343  				const packetTruncateSize = 4096
   344  				linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize)
   345  				if err != nil {
   346  					return fmt.Errorf("failed to create PCAP logger: %v", err)
   347  				}
   348  				fdOffset++
   349  			} else if args.LogPackets {
   350  				linkEP = sniffer.New(linkEP)
   351  			}
   352  
   353  			var qDisc stack.QueueingDiscipline
   354  			switch link.QDisc {
   355  			case config.QDiscNone:
   356  			case config.QDiscFIFO:
   357  				log.Infof("Enabling FIFO QDisc on %q", link.Name)
   358  				qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
   359  			}
   360  
   361  			log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
   362  			opts := stack.NICOptions{
   363  				Name:               link.Name,
   364  				QDisc:              qDisc,
   365  				DeliverLinkPackets: true,
   366  			}
   367  			if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
   368  				return err
   369  			}
   370  
   371  			// Collect the routes from this link.
   372  			for _, r := range link.Routes {
   373  				route, err := r.toTcpipRoute(nicID)
   374  				if err != nil {
   375  					return err
   376  				}
   377  				routes = append(routes, route)
   378  			}
   379  
   380  			for _, neigh := range link.Neighbors {
   381  				proto, tcpipAddr := ipToAddressAndProto(neigh.IP)
   382  				n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr))
   383  			}
   384  		}
   385  	} else if len(args.XDPLinks) > 0 {
   386  		if nlinks := len(args.XDPLinks); nlinks > 1 {
   387  			return fmt.Errorf("XDP only supports one link device, but got %d", nlinks)
   388  		}
   389  		link := args.XDPLinks[0]
   390  		nicID++
   391  		nicids[link.Name] = nicID
   392  
   393  		// Get the AF_XDP socket.
   394  		oldFD := args.FilePayload.Files[fdOffset].Fd()
   395  		fd, err := unix.Dup(int(oldFD))
   396  		if err != nil {
   397  			return fmt.Errorf("failed to dup AF_XDP fd %v: %v", oldFD, err)
   398  		}
   399  		fdOffset++
   400  
   401  		// When the sentry is responsible for binding, the runsc
   402  		// process sends several other FDs in order to keep them open
   403  		// and alive. These are for BPF programs and maps that, if
   404  		// closed, will break the dispatcher.
   405  		if link.Bind == BindSentry {
   406  			for _, fdName := range []string{"program-fd", "sockmap-fd", "link-fd"} {
   407  				oldFD := args.FilePayload.Files[fdOffset].Fd()
   408  				if _, err := unix.Dup(int(oldFD)); err != nil {
   409  					return fmt.Errorf("failed to dup %s with FD %d: %v", fdName, oldFD, err)
   410  				}
   411  				fdOffset++
   412  			}
   413  		}
   414  
   415  		// Setup packet logging if requested.
   416  		mac := tcpip.LinkAddress(link.LinkAddress)
   417  		linkEP, err := xdp.New(&xdp.Options{
   418  			FD:                fd,
   419  			Address:           mac,
   420  			TXChecksumOffload: link.TXChecksumOffload,
   421  			RXChecksumOffload: link.RXChecksumOffload,
   422  			InterfaceIndex:    link.InterfaceIndex,
   423  			Bind:              link.Bind == BindSentry,
   424  			GRO:               link.GVisorGRO,
   425  			DisconnectOk:      args.DisconnectOk,
   426  		})
   427  		if err != nil {
   428  			return err
   429  		}
   430  
   431  		if args.PCAP {
   432  			newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd()))
   433  			if err != nil {
   434  				return fmt.Errorf("failed to dup pcap FD: %v", err)
   435  			}
   436  			const packetTruncateSize = 4096
   437  			linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize)
   438  			if err != nil {
   439  				return fmt.Errorf("failed to create PCAP logger: %v", err)
   440  			}
   441  			fdOffset++
   442  		} else if args.LogPackets {
   443  			linkEP = sniffer.New(linkEP)
   444  		}
   445  
   446  		var qDisc stack.QueueingDiscipline
   447  		switch link.QDisc {
   448  		case config.QDiscNone:
   449  		case config.QDiscFIFO:
   450  			log.Infof("Enabling FIFO QDisc on %q", link.Name)
   451  			qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
   452  		}
   453  
   454  		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
   455  		opts := stack.NICOptions{
   456  			Name:               link.Name,
   457  			QDisc:              qDisc,
   458  			DeliverLinkPackets: true,
   459  		}
   460  		if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
   461  			return err
   462  		}
   463  
   464  		// Collect the routes from this link.
   465  		for _, r := range link.Routes {
   466  			route, err := r.toTcpipRoute(nicID)
   467  			if err != nil {
   468  				return err
   469  			}
   470  			routes = append(routes, route)
   471  		}
   472  
   473  		for _, neigh := range link.Neighbors {
   474  			proto, tcpipAddr := ipToAddressAndProto(neigh.IP)
   475  			n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr))
   476  		}
   477  	}
   478  
   479  	if !args.Defaultv4Gateway.Route.Empty() {
   480  		nicID, ok := nicids[args.Defaultv4Gateway.Name]
   481  		if !ok {
   482  			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
   483  		}
   484  		route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
   485  		if err != nil {
   486  			return err
   487  		}
   488  		routes = append(routes, route)
   489  	}
   490  
   491  	if !args.Defaultv6Gateway.Route.Empty() {
   492  		nicID, ok := nicids[args.Defaultv6Gateway.Name]
   493  		if !ok {
   494  			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
   495  		}
   496  		route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
   497  		if err != nil {
   498  			return err
   499  		}
   500  		routes = append(routes, route)
   501  	}
   502  
   503  	log.Infof("Setting routes %+v", routes)
   504  	n.Stack.SetRouteTable(routes)
   505  
   506  	// Set NAT table rules if necessary.
   507  	if args.NATBlob {
   508  		log.Infof("Replacing NAT table")
   509  		iptReplaceBlob, err := io.ReadAll(args.FilePayload.Files[fdOffset])
   510  		if err != nil {
   511  			return fmt.Errorf("failed to read iptables blob: %v", err)
   512  		}
   513  		fdOffset++
   514  		if err := netfilter.SetEntries(n.Kernel.RootUserNamespace(), n.Stack, iptReplaceBlob, false); err != nil {
   515  			return fmt.Errorf("failed to SetEntries: %v", err)
   516  		}
   517  	}
   518  
   519  	return nil
   520  }
   521  
   522  // createNICWithAddrs creates a NIC in the network stack and adds the given
   523  // addresses.
   524  func (n *Network) createNICWithAddrs(id tcpip.NICID, ep stack.LinkEndpoint, opts stack.NICOptions, addrs []IPWithPrefix) error {
   525  	if err := n.Stack.CreateNICWithOptions(id, ep, opts); err != nil {
   526  		return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err)
   527  	}
   528  
   529  	for _, addr := range addrs {
   530  		proto, tcpipAddr := ipToAddressAndProto(addr.Address)
   531  		protocolAddr := tcpip.ProtocolAddress{
   532  			Protocol: proto,
   533  			AddressWithPrefix: tcpip.AddressWithPrefix{
   534  				Address:   tcpipAddr,
   535  				PrefixLen: addr.PrefixLen,
   536  			},
   537  		}
   538  		if err := n.Stack.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil {
   539  			return fmt.Errorf("AddProtocolAddress(%d, %+v, {}) failed: %s", id, protocolAddr, err)
   540  		}
   541  	}
   542  	return nil
   543  }
   544  
   545  // ipToAddressAndProto converts IP to tcpip.Address and a protocol number.
   546  //
   547  // Note: don't use 'len(ip)' to determine IP version because length is always 16.
   548  func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) {
   549  	if i4 := ip.To4(); i4 != nil {
   550  		return ipv4.ProtocolNumber, tcpip.AddrFromSlice(i4)
   551  	}
   552  	return ipv6.ProtocolNumber, tcpip.AddrFromSlice(ip)
   553  }
   554  
   555  // ipToAddress converts IP to tcpip.Address, ignoring the protocol.
   556  func ipToAddress(ip net.IP) tcpip.Address {
   557  	_, addr := ipToAddressAndProto(ip)
   558  	return addr
   559  }
   560  
   561  // ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the
   562  // protocol.
   563  func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask {
   564  	addr := ipToAddress(net.IP(ipMask))
   565  	return tcpip.MaskFromBytes(addr.AsSlice())
   566  }