gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/link/fdbased/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  // Package fdbased provides the implementation of data-link layer endpoints
    19  // backed by boundary-preserving file descriptors (e.g., TUN devices,
    20  // seqpacket/datagram sockets).
    21  //
    22  // FD based endpoints can be used in the networking stack by calling New() to
    23  // create a new endpoint, and then passing it as an argument to
    24  // Stack.CreateNIC().
    25  //
    26  // FD based endpoints can use more than one file descriptor to read incoming
    27  // packets. If there are more than one FDs specified and the underlying FD is an
    28  // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the
    29  // host kernel will consistently hash the packets to the sockets. This ensures
    30  // that packets for the same TCP streams are not reordered.
    31  //
    32  // Similarly if more than one FD's are specified where the underlying FD is not
    33  // AF_PACKET then it's the caller's responsibility to ensure that all inbound
    34  // packets on the descriptors are consistently 5 tuple hashed to one of the
    35  // descriptors to prevent TCP reordering.
    36  //
// Outbound packets are spread over the FDs as well: each packet is written to
// the FD selected by pkt.Hash modulo the number of FDs. The one exception is
// InjectOutbound, which always writes to the first FD.
    41  package fdbased
    42  
    43  import (
    44  	"fmt"
    45  	"runtime"
    46  
    47  	"golang.org/x/sys/unix"
    48  	"gvisor.dev/gvisor/pkg/atomicbitops"
    49  	"gvisor.dev/gvisor/pkg/buffer"
    50  	"gvisor.dev/gvisor/pkg/sync"
    51  	"gvisor.dev/gvisor/pkg/tcpip"
    52  	"gvisor.dev/gvisor/pkg/tcpip/header"
    53  	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
    54  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    55  )
    56  
// linkDispatcher reads packets from the link FD and dispatches them to the
// NetworkDispatcher.
type linkDispatcher interface {
	// Stop is called on every inbound dispatcher by endpoint.Attach when it is
	// handed a nil NetworkDispatcher (the NIC is being removed), before Attach
	// waits for the dispatch goroutines to finish.
	Stop()
	// dispatch is called repeatedly by endpoint.dispatchLoop. The loop keeps
	// running only while dispatch returns true together with a nil error.
	dispatch() (bool, tcpip.Error)
	// release is called by endpoint.dispatchLoop once the loop has ended.
	release()
}
    64  
    65  // PacketDispatchMode are the various supported methods of receiving and
    66  // dispatching packets from the underlying FD.
    67  type PacketDispatchMode int
    68  
    69  // BatchSize is the number of packets to write in each syscall. It is 47
    70  // because when GVisorGSO is in use then a single 65KB TCP segment can get
    71  // split into 46 segments of 1420 bytes and a single 216 byte segment.
    72  const BatchSize = 47
    73  
    74  const (
    75  	// Readv is the default dispatch mode and is the least performant of the
    76  	// dispatch options but the one that is supported by all underlying FD
    77  	// types.
    78  	Readv PacketDispatchMode = iota
    79  	// RecvMMsg enables use of recvmmsg() syscall instead of readv() to
    80  	// read inbound packets. This reduces # of syscalls needed to process
    81  	// packets.
    82  	//
    83  	// NOTE: recvmmsg() is only supported for sockets, so if the underlying
    84  	// FD is not a socket then the code will still fall back to the readv()
    85  	// path.
    86  	RecvMMsg
    87  	// PacketMMap enables use of PACKET_RX_RING to receive packets from the
    88  	// NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
    89  	// primary use-case for this is runsc which uses an AF_PACKET FD to
    90  	// receive packets from the veth device.
    91  	PacketMMap
    92  )
    93  
    94  func (p PacketDispatchMode) String() string {
    95  	switch p {
    96  	case Readv:
    97  		return "Readv"
    98  	case RecvMMsg:
    99  		return "RecvMMsg"
   100  	case PacketMMap:
   101  		return "PacketMMap"
   102  	default:
   103  		return fmt.Sprintf("unknown packet dispatch mode '%d'", p)
   104  	}
   105  }
   106  
// Compile-time checks that *endpoint satisfies stack.LinkEndpoint and
// stack.GSOEndpoint.
var _ stack.LinkEndpoint = (*endpoint)(nil)
var _ stack.GSOEndpoint = (*endpoint)(nil)

// fdInfo describes one file descriptor owned by an endpoint: fd is the
// descriptor itself and isSocket records the result of isSocketFD for it.
// sendBatch only uses sendmmsg() on descriptors whose isSocket is true.
type fdInfo struct {
	fd       int
	isSocket bool
}
   114  
// endpoint is the FD-backed link endpoint built by New (and embedded in
// InjectableEndpoint).
type endpoint struct {
	// fds is the set of file descriptors each identifying one inbound/outbound
	// channel. The endpoint will dispatch from all inbound channels as well as
	// hash outbound packets to specific channels based on the packet hash.
	fds []fdInfo

	// mtu (maximum transmission unit) is the maximum size of a packet.
	mtu uint32

	// hdrSize specifies the link-layer header size. If set to 0, no header
	// is added/removed; otherwise an ethernet header is used.
	hdrSize int

	// addr is the address of the endpoint.
	addr tcpip.LinkAddress

	// caps holds the endpoint capabilities.
	caps stack.LinkEndpointCapabilities

	// closed is a function to be called when the FD's peer (if any) closes
	// its end of the communication pipe. dispatchLoop calls it, if non-nil,
	// whenever a dispatcher stops.
	closed func(tcpip.Error)

	// inboundDispatchers holds the dispatchers created by New, one per FD.
	// Attach starts one dispatchLoop goroutine for each of them.
	inboundDispatchers []linkDispatcher

	// mu protects dispatcher.
	mu sync.RWMutex
	// dispatcher is the NetworkDispatcher set by Attach; it is nil while the
	// endpoint is not attached.
	// +checklocks:mu
	dispatcher stack.NetworkDispatcher

	// packetDispatchMode controls the packet dispatcher used by this
	// endpoint.
	packetDispatchMode PacketDispatchMode

	// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled.
	gsoMaxSize uint32

	// wg keeps track of running goroutines. Attach adds the dispatchLoop
	// goroutines to it and Wait blocks on it.
	wg sync.WaitGroup

	// gsoKind is the supported kind of GSO.
	gsoKind stack.SupportedGSO

	// maxSyscallHeaderBytes has the same meaning as
	// Options.MaxSyscallHeaderBytes.
	maxSyscallHeaderBytes uintptr

	// writevMaxIovs is the maximum number of iovecs that may be passed to
	// rawfile.NonBlockingWriteIovec, as possibly limited by
	// maxSyscallHeaderBytes. (No analogous limit is defined for
	// rawfile.NonBlockingSendMMsg, since in that case the maximum number of
	// iovecs also depends on the number of mmsghdrs. Instead, if sendBatch
	// encounters a packet whose iovec count is limited by
	// maxSyscallHeaderBytes, it falls back to writing the packet using writev
	// via WritePacket.)
	writevMaxIovs int
}
   172  
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
	// FDs is a set of FDs used to read/write packets. New requires at least
	// one FD and makes each of them non-blocking.
	FDs []int

	// MTU is the mtu to use for this endpoint.
	MTU uint32

	// EthernetHeader if true, indicates that the endpoint should read/write
	// ethernet frames instead of IP packets.
	EthernetHeader bool

	// ClosedFunc is a function to be called when an endpoint's peer (if
	// any) closes its end of the communication pipe.
	ClosedFunc func(tcpip.Error)

	// Address is the link address for this endpoint. Only used if
	// EthernetHeader is true.
	Address tcpip.LinkAddress

	// SaveRestore if true, indicates that this NIC capability set should
	// include CapabilitySaveRestore.
	SaveRestore bool

	// DisconnectOk if true, indicates that this NIC capability set should
	// include CapabilityDisconnectOk.
	DisconnectOk bool

	// GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled. New only applies it when at least one FD is a socket.
	GSOMaxSize uint32

	// GVisorGSOEnabled indicates whether Gvisor GSO is enabled or not. When
	// GSO is enabled, it selects GVisorGSOSupported over HostGSOSupported.
	GVisorGSOEnabled bool

	// PacketDispatchMode specifies the type of inbound dispatcher to be
	// used for this endpoint.
	PacketDispatchMode PacketDispatchMode

	// TXChecksumOffload if true, indicates that this endpoints capability
	// set should include CapabilityTXChecksumOffload.
	TXChecksumOffload bool

	// RXChecksumOffload if true, indicates that this endpoints capability
	// set should include CapabilityRXChecksumOffload.
	RXChecksumOffload bool

	// If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes
	// of struct iovec, msghdr, and mmsghdr that may be passed by each host
	// system call. New rejects negative values.
	MaxSyscallHeaderBytes int

	// InterfaceIndex is the interface index of the underlying device.
	InterfaceIndex int

	// GRO enables generic receive offload.
	GRO bool

	// ProcessorsPerChannel is the number of goroutines used to handle packets
	// from each FD. If it is zero, New uses
	// max(1, runtime.GOMAXPROCS(0)/len(FDs)).
	ProcessorsPerChannel int
}
   235  
// fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT
// support in the host kernel. This allows us to use multiple FD's to receive
// from the same underlying NIC. The fanoutID needs to be the same for a given
// set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT
// option for an FD with a fanoutID already in use by another FD for a different
// NIC will return an EINVAL.
//
// Since fanoutID must be unique within the network namespace, we start with
// the PID to avoid collisions. The only way to be sure of avoiding collisions
// is to run in a new network namespace.
//
// New increments fanoutID once per endpoint and hands the resulting value to
// createInboundDispatcher, which passes only its low 16 bits to the
// PACKET_FANOUT socket option.
var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid()))
   247  
   248  // New creates a new fd-based endpoint.
   249  //
   250  // Makes fd non-blocking, but does not take ownership of fd, which must remain
   251  // open for the lifetime of the returned endpoint (until after the endpoint has
   252  // stopped being using and Wait returns).
   253  func New(opts *Options) (stack.LinkEndpoint, error) {
   254  	caps := stack.LinkEndpointCapabilities(0)
   255  	if opts.RXChecksumOffload {
   256  		caps |= stack.CapabilityRXChecksumOffload
   257  	}
   258  
   259  	if opts.TXChecksumOffload {
   260  		caps |= stack.CapabilityTXChecksumOffload
   261  	}
   262  
   263  	hdrSize := 0
   264  	if opts.EthernetHeader {
   265  		hdrSize = header.EthernetMinimumSize
   266  		caps |= stack.CapabilityResolutionRequired
   267  	}
   268  
   269  	if opts.SaveRestore {
   270  		caps |= stack.CapabilitySaveRestore
   271  	}
   272  
   273  	if opts.DisconnectOk {
   274  		caps |= stack.CapabilityDisconnectOk
   275  	}
   276  
   277  	if len(opts.FDs) == 0 {
   278  		return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified")
   279  	}
   280  
   281  	if opts.MaxSyscallHeaderBytes < 0 {
   282  		return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative")
   283  	}
   284  
   285  	e := &endpoint{
   286  		mtu:                   opts.MTU,
   287  		caps:                  caps,
   288  		closed:                opts.ClosedFunc,
   289  		addr:                  opts.Address,
   290  		hdrSize:               hdrSize,
   291  		packetDispatchMode:    opts.PacketDispatchMode,
   292  		maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes),
   293  		writevMaxIovs:         rawfile.MaxIovs,
   294  	}
   295  	if e.maxSyscallHeaderBytes != 0 {
   296  		if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs {
   297  			e.writevMaxIovs = max
   298  		}
   299  	}
   300  
   301  	// Increment fanoutID to ensure that we don't re-use the same fanoutID
   302  	// for the next endpoint.
   303  	fid := fanoutID.Add(1)
   304  
   305  	// Create per channel dispatchers.
   306  	for _, fd := range opts.FDs {
   307  		if err := unix.SetNonblock(fd, true); err != nil {
   308  			return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err)
   309  		}
   310  
   311  		isSocket, err := isSocketFD(fd)
   312  		if err != nil {
   313  			return nil, err
   314  		}
   315  		e.fds = append(e.fds, fdInfo{fd: fd, isSocket: isSocket})
   316  		if isSocket {
   317  			if opts.GSOMaxSize != 0 {
   318  				if opts.GVisorGSOEnabled {
   319  					e.gsoKind = stack.GVisorGSOSupported
   320  				} else {
   321  					e.gsoKind = stack.HostGSOSupported
   322  				}
   323  				e.gsoMaxSize = opts.GSOMaxSize
   324  			}
   325  		}
   326  		if opts.ProcessorsPerChannel == 0 {
   327  			opts.ProcessorsPerChannel = max(1, runtime.GOMAXPROCS(0)/len(opts.FDs))
   328  		}
   329  
   330  		inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid, opts)
   331  		if err != nil {
   332  			return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err)
   333  		}
   334  		e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
   335  	}
   336  
   337  	return e, nil
   338  }
   339  
   340  func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32, opts *Options) (linkDispatcher, error) {
   341  	// By default use the readv() dispatcher as it works with all kinds of
   342  	// FDs (tap/tun/unix domain sockets and af_packet).
   343  	inboundDispatcher, err := newReadVDispatcher(fd, e, opts)
   344  	if err != nil {
   345  		return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
   346  	}
   347  
   348  	if isSocket {
   349  		sa, err := unix.Getsockname(fd)
   350  		if err != nil {
   351  			return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
   352  		}
   353  		switch sa.(type) {
   354  		case *unix.SockaddrLinklayer:
   355  			// Enable PACKET_FANOUT mode if the underlying socket is of type
   356  			// AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will
   357  			// prevent gvisor from receiving fragmented packets and the host does the
   358  			// reassembly on our behalf before delivering the fragments. This makes it
   359  			// hard to test fragmentation reassembly code in Netstack.
   360  			//
   361  			// See: include/uapi/linux/if_packet.h (struct fanout_args).
   362  			//
   363  			// NOTE: We are using SetSockOptInt here even though the underlying
   364  			// option is actually a struct. The code follows the example in the
   365  			// kernel documentation as described at the link below:
   366  			//
   367  			// See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
   368  			//
   369  			// This works out because the actual implementation for the option zero
   370  			// initializes the structure and will initialize the max_members field
   371  			// to a proper value if zero.
   372  			//
   373  			// See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881
   374  			const fanoutType = unix.PACKET_FANOUT_HASH
   375  			fanoutArg := (int(fID) & 0xffff) | fanoutType<<16
   376  			if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
   377  				return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
   378  			}
   379  		}
   380  
   381  		switch e.packetDispatchMode {
   382  		case PacketMMap:
   383  			inboundDispatcher, err = newPacketMMapDispatcher(fd, e, opts)
   384  			if err != nil {
   385  				return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
   386  			}
   387  		case RecvMMsg:
   388  			// If the provided FD is a socket then we optimize
   389  			// packet reads by using recvmmsg() instead of read() to
   390  			// read packets in a batch.
   391  			inboundDispatcher, err = newRecvMMsgDispatcher(fd, e, opts)
   392  			if err != nil {
   393  				return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
   394  			}
   395  		case Readv:
   396  		default:
   397  			return nil, fmt.Errorf("unknown dispatch mode %d", e.packetDispatchMode)
   398  		}
   399  	}
   400  	return inboundDispatcher, nil
   401  }
   402  
   403  func isSocketFD(fd int) (bool, error) {
   404  	var stat unix.Stat_t
   405  	if err := unix.Fstat(fd, &stat); err != nil {
   406  		return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err)
   407  	}
   408  	return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil
   409  }
   410  
   411  // Attach launches the goroutine that reads packets from the file descriptor and
   412  // dispatches them via the provided dispatcher. If one is already attached,
   413  // then nothing happens.
   414  //
   415  // Attach implements stack.LinkEndpoint.Attach.
   416  func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
   417  	e.mu.Lock()
   418  	defer e.mu.Unlock()
   419  
   420  	// nil means the NIC is being removed.
   421  	if dispatcher == nil && e.dispatcher != nil {
   422  		for _, dispatcher := range e.inboundDispatchers {
   423  			dispatcher.Stop()
   424  		}
   425  		e.Wait()
   426  		e.dispatcher = nil
   427  		return
   428  	}
   429  	if dispatcher != nil && e.dispatcher == nil {
   430  		e.dispatcher = dispatcher
   431  		// Link endpoints are not savable. When transportation endpoints are
   432  		// saved, they stop sending outgoing packets and all incoming packets
   433  		// are rejected.
   434  		for i := range e.inboundDispatchers {
   435  			e.wg.Add(1)
   436  			go func(i int) { // S/R-SAFE: See above.
   437  				e.dispatchLoop(e.inboundDispatchers[i])
   438  				e.wg.Done()
   439  			}(i)
   440  		}
   441  	}
   442  }
   443  
   444  // IsAttached implements stack.LinkEndpoint.IsAttached.
   445  func (e *endpoint) IsAttached() bool {
   446  	e.mu.RLock()
   447  	defer e.mu.RUnlock()
   448  	return e.dispatcher != nil
   449  }
   450  
// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
// during construction: Options.MTU for New, or the mtu argument for
// NewInjectable.
func (e *endpoint) MTU() uint32 {
	return e.mtu
}
   456  
// Capabilities implements stack.LinkEndpoint.Capabilities. It returns the
// capability set fixed at construction: assembled from the Options flags by
// New, or passed in directly to NewInjectable.
func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
	return e.caps
}
   461  
// MaxHeaderLength returns the maximum size of the link-layer header. This is
// the endpoint's hdrSize, which New sets to header.EthernetMinimumSize when
// Options.EthernetHeader is true and leaves at zero otherwise.
func (e *endpoint) MaxHeaderLength() uint16 {
	return uint16(e.hdrSize)
}
   466  
// LinkAddress returns the link address of this endpoint, as supplied in
// Options.Address when the endpoint was created by New.
func (e *endpoint) LinkAddress() tcpip.LinkAddress {
	return e.addr
}
   471  
// Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop
// reading from its FD, i.e. it blocks until every dispatchLoop goroutine
// started by Attach has returned.
func (e *endpoint) Wait() {
	e.wg.Wait()
}
   477  
// virtioNetHdr is declared in linux/virtio_net.h.
//
// writePacket and sendBatch fill one in for every outbound packet when the
// endpoint uses host GSO, and prepend its marshalled form to the packet data.
// The fields are serialized by marshal in declaration order.
type virtioNetHdr struct {
	flags      uint8
	gsoType    uint8
	hdrLen     uint16
	gsoSize    uint16
	csumStart  uint16
	csumOffset uint16
}
   487  
   488  // marshal serializes h to a newly-allocated byte slice, in little-endian byte
   489  // order.
   490  //
   491  // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used
   492  // for general serialization. This makes it difficult to use go-marshal for
   493  // virtio types, as go-marshal implicitly uses the native byte ordering.
   494  func (h *virtioNetHdr) marshal() []byte {
   495  	buf := [virtioNetHdrSize]byte{
   496  		0: byte(h.flags),
   497  		1: byte(h.gsoType),
   498  
   499  		// Manually lay out the fields in little-endian byte order. Little endian =>
   500  		// least significant bit goes to the lower address.
   501  
   502  		2: byte(h.hdrLen),
   503  		3: byte(h.hdrLen >> 8),
   504  
   505  		4: byte(h.gsoSize),
   506  		5: byte(h.gsoSize >> 8),
   507  
   508  		6: byte(h.csumStart),
   509  		7: byte(h.csumStart >> 8),
   510  
   511  		8: byte(h.csumOffset),
   512  		9: byte(h.csumOffset >> 8),
   513  	}
   514  	return buf[:]
   515  }
   516  
// These constants are declared in linux/virtio_net.h. The names deliberately
// mirror the kernel header.
const (
	// _VIRTIO_NET_HDR_F_NEEDS_CSUM is stored in virtioNetHdr.flags when the
	// packet's GSO options have NeedsCsum set.
	_VIRTIO_NET_HDR_F_NEEDS_CSUM = 1

	// _VIRTIO_NET_HDR_GSO_TCPV4 and _VIRTIO_NET_HDR_GSO_TCPV6 are the
	// virtioNetHdr.gsoType values used for stack.GSOTCPv4 and stack.GSOTCPv6
	// packets whose payload is larger than the MSS.
	_VIRTIO_NET_HDR_GSO_TCPV4 = 1
	_VIRTIO_NET_HDR_GSO_TCPV6 = 4
)
   524  
   525  // AddHeader implements stack.LinkEndpoint.AddHeader.
   526  func (e *endpoint) AddHeader(pkt *stack.PacketBuffer) {
   527  	if e.hdrSize > 0 {
   528  		// Add ethernet header if needed.
   529  		eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize))
   530  		eth.Encode(&header.EthernetFields{
   531  			SrcAddr: pkt.EgressRoute.LocalLinkAddress,
   532  			DstAddr: pkt.EgressRoute.RemoteLinkAddress,
   533  			Type:    pkt.NetworkProtocolNumber,
   534  		})
   535  	}
   536  }
   537  
   538  func (e *endpoint) parseHeader(pkt *stack.PacketBuffer) bool {
   539  	_, ok := pkt.LinkHeader().Consume(e.hdrSize)
   540  	return ok
   541  
   542  }
   543  
   544  // ParseHeader implements stack.LinkEndpoint.ParseHeader.
   545  func (e *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool {
   546  	if e.hdrSize > 0 {
   547  		return e.parseHeader(pkt)
   548  	}
   549  	return true
   550  }
   551  
   552  // writePacket writes outbound packets to the file descriptor. If it is not
   553  // currently writable, the packet is dropped.
   554  func (e *endpoint) writePacket(pkt *stack.PacketBuffer) tcpip.Error {
   555  	fdInfo := e.fds[pkt.Hash%uint32(len(e.fds))]
   556  	fd := fdInfo.fd
   557  	var vnetHdrBuf []byte
   558  	if e.gsoKind == stack.HostGSOSupported {
   559  		vnetHdr := virtioNetHdr{}
   560  		if pkt.GSOOptions.Type != stack.GSONone {
   561  			vnetHdr.hdrLen = uint16(pkt.HeaderSize())
   562  			if pkt.GSOOptions.NeedsCsum {
   563  				vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
   564  				vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
   565  				vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
   566  			}
   567  			if uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS {
   568  				switch pkt.GSOOptions.Type {
   569  				case stack.GSOTCPv4:
   570  					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
   571  				case stack.GSOTCPv6:
   572  					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
   573  				default:
   574  					panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
   575  				}
   576  				vnetHdr.gsoSize = pkt.GSOOptions.MSS
   577  			}
   578  		}
   579  		vnetHdrBuf = vnetHdr.marshal()
   580  	}
   581  
   582  	views := pkt.AsSlices()
   583  	numIovecs := len(views)
   584  	if len(vnetHdrBuf) != 0 {
   585  		numIovecs++
   586  	}
   587  	if numIovecs > e.writevMaxIovs {
   588  		numIovecs = e.writevMaxIovs
   589  	}
   590  
   591  	// Allocate small iovec arrays on the stack.
   592  	var iovecsArr [8]unix.Iovec
   593  	iovecs := iovecsArr[:0]
   594  	if numIovecs > len(iovecsArr) {
   595  		iovecs = make([]unix.Iovec, 0, numIovecs)
   596  	}
   597  	iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs)
   598  	for _, v := range views {
   599  		iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs)
   600  	}
   601  	return rawfile.NonBlockingWriteIovec(fd, iovecs)
   602  }
   603  
   604  func (e *endpoint) sendBatch(batchFDInfo fdInfo, pkts []*stack.PacketBuffer) (int, tcpip.Error) {
   605  	// Degrade to writePacket if underlying fd is not a socket.
   606  	if !batchFDInfo.isSocket {
   607  		var written int
   608  		var err tcpip.Error
   609  		for written < len(pkts) {
   610  			if err = e.writePacket(pkts[written]); err != nil {
   611  				break
   612  			}
   613  			written++
   614  		}
   615  		return written, err
   616  	}
   617  
   618  	// Send a batch of packets through batchFD.
   619  	batchFD := batchFDInfo.fd
   620  	mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts))
   621  	packets := 0
   622  	for packets < len(pkts) {
   623  		mmsgHdrs := mmsgHdrsStorage
   624  		batch := pkts[packets:]
   625  		syscallHeaderBytes := uintptr(0)
   626  		for _, pkt := range batch {
   627  			var vnetHdrBuf []byte
   628  			if e.gsoKind == stack.HostGSOSupported {
   629  				vnetHdr := virtioNetHdr{}
   630  				if pkt.GSOOptions.Type != stack.GSONone {
   631  					vnetHdr.hdrLen = uint16(pkt.HeaderSize())
   632  					if pkt.GSOOptions.NeedsCsum {
   633  						vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
   634  						vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
   635  						vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
   636  					}
   637  					if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS {
   638  						switch pkt.GSOOptions.Type {
   639  						case stack.GSOTCPv4:
   640  							vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
   641  						case stack.GSOTCPv6:
   642  							vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
   643  						default:
   644  							panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
   645  						}
   646  						vnetHdr.gsoSize = pkt.GSOOptions.MSS
   647  					}
   648  				}
   649  				vnetHdrBuf = vnetHdr.marshal()
   650  			}
   651  
   652  			views, offset := pkt.AsViewList()
   653  			var skipped int
   654  			var view *buffer.View
   655  			for view = views.Front(); view != nil && offset >= view.Size(); view = view.Next() {
   656  				offset -= view.Size()
   657  				skipped++
   658  			}
   659  
   660  			// We've made it to the usable views.
   661  			numIovecs := views.Len() - skipped
   662  			if len(vnetHdrBuf) != 0 {
   663  				numIovecs++
   664  			}
   665  			if numIovecs > rawfile.MaxIovs {
   666  				numIovecs = rawfile.MaxIovs
   667  			}
   668  			if e.maxSyscallHeaderBytes != 0 {
   669  				syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec
   670  				if syscallHeaderBytes > e.maxSyscallHeaderBytes {
   671  					// We can't fit this packet into this call to sendmmsg().
   672  					// We could potentially do so if we reduced numIovecs
   673  					// further, but this might incur considerable extra
   674  					// copying. Leave it to the next batch instead.
   675  					break
   676  				}
   677  			}
   678  
   679  			// We can't easily allocate iovec arrays on the stack here since
   680  			// they will escape this loop iteration via mmsgHdrs.
   681  			iovecs := make([]unix.Iovec, 0, numIovecs)
   682  			iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs)
   683  			// At most one slice has a non-zero offset.
   684  			iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice()[offset:], numIovecs)
   685  			for view = view.Next(); view != nil; view = view.Next() {
   686  				iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice(), numIovecs)
   687  			}
   688  
   689  			var mmsgHdr rawfile.MMsgHdr
   690  			mmsgHdr.Msg.Iov = &iovecs[0]
   691  			mmsgHdr.Msg.SetIovlen(len(iovecs))
   692  			mmsgHdrs = append(mmsgHdrs, mmsgHdr)
   693  		}
   694  
   695  		if len(mmsgHdrs) == 0 {
   696  			// We can't fit batch[0] into a mmsghdr while staying under
   697  			// e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the
   698  			// mmsghdr (by using writev) and re-buffer iovecs more aggressively
   699  			// if necessary (by using e.writevMaxIovs instead of
   700  			// rawfile.MaxIovs).
   701  			pkt := batch[0]
   702  			if err := e.writePacket(pkt); err != nil {
   703  				return packets, err
   704  			}
   705  			packets++
   706  		} else {
   707  			for len(mmsgHdrs) > 0 {
   708  				sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs)
   709  				if err != nil {
   710  					return packets, err
   711  				}
   712  				packets += sent
   713  				mmsgHdrs = mmsgHdrs[sent:]
   714  			}
   715  		}
   716  	}
   717  
   718  	return packets, nil
   719  }
   720  
   721  // WritePackets writes outbound packets to the underlying file descriptors. If
   722  // one is not currently writable, the packet is dropped.
   723  //
   724  // Being a batch API, each packet in pkts should have the following
   725  // fields populated:
   726  //   - pkt.EgressRoute
   727  //   - pkt.GSOOptions
   728  //   - pkt.NetworkProtocolNumber
   729  func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) {
   730  	// Preallocate to avoid repeated reallocation as we append to batch.
   731  	batch := make([]*stack.PacketBuffer, 0, BatchSize)
   732  	batchFDInfo := fdInfo{fd: -1, isSocket: false}
   733  	sentPackets := 0
   734  	for _, pkt := range pkts.AsSlice() {
   735  		if len(batch) == 0 {
   736  			batchFDInfo = e.fds[pkt.Hash%uint32(len(e.fds))]
   737  		}
   738  		pktFDInfo := e.fds[pkt.Hash%uint32(len(e.fds))]
   739  		if sendNow := pktFDInfo != batchFDInfo; !sendNow {
   740  			batch = append(batch, pkt)
   741  			continue
   742  		}
   743  		n, err := e.sendBatch(batchFDInfo, batch)
   744  		sentPackets += n
   745  		if err != nil {
   746  			return sentPackets, err
   747  		}
   748  		batch = batch[:0]
   749  		batch = append(batch, pkt)
   750  		batchFDInfo = pktFDInfo
   751  	}
   752  
   753  	if len(batch) != 0 {
   754  		n, err := e.sendBatch(batchFDInfo, batch)
   755  		sentPackets += n
   756  		if err != nil {
   757  			return sentPackets, err
   758  		}
   759  	}
   760  	return sentPackets, nil
   761  }
   762  
   763  // InjectOutbound implements stack.InjectableEndpoint.InjectOutbound.
   764  func (e *endpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error {
   765  	return rawfile.NonBlockingWrite(e.fds[0].fd, packet.AsSlice())
   766  }
   767  
   768  // dispatchLoop reads packets from the file descriptor in a loop and dispatches
   769  // them to the network stack.
   770  func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error {
   771  	for {
   772  		cont, err := inboundDispatcher.dispatch()
   773  		if err != nil || !cont {
   774  			if e.closed != nil {
   775  				e.closed(err)
   776  			}
   777  			inboundDispatcher.release()
   778  			return err
   779  		}
   780  	}
   781  }
   782  
// GSOMaxSize implements stack.GSOEndpoint. It returns the maximum GSO packet
// size, which is zero unless New was given a non-zero Options.GSOMaxSize and
// at least one socket FD.
func (e *endpoint) GSOMaxSize() uint32 {
	return e.gsoMaxSize
}
   787  
// SupportedGSO implements stack.GSOEndpoint. It returns the kind of GSO
// selected by New: stack.GVisorGSOSupported or stack.HostGSOSupported when GSO
// was enabled, and the zero value of stack.SupportedGSO otherwise.
func (e *endpoint) SupportedGSO() stack.SupportedGSO {
	return e.gsoKind
}
   792  
   793  // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
   794  func (e *endpoint) ARPHardwareType() header.ARPHardwareType {
   795  	if e.hdrSize > 0 {
   796  		return header.ARPHardwareEther
   797  	}
   798  	return header.ARPHardwareNone
   799  }
   800  
// InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes
// to the FD, but does not read from it. All reads come from injected packets.
//
// The mu and dispatcher fields declared here shadow the fields of the same
// name in the embedded endpoint; InjectableEndpoint.Attach and InjectInbound
// use only these outer fields.
//
// NOTE(review): IsAttached is promoted from the embedded endpoint and reads
// the embedded dispatcher field, which InjectableEndpoint.Attach never sets —
// confirm whether callers rely on IsAttached for this type.
type InjectableEndpoint struct {
	endpoint

	// mu protects dispatcher.
	mu sync.RWMutex
	// dispatcher receives the packets passed to InjectInbound; it is nil
	// until Attach is called with a non-nil dispatcher.
	// +checklocks:mu
	dispatcher stack.NetworkDispatcher
}
   810  
   811  // Attach saves the stack network-layer dispatcher for use later when packets
   812  // are injected.
   813  func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
   814  	e.mu.Lock()
   815  	defer e.mu.Unlock()
   816  	e.dispatcher = dispatcher
   817  }
   818  
   819  // InjectInbound injects an inbound packet. If the endpoint is not attached, the
   820  // packet is not delivered.
   821  func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
   822  	e.mu.RLock()
   823  	d := e.dispatcher
   824  	e.mu.RUnlock()
   825  	if d != nil {
   826  		d.DeliverNetworkPacket(protocol, pkt)
   827  	}
   828  }
   829  
   830  // NewInjectable creates a new fd-based InjectableEndpoint.
   831  func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (*InjectableEndpoint, error) {
   832  	unix.SetNonblock(fd, true)
   833  	isSocket, err := isSocketFD(fd)
   834  	if err != nil {
   835  		return nil, err
   836  	}
   837  
   838  	return &InjectableEndpoint{endpoint: endpoint{
   839  		fds:           []fdInfo{{fd: fd, isSocket: isSocket}},
   840  		mtu:           mtu,
   841  		caps:          capabilities,
   842  		writevMaxIovs: rawfile.MaxIovs,
   843  	}}, nil
   844  }