github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/link/fdbased/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // +build linux
    16  
    17  // Package fdbased provides the implementation of data-link layer endpoints
    18  // backed by boundary-preserving file descriptors (e.g., TUN devices,
    19  // seqpacket/datagram sockets).
    20  //
    21  // FD based endpoints can be used in the networking stack by calling New() to
    22  // create a new endpoint, and then passing it as an argument to
    23  // Stack.CreateNIC().
    24  //
    25  // FD based endpoints can use more than one file descriptor to read incoming
    26  // packets. If more than one FD is specified and the underlying FD is an
    27  // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the
    28  // host kernel will consistently hash the packets to the sockets. This ensures
    29  // that packets for the same TCP streams are not reordered.
    30  //
    31  // Similarly if more than one FD's are specified where the underlying FD is not
    32  // AF_PACKET then it's the caller's responsibility to ensure that all inbound
    33  // packets on the descriptors are consistently 5 tuple hashed to one of the
    34  // descriptors to prevent TCP reordering.
    35  //
    36  // Outbound packets are written to the FD selected by the packet's hash
    37  // (pkt.Hash modulo the number of FDs), so packets carrying the same hash
    38  // always leave through the same FD. InjectOutbound is the exception: it
    39  // always writes to the first FD.
    40  package fdbased
    41  
    42  import (
    43  	"fmt"
    44  	"runtime"
    45  	"sync/atomic"
    46  
    47  	"golang.org/x/sys/unix"
    48  	"github.com/SagerNet/gvisor/pkg/sync"
    49  	"github.com/SagerNet/gvisor/pkg/tcpip"
    50  	"github.com/SagerNet/gvisor/pkg/tcpip/buffer"
    51  	"github.com/SagerNet/gvisor/pkg/tcpip/header"
    52  	"github.com/SagerNet/gvisor/pkg/tcpip/link/rawfile"
    53  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    54  )
    55  
    56  // linkDispatcher reads packets from the link FD and dispatches them to the
    57  // NetworkDispatcher.
    58  type linkDispatcher interface {
    59  	dispatch() (bool, tcpip.Error)
    60  }
    61  
    62  // PacketDispatchMode are the various supported methods of receiving and
    63  // dispatching packets from the underlying FD.
    64  type PacketDispatchMode int
    65  
    66  const (
    67  	// Readv is the default dispatch mode and is the least performant of the
    68  	// dispatch options but the one that is supported by all underlying FD
    69  	// types.
    70  	Readv PacketDispatchMode = iota
    71  	// RecvMMsg enables use of recvmmsg() syscall instead of readv() to
    72  	// read inbound packets. This reduces # of syscalls needed to process
    73  	// packets.
    74  	//
    75  	// NOTE: recvmmsg() is only supported for sockets, so if the underlying
    76  	// FD is not a socket then the code will still fall back to the readv()
    77  	// path.
    78  	RecvMMsg
    79  	// PacketMMap enables use of PACKET_RX_RING to receive packets from the
    80  	// NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
    81  	// primary use-case for this is runsc which uses an AF_PACKET FD to
    82  	// receive packets from the veth device.
    83  	PacketMMap
    84  )
    85  
    86  func (p PacketDispatchMode) String() string {
    87  	switch p {
    88  	case Readv:
    89  		return "Readv"
    90  	case RecvMMsg:
    91  		return "RecvMMsg"
    92  	case PacketMMap:
    93  		return "PacketMMap"
    94  	default:
    95  		return fmt.Sprintf("unknown packet dispatch mode '%d'", p)
    96  	}
    97  }
    98  
    99  var _ stack.LinkEndpoint = (*endpoint)(nil)
   100  var _ stack.GSOEndpoint = (*endpoint)(nil)
   101  
   102  type endpoint struct {
   103  	// fds is the set of file descriptors each identifying one inbound/outbound
   104  	// channel. The endpoint will dispatch from all inbound channels as well as
   105  	// hash outbound packets to specific channels based on the packet hash.
   106  	fds []int
   107  
   108  	// mtu (maximum transmission unit) is the maximum size of a packet.
   109  	mtu uint32
   110  
   111  	// hdrSize specifies the link-layer header size. If set to 0, no header
   112  	// is added/removed; otherwise an ethernet header is used.
   113  	hdrSize int
   114  
   115  	// addr is the address of the endpoint.
   116  	addr tcpip.LinkAddress
   117  
   118  	// caps holds the endpoint capabilities.
   119  	caps stack.LinkEndpointCapabilities
   120  
   121  	// closed is a function to be called when the FD's peer (if any) closes
   122  	// its end of the communication pipe.
   123  	closed func(tcpip.Error)
   124  
   125  	inboundDispatchers []linkDispatcher
   126  	dispatcher         stack.NetworkDispatcher
   127  
   128  	// packetDispatchMode controls the packet dispatcher used by this
   129  	// endpoint.
   130  	packetDispatchMode PacketDispatchMode
   131  
   132  	// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
   133  	// disabled.
   134  	gsoMaxSize uint32
   135  
   136  	// wg keeps track of running goroutines.
   137  	wg sync.WaitGroup
   138  
   139  	// gsoKind is the supported kind of GSO.
   140  	gsoKind stack.SupportedGSO
   141  
   142  	// maxSyscallHeaderBytes has the same meaning as
   143  	// Options.MaxSyscallHeaderBytes.
   144  	maxSyscallHeaderBytes uintptr
   145  
   146  	// writevMaxIovs is the maximum number of iovecs that may be passed to
   147  	// rawfile.NonBlockingWriteIovec, as possibly limited by
   148  	// maxSyscallHeaderBytes. (No analogous limit is defined for
   149  	// rawfile.NonBlockingSendMMsg, since in that case the maximum number of
   150  	// iovecs also depends on the number of mmsghdrs. Instead, if sendBatch
   151  	// encounters a packet whose iovec count is limited by
   152  	// maxSyscallHeaderBytes, it falls back to writing the packet using writev
   153  	// via WritePacket.)
   154  	writevMaxIovs int
   155  }
   156  
   157  // Options specify the details about the fd-based endpoint to be created.
   158  type Options struct {
   159  	// FDs is a set of FDs used to read/write packets.
   160  	FDs []int
   161  
   162  	// MTU is the mtu to use for this endpoint.
   163  	MTU uint32
   164  
   165  	// EthernetHeader if true, indicates that the endpoint should read/write
   166  	// ethernet frames instead of IP packets.
   167  	EthernetHeader bool
   168  
   169  	// ClosedFunc is a function to be called when an endpoint's peer (if
   170  	// any) closes its end of the communication pipe.
   171  	ClosedFunc func(tcpip.Error)
   172  
   173  	// Address is the link address for this endpoint. Only used if
   174  	// EthernetHeader is true.
   175  	Address tcpip.LinkAddress
   176  
   177  	// SaveRestore if true, indicates that this NIC capability set should
   178  	// include CapabilitySaveRestore
   179  	SaveRestore bool
   180  
   181  	// DisconnectOk if true, indicates that this NIC capability set should
   182  	// include CapabilityDisconnectOk.
   183  	DisconnectOk bool
   184  
   185  	// GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
   186  	// disabled.
   187  	GSOMaxSize uint32
   188  
   189  	// SoftwareGSOEnabled indicates whether software GSO is enabled or not.
   190  	SoftwareGSOEnabled bool
   191  
   192  	// PacketDispatchMode specifies the type of inbound dispatcher to be
   193  	// used for this endpoint.
   194  	PacketDispatchMode PacketDispatchMode
   195  
   196  	// TXChecksumOffload if true, indicates that this endpoints capability
   197  	// set should include CapabilityTXChecksumOffload.
   198  	TXChecksumOffload bool
   199  
   200  	// RXChecksumOffload if true, indicates that this endpoints capability
   201  	// set should include CapabilityRXChecksumOffload.
   202  	RXChecksumOffload bool
   203  
   204  	// If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes
   205  	// of struct iovec, msghdr, and mmsghdr that may be passed by each host
   206  	// system call.
   207  	MaxSyscallHeaderBytes int
   208  }
   209  
   210  // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT
   211  // support in the host kernel. This allows us to use multiple FD's to receive
   212  // from the same underlying NIC. The fanoutID needs to be the same for a given
   213  // set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT
   214  // option for an FD with a fanoutID already in use by another FD for a different
   215  // NIC will return an EINVAL.
   216  //
   217  // Since fanoutID must be unique within the network namespace, we start with
   218  // the PID to avoid collisions. The only way to be sure of avoiding collisions
   219  // is to run in a new network namespace.
   220  //
   221  // Must be accessed using atomic operations.
   222  var fanoutID int32 = int32(unix.Getpid())
   223  
   224  // New creates a new fd-based endpoint.
   225  //
   226  // Makes fd non-blocking, but does not take ownership of fd, which must remain
   227  // open for the lifetime of the returned endpoint (until after the endpoint has
   228  // stopped being using and Wait returns).
   229  func New(opts *Options) (stack.LinkEndpoint, error) {
   230  	caps := stack.LinkEndpointCapabilities(0)
   231  	if opts.RXChecksumOffload {
   232  		caps |= stack.CapabilityRXChecksumOffload
   233  	}
   234  
   235  	if opts.TXChecksumOffload {
   236  		caps |= stack.CapabilityTXChecksumOffload
   237  	}
   238  
   239  	hdrSize := 0
   240  	if opts.EthernetHeader {
   241  		hdrSize = header.EthernetMinimumSize
   242  		caps |= stack.CapabilityResolutionRequired
   243  	}
   244  
   245  	if opts.SaveRestore {
   246  		caps |= stack.CapabilitySaveRestore
   247  	}
   248  
   249  	if opts.DisconnectOk {
   250  		caps |= stack.CapabilityDisconnectOk
   251  	}
   252  
   253  	if len(opts.FDs) == 0 {
   254  		return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified")
   255  	}
   256  
   257  	if opts.MaxSyscallHeaderBytes < 0 {
   258  		return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative")
   259  	}
   260  
   261  	e := &endpoint{
   262  		fds:                   opts.FDs,
   263  		mtu:                   opts.MTU,
   264  		caps:                  caps,
   265  		closed:                opts.ClosedFunc,
   266  		addr:                  opts.Address,
   267  		hdrSize:               hdrSize,
   268  		packetDispatchMode:    opts.PacketDispatchMode,
   269  		maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes),
   270  		writevMaxIovs:         rawfile.MaxIovs,
   271  	}
   272  	if e.maxSyscallHeaderBytes != 0 {
   273  		if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs {
   274  			e.writevMaxIovs = max
   275  		}
   276  	}
   277  
   278  	// Increment fanoutID to ensure that we don't re-use the same fanoutID for
   279  	// the next endpoint.
   280  	fid := atomic.AddInt32(&fanoutID, 1)
   281  
   282  	// Create per channel dispatchers.
   283  	for i := 0; i < len(e.fds); i++ {
   284  		fd := e.fds[i]
   285  
   286  		if runtime.GOOS != "android" {
   287  			if err := unix.SetNonblock(fd, true); err != nil {
   288  				return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err)
   289  			}
   290  		}
   291  
   292  		isSocket, err := isSocketFD(fd)
   293  		if err != nil {
   294  			return nil, err
   295  		}
   296  		if isSocket {
   297  			if opts.GSOMaxSize != 0 {
   298  				if opts.SoftwareGSOEnabled {
   299  					e.gsoKind = stack.SWGSOSupported
   300  				} else {
   301  					e.gsoKind = stack.HWGSOSupported
   302  				}
   303  				e.gsoMaxSize = opts.GSOMaxSize
   304  			}
   305  		}
   306  		inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid)
   307  		if err != nil {
   308  			return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err)
   309  		}
   310  		e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
   311  	}
   312  
   313  	return e, nil
   314  }
   315  
   316  func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32) (linkDispatcher, error) {
   317  	// By default use the readv() dispatcher as it works with all kinds of
   318  	// FDs (tap/tun/unix domain sockets and af_packet).
   319  	inboundDispatcher, err := newReadVDispatcher(fd, e)
   320  	if err != nil {
   321  		return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
   322  	}
   323  
   324  	if isSocket {
   325  		sa, err := unix.Getsockname(fd)
   326  		if err != nil {
   327  			return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
   328  		}
   329  		switch sa.(type) {
   330  		case *unix.SockaddrLinklayer:
   331  			// Enable PACKET_FANOUT mode if the underlying socket is of type
   332  			// AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will
   333  			// prevent gvisor from receiving fragmented packets and the host does the
   334  			// reassembly on our behalf before delivering the fragments. This makes it
   335  			// hard to test fragmentation reassembly code in Netstack.
   336  			//
   337  			// See: include/uapi/linux/if_packet.h (struct fanout_args).
   338  			//
   339  			// NOTE: We are using SetSockOptInt here even though the underlying
   340  			// option is actually a struct. The code follows the example in the
   341  			// kernel documentation as described at the link below:
   342  			//
   343  			// See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
   344  			//
   345  			// This works out because the actual implementation for the option zero
   346  			// initializes the structure and will initialize the max_members field
   347  			// to a proper value if zero.
   348  			//
   349  			// See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881
   350  			const fanoutType = unix.PACKET_FANOUT_HASH
   351  			fanoutArg := (int(fID) & 0xffff) | fanoutType<<16
   352  			if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
   353  				return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
   354  			}
   355  		}
   356  
   357  		switch e.packetDispatchMode {
   358  		case PacketMMap:
   359  			inboundDispatcher, err = newPacketMMapDispatcher(fd, e)
   360  			if err != nil {
   361  				return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
   362  			}
   363  		case RecvMMsg:
   364  			// If the provided FD is a socket then we optimize
   365  			// packet reads by using recvmmsg() instead of read() to
   366  			// read packets in a batch.
   367  			inboundDispatcher, err = newRecvMMsgDispatcher(fd, e)
   368  			if err != nil {
   369  				return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
   370  			}
   371  		}
   372  	}
   373  	return inboundDispatcher, nil
   374  }
   375  
   376  func isSocketFD(fd int) (bool, error) {
   377  	var stat unix.Stat_t
   378  	if err := unix.Fstat(fd, &stat); err != nil {
   379  		return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err)
   380  	}
   381  	return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil
   382  }
   383  
   384  // Attach launches the goroutine that reads packets from the file descriptor and
   385  // dispatches them via the provided dispatcher.
   386  func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
   387  	e.dispatcher = dispatcher
   388  	// Link endpoints are not savable. When transportation endpoints are
   389  	// saved, they stop sending outgoing packets and all incoming packets
   390  	// are rejected.
   391  	for i := range e.inboundDispatchers {
   392  		e.wg.Add(1)
   393  		go func(i int) { // S/R-SAFE: See above.
   394  			e.dispatchLoop(e.inboundDispatchers[i])
   395  			e.wg.Done()
   396  		}(i)
   397  	}
   398  }
   399  
   400  // IsAttached implements stack.LinkEndpoint.IsAttached.
   401  func (e *endpoint) IsAttached() bool {
   402  	return e.dispatcher != nil
   403  }
   404  
   405  // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
   406  // during construction.
   407  func (e *endpoint) MTU() uint32 {
   408  	return e.mtu
   409  }
   410  
   411  // Capabilities implements stack.LinkEndpoint.Capabilities.
   412  func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
   413  	return e.caps
   414  }
   415  
   416  // MaxHeaderLength returns the maximum size of the link-layer header.
   417  func (e *endpoint) MaxHeaderLength() uint16 {
   418  	return uint16(e.hdrSize)
   419  }
   420  
   421  // LinkAddress returns the link address of this endpoint.
   422  func (e *endpoint) LinkAddress() tcpip.LinkAddress {
   423  	return e.addr
   424  }
   425  
   426  // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop
   427  // reading from its FD.
   428  func (e *endpoint) Wait() {
   429  	e.wg.Wait()
   430  }
   431  
   432  // virtioNetHdr is declared in linux/virtio_net.h.
   433  type virtioNetHdr struct {
   434  	flags      uint8
   435  	gsoType    uint8
   436  	hdrLen     uint16
   437  	gsoSize    uint16
   438  	csumStart  uint16
   439  	csumOffset uint16
   440  }
   441  
   442  // marshal serializes h to a newly-allocated byte slice, in little-endian byte
   443  // order.
   444  //
   445  // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used
   446  // for general serialization. This makes it difficult to use go-marshal for
   447  // virtio types, as go-marshal implicitly uses the native byte ordering.
   448  func (h *virtioNetHdr) marshal() []byte {
   449  	buf := [virtioNetHdrSize]byte{
   450  		0: byte(h.flags),
   451  		1: byte(h.gsoType),
   452  
   453  		// Manually lay out the fields in little-endian byte order. Little endian =>
   454  		// least significant bit goes to the lower address.
   455  
   456  		2: byte(h.hdrLen),
   457  		3: byte(h.hdrLen >> 8),
   458  
   459  		4: byte(h.gsoSize),
   460  		5: byte(h.gsoSize >> 8),
   461  
   462  		6: byte(h.csumStart),
   463  		7: byte(h.csumStart >> 8),
   464  
   465  		8: byte(h.csumOffset),
   466  		9: byte(h.csumOffset >> 8),
   467  	}
   468  	return buf[:]
   469  }
   470  
   471  // These constants are declared in linux/virtio_net.h.
   472  const (
   473  	_VIRTIO_NET_HDR_F_NEEDS_CSUM = 1
   474  
   475  	_VIRTIO_NET_HDR_GSO_TCPV4 = 1
   476  	_VIRTIO_NET_HDR_GSO_TCPV6 = 4
   477  )
   478  
   479  // AddHeader implements stack.LinkEndpoint.AddHeader.
   480  func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
   481  	if e.hdrSize > 0 {
   482  		// Add ethernet header if needed.
   483  		eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize))
   484  		ethHdr := &header.EthernetFields{
   485  			DstAddr: remote,
   486  			Type:    protocol,
   487  		}
   488  
   489  		// Preserve the src address if it's set in the route.
   490  		if local != "" {
   491  			ethHdr.SrcAddr = local
   492  		} else {
   493  			ethHdr.SrcAddr = e.addr
   494  		}
   495  		eth.Encode(ethHdr)
   496  	}
   497  }
   498  
   499  // WritePacket writes outbound packets to the file descriptor. If it is not
   500  // currently writable, the packet is dropped.
   501  func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error {
   502  	if e.hdrSize > 0 {
   503  		e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt)
   504  	}
   505  
   506  	fd := e.fds[pkt.Hash%uint32(len(e.fds))]
   507  	var vnetHdrBuf []byte
   508  	if e.gsoKind == stack.HWGSOSupported {
   509  		vnetHdr := virtioNetHdr{}
   510  		if pkt.GSOOptions.Type != stack.GSONone {
   511  			vnetHdr.hdrLen = uint16(pkt.HeaderSize())
   512  			if pkt.GSOOptions.NeedsCsum {
   513  				vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
   514  				vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
   515  				vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
   516  			}
   517  			if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS {
   518  				switch pkt.GSOOptions.Type {
   519  				case stack.GSOTCPv4:
   520  					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
   521  				case stack.GSOTCPv6:
   522  					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
   523  				default:
   524  					panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
   525  				}
   526  				vnetHdr.gsoSize = pkt.GSOOptions.MSS
   527  			}
   528  		}
   529  		vnetHdrBuf = vnetHdr.marshal()
   530  	}
   531  
   532  	views := pkt.Views()
   533  	numIovecs := len(views)
   534  	if len(vnetHdrBuf) != 0 {
   535  		numIovecs++
   536  	}
   537  	if numIovecs > e.writevMaxIovs {
   538  		numIovecs = e.writevMaxIovs
   539  	}
   540  
   541  	// Allocate small iovec arrays on the stack.
   542  	var iovecsArr [8]unix.Iovec
   543  	iovecs := iovecsArr[:0]
   544  	if numIovecs > len(iovecsArr) {
   545  		iovecs = make([]unix.Iovec, 0, numIovecs)
   546  	}
   547  	iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs)
   548  	for _, v := range views {
   549  		iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs)
   550  	}
   551  	return rawfile.NonBlockingWriteIovec(fd, iovecs)
   552  }
   553  
   554  func (e *endpoint) sendBatch(batchFD int, pkts []*stack.PacketBuffer) (int, tcpip.Error) {
   555  	// Send a batch of packets through batchFD.
   556  	mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts))
   557  	packets := 0
   558  	for packets < len(pkts) {
   559  		mmsgHdrs := mmsgHdrsStorage
   560  		batch := pkts[packets:]
   561  		syscallHeaderBytes := uintptr(0)
   562  		for _, pkt := range batch {
   563  			if e.hdrSize > 0 {
   564  				e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress, pkt.NetworkProtocolNumber, pkt)
   565  			}
   566  
   567  			var vnetHdrBuf []byte
   568  			if e.gsoKind == stack.HWGSOSupported {
   569  				vnetHdr := virtioNetHdr{}
   570  				if pkt.GSOOptions.Type != stack.GSONone {
   571  					vnetHdr.hdrLen = uint16(pkt.HeaderSize())
   572  					if pkt.GSOOptions.NeedsCsum {
   573  						vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
   574  						vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
   575  						vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
   576  					}
   577  					if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS {
   578  						switch pkt.GSOOptions.Type {
   579  						case stack.GSOTCPv4:
   580  							vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
   581  						case stack.GSOTCPv6:
   582  							vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
   583  						default:
   584  							panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
   585  						}
   586  						vnetHdr.gsoSize = pkt.GSOOptions.MSS
   587  					}
   588  				}
   589  				vnetHdrBuf = vnetHdr.marshal()
   590  			}
   591  
   592  			views := pkt.Views()
   593  			numIovecs := len(views)
   594  			if len(vnetHdrBuf) != 0 {
   595  				numIovecs++
   596  			}
   597  			if numIovecs > rawfile.MaxIovs {
   598  				numIovecs = rawfile.MaxIovs
   599  			}
   600  			if e.maxSyscallHeaderBytes != 0 {
   601  				syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec
   602  				if syscallHeaderBytes > e.maxSyscallHeaderBytes {
   603  					// We can't fit this packet into this call to sendmmsg().
   604  					// We could potentially do so if we reduced numIovecs
   605  					// further, but this might incur considerable extra
   606  					// copying. Leave it to the next batch instead.
   607  					break
   608  				}
   609  			}
   610  
   611  			// We can't easily allocate iovec arrays on the stack here since
   612  			// they will escape this loop iteration via mmsgHdrs.
   613  			iovecs := make([]unix.Iovec, 0, numIovecs)
   614  			iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs)
   615  			for _, v := range views {
   616  				iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs)
   617  			}
   618  
   619  			var mmsgHdr rawfile.MMsgHdr
   620  			mmsgHdr.Msg.Iov = &iovecs[0]
   621  			mmsgHdr.Msg.SetIovlen(len(iovecs))
   622  			mmsgHdrs = append(mmsgHdrs, mmsgHdr)
   623  		}
   624  
   625  		if len(mmsgHdrs) == 0 {
   626  			// We can't fit batch[0] into a mmsghdr while staying under
   627  			// e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the
   628  			// mmsghdr (by using writev) and re-buffer iovecs more aggressively
   629  			// if necessary (by using e.writevMaxIovs instead of
   630  			// rawfile.MaxIovs).
   631  			pkt := batch[0]
   632  			if err := e.WritePacket(pkt.EgressRoute, pkt.NetworkProtocolNumber, pkt); err != nil {
   633  				return packets, err
   634  			}
   635  			packets++
   636  		} else {
   637  			for len(mmsgHdrs) > 0 {
   638  				sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs)
   639  				if err != nil {
   640  					return packets, err
   641  				}
   642  				packets += sent
   643  				mmsgHdrs = mmsgHdrs[sent:]
   644  			}
   645  		}
   646  	}
   647  
   648  	return packets, nil
   649  }
   650  
   651  // WritePackets writes outbound packets to the underlying file descriptors. If
   652  // one is not currently writable, the packet is dropped.
   653  //
   654  // Being a batch API, each packet in pkts should have the following
   655  // fields populated:
   656  //  - pkt.EgressRoute
   657  //  - pkt.GSOOptions
   658  //  - pkt.NetworkProtocolNumber
   659  func (e *endpoint) WritePackets(_ stack.RouteInfo, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, tcpip.Error) {
   660  	// Preallocate to avoid repeated reallocation as we append to batch.
   661  	// batchSz is 47 because when SWGSO is in use then a single 65KB TCP
   662  	// segment can get split into 46 segments of 1420 bytes and a single 216
   663  	// byte segment.
   664  	const batchSz = 47
   665  	batch := make([]*stack.PacketBuffer, 0, batchSz)
   666  	batchFD := -1
   667  	sentPackets := 0
   668  	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
   669  		if len(batch) == 0 {
   670  			batchFD = e.fds[pkt.Hash%uint32(len(e.fds))]
   671  		}
   672  		pktFD := e.fds[pkt.Hash%uint32(len(e.fds))]
   673  		if sendNow := pktFD != batchFD; !sendNow {
   674  			batch = append(batch, pkt)
   675  			continue
   676  		}
   677  		n, err := e.sendBatch(batchFD, batch)
   678  		sentPackets += n
   679  		if err != nil {
   680  			return sentPackets, err
   681  		}
   682  		batch = batch[:0]
   683  		batch = append(batch, pkt)
   684  		batchFD = pktFD
   685  	}
   686  
   687  	if len(batch) != 0 {
   688  		n, err := e.sendBatch(batchFD, batch)
   689  		sentPackets += n
   690  		if err != nil {
   691  			return sentPackets, err
   692  		}
   693  	}
   694  	return sentPackets, nil
   695  }
   696  
   697  // viewsEqual tests whether v1 and v2 refer to the same backing bytes.
   698  func viewsEqual(vs1, vs2 []buffer.View) bool {
   699  	return len(vs1) == len(vs2) && (len(vs1) == 0 || &vs1[0] == &vs2[0])
   700  }
   701  
   702  // InjectOutobund implements stack.InjectableEndpoint.InjectOutbound.
   703  func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) tcpip.Error {
   704  	return rawfile.NonBlockingWrite(e.fds[0], packet)
   705  }
   706  
   707  // dispatchLoop reads packets from the file descriptor in a loop and dispatches
   708  // them to the network stack.
   709  func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error {
   710  	for {
   711  		cont, err := inboundDispatcher.dispatch()
   712  		if err != nil || !cont {
   713  			if e.closed != nil {
   714  				e.closed(err)
   715  			}
   716  			return err
   717  		}
   718  	}
   719  }
   720  
   721  // GSOMaxSize implements stack.GSOEndpoint.
   722  func (e *endpoint) GSOMaxSize() uint32 {
   723  	return e.gsoMaxSize
   724  }
   725  
   726  // SupportsHWGSO implements stack.GSOEndpoint.
   727  func (e *endpoint) SupportedGSO() stack.SupportedGSO {
   728  	return e.gsoKind
   729  }
   730  
   731  // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
   732  func (e *endpoint) ARPHardwareType() header.ARPHardwareType {
   733  	if e.hdrSize > 0 {
   734  		return header.ARPHardwareEther
   735  	}
   736  	return header.ARPHardwareNone
   737  }
   738  
   739  // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes
   740  // to the FD, but does not read from it. All reads come from injected packets.
   741  type InjectableEndpoint struct {
   742  	endpoint
   743  
   744  	dispatcher stack.NetworkDispatcher
   745  }
   746  
   747  // Attach saves the stack network-layer dispatcher for use later when packets
   748  // are injected.
   749  func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
   750  	e.dispatcher = dispatcher
   751  }
   752  
   753  // InjectInbound injects an inbound packet.
   754  func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
   755  	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
   756  }
   757  
   758  // NewInjectable creates a new fd-based InjectableEndpoint.
   759  func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) *InjectableEndpoint {
   760  	unix.SetNonblock(fd, true)
   761  
   762  	return &InjectableEndpoint{endpoint: endpoint{
   763  		fds:           []int{fd},
   764  		mtu:           mtu,
   765  		caps:          capabilities,
   766  		writevMaxIovs: rawfile.MaxIovs,
   767  	}}
   768  }