github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/link/fdbased/endpoint.go

github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/link/fdbased/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  // Package fdbased provides the implementation of data-link layer endpoints
    19  // backed by boundary-preserving file descriptors (e.g., TUN devices,
    20  // seqpacket/datagram sockets).
    21  //
    22  // FD based endpoints can be used in the networking stack by calling New() to
    23  // create a new endpoint, and then passing it as an argument to
    24  // Stack.CreateNIC().
    25  //
    26  // FD based endpoints can use more than one file descriptor to read incoming
    27  // packets. If there are more than one FDs specified and the underlying FD is an
    28  // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the
    29  // host kernel will consistently hash the packets to the sockets. This ensures
    30  // that packets for the same TCP streams are not reordered.
    31  //
    32  // Similarly if more than one FD's are specified where the underlying FD is not
    33  // AF_PACKET then it's the caller's responsibility to ensure that all inbound
    34  // packets on the descriptors are consistently 5 tuple hashed to one of the
    35  // descriptors to prevent TCP reordering.
    36  //
    37  // Outbound packets are distributed across the FDs using the hash carried by
    38  // each packet (pkt.Hash modulo the number of FDs), so packets with the same
    39  // hash are always written to the same FD. InjectOutbound always writes to
    40  // the first FD.
    41  package fdbased
    42  
    43  import (
    44  	"fmt"
    45  
    46  	"golang.org/x/sys/unix"
    47  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    48  	"github.com/nicocha30/gvisor-ligolo/pkg/buffer"
    49  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    50  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
    51  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header"
    52  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/rawfile"
    53  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack"
    54  )
    55  
// linkDispatcher reads packets from the link FD and dispatches them to the
// NetworkDispatcher.
type linkDispatcher interface {
	// Stop is called by endpoint.Attach(nil) on every inbound dispatcher
	// before Attach waits for the dispatch goroutines to exit.
	Stop()
	// dispatch is called repeatedly by endpoint.dispatchLoop. The loop ends
	// as soon as dispatch returns false or a non-nil error.
	dispatch() (bool, tcpip.Error)
	// release is called once by endpoint.dispatchLoop after the loop ends.
	release()
}
    63  
// PacketDispatchMode enumerates the supported methods of receiving and
// dispatching packets from the underlying FD.
type PacketDispatchMode int
    67  
// BatchSize is the number of packets to write in each syscall. It is 47
// because when GvisorGSO is in use then a single 65KB TCP segment can get
// split into 46 segments of 1420 bytes and a single 216 byte segment
// (46*1420 + 216 = 65536).
//
// WritePackets uses it as the initial capacity of its batch slice.
const BatchSize = 47
    72  
const (
	// Readv is the default dispatch mode (it is the zero value of
	// PacketDispatchMode) and is the least performant of the dispatch
	// options but the one that is supported by all underlying FD types.
	Readv PacketDispatchMode = iota
	// RecvMMsg enables use of recvmmsg() syscall instead of readv() to
	// read inbound packets. This reduces # of syscalls needed to process
	// packets.
	//
	// NOTE: recvmmsg() is only supported for sockets, so if the underlying
	// FD is not a socket then the code will still fall back to the readv()
	// path.
	RecvMMsg
	// PacketMMap enables use of PACKET_RX_RING to receive packets from the
	// NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
	// primary use-case for this is runsc which uses an AF_PACKET FD to
	// receive packets from the veth device.
	PacketMMap
)
    92  
    93  func (p PacketDispatchMode) String() string {
    94  	switch p {
    95  	case Readv:
    96  		return "Readv"
    97  	case RecvMMsg:
    98  		return "RecvMMsg"
    99  	case PacketMMap:
   100  		return "PacketMMap"
   101  	default:
   102  		return fmt.Sprintf("unknown packet dispatch mode '%d'", p)
   103  	}
   104  }
   105  
// Compile-time checks that *endpoint satisfies the stack interfaces it is
// meant to implement.
var _ stack.LinkEndpoint = (*endpoint)(nil)
var _ stack.GSOEndpoint = (*endpoint)(nil)
   108  
// fdInfo describes one file descriptor used by an endpoint.
type fdInfo struct {
	// fd is the file descriptor number.
	fd int
	// isSocket records the result of isSocketFD(fd) at construction time.
	// sendBatch only uses sendmmsg() when it is true.
	isSocket bool
}
   113  
// endpoint is the fd-based link endpoint returned by New.
type endpoint struct {
	// fds is the set of file descriptors each identifying one inbound/outbound
	// channel. The endpoint will dispatch from all inbound channels as well as
	// hash outbound packets to specific channels based on the packet hash.
	fds []fdInfo

	// mtu (maximum transmission unit) is the maximum size of a packet.
	mtu uint32

	// hdrSize specifies the link-layer header size. If set to 0, no header
	// is added/removed; otherwise an ethernet header is used.
	hdrSize int

	// addr is the address of the endpoint.
	addr tcpip.LinkAddress

	// caps holds the endpoint capabilities.
	caps stack.LinkEndpointCapabilities

	// closed is a function to be called when the FD's peer (if any) closes
	// its end of the communication pipe. It may be nil; dispatchLoop calls
	// it (when non-nil) with the error that ended the loop.
	closed func(tcpip.Error)

	// inboundDispatchers holds one dispatcher per entry in fds, in the same
	// order. Attach starts one dispatch goroutine per dispatcher.
	inboundDispatchers []linkDispatcher

	// mu protects dispatcher.
	mu sync.RWMutex
	// dispatcher is the attached network dispatcher, or nil if the endpoint
	// is not attached.
	// +checklocks:mu
	dispatcher stack.NetworkDispatcher

	// packetDispatchMode controls the packet dispatcher used by this
	// endpoint.
	packetDispatchMode PacketDispatchMode

	// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled.
	gsoMaxSize uint32

	// wg keeps track of running goroutines.
	wg sync.WaitGroup

	// gsoKind is the supported kind of GSO.
	gsoKind stack.SupportedGSO

	// maxSyscallHeaderBytes has the same meaning as
	// Options.MaxSyscallHeaderBytes.
	maxSyscallHeaderBytes uintptr

	// writevMaxIovs is the maximum number of iovecs that may be passed to
	// rawfile.NonBlockingWriteIovec, as possibly limited by
	// maxSyscallHeaderBytes. (No analogous limit is defined for
	// rawfile.NonBlockingSendMMsg, since in that case the maximum number of
	// iovecs also depends on the number of mmsghdrs. Instead, if sendBatch
	// encounters a packet whose iovec count is limited by
	// maxSyscallHeaderBytes, it falls back to writing the packet using writev
	// via writePacket.)
	writevMaxIovs int
}
   171  
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
	// FDs is a set of FDs used to read/write packets. New rejects an empty
	// set.
	FDs []int

	// MTU is the mtu to use for this endpoint.
	MTU uint32

	// EthernetHeader if true, indicates that the endpoint should read/write
	// ethernet frames instead of IP packets.
	EthernetHeader bool

	// ClosedFunc is a function to be called when an endpoint's peer (if
	// any) closes its end of the communication pipe.
	ClosedFunc func(tcpip.Error)

	// Address is the link address for this endpoint. Only used if
	// EthernetHeader is true.
	Address tcpip.LinkAddress

	// SaveRestore if true, indicates that this NIC capability set should
	// include CapabilitySaveRestore
	SaveRestore bool

	// DisconnectOk if true, indicates that this NIC capability set should
	// include CapabilityDisconnectOk.
	DisconnectOk bool

	// GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled. New only applies it when at least one of FDs is a socket.
	GSOMaxSize uint32

	// GvisorGSOEnabled indicates whether Gvisor GSO is enabled or not. It
	// selects between gVisor GSO and host GSO when GSOMaxSize is non-zero.
	GvisorGSOEnabled bool

	// PacketDispatchMode specifies the type of inbound dispatcher to be
	// used for this endpoint.
	PacketDispatchMode PacketDispatchMode

	// TXChecksumOffload if true, indicates that this endpoints capability
	// set should include CapabilityTXChecksumOffload.
	TXChecksumOffload bool

	// RXChecksumOffload if true, indicates that this endpoints capability
	// set should include CapabilityRXChecksumOffload.
	RXChecksumOffload bool

	// If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes
	// of struct iovec, msghdr, and mmsghdr that may be passed by each host
	// system call. New rejects negative values.
	MaxSyscallHeaderBytes int

	// AFXDPFD is used with the experimental AF_XDP mode.
	// TODO(b/240191988): Use multiple sockets.
	// TODO(b/240191988): How do we handle the MTU issue?
	AFXDPFD *int

	// InterfaceIndex is the interface index of the underlying device.
	InterfaceIndex int
}
   232  
// fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT
// support in the host kernel. This allows us to use multiple FD's to receive
// from the same underlying NIC. The fanoutID needs to be the same for a given
// set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT
// option for an FD with a fanoutID already in use by another FD for a different
// NIC will return an EINVAL.
//
// Since fanoutID must be unique within the network namespace, we start with
// the PID to avoid collisions. The only way to be sure of avoiding collisions
// is to run in a new network namespace.
//
// New increments it once per endpoint; only the low 16 bits of the resulting
// value are passed to the kernel (see createInboundDispatcher).
var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid()))
   244  
   245  // New creates a new fd-based endpoint.
   246  //
   247  // Makes fd non-blocking, but does not take ownership of fd, which must remain
   248  // open for the lifetime of the returned endpoint (until after the endpoint has
   249  // stopped being using and Wait returns).
   250  func New(opts *Options) (stack.LinkEndpoint, error) {
   251  	caps := stack.LinkEndpointCapabilities(0)
   252  	if opts.RXChecksumOffload {
   253  		caps |= stack.CapabilityRXChecksumOffload
   254  	}
   255  
   256  	if opts.TXChecksumOffload {
   257  		caps |= stack.CapabilityTXChecksumOffload
   258  	}
   259  
   260  	hdrSize := 0
   261  	if opts.EthernetHeader {
   262  		hdrSize = header.EthernetMinimumSize
   263  		caps |= stack.CapabilityResolutionRequired
   264  	}
   265  
   266  	if opts.SaveRestore {
   267  		caps |= stack.CapabilitySaveRestore
   268  	}
   269  
   270  	if opts.DisconnectOk {
   271  		caps |= stack.CapabilityDisconnectOk
   272  	}
   273  
   274  	if len(opts.FDs) == 0 {
   275  		return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified")
   276  	}
   277  
   278  	if opts.MaxSyscallHeaderBytes < 0 {
   279  		return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative")
   280  	}
   281  
   282  	e := &endpoint{
   283  		mtu:                   opts.MTU,
   284  		caps:                  caps,
   285  		closed:                opts.ClosedFunc,
   286  		addr:                  opts.Address,
   287  		hdrSize:               hdrSize,
   288  		packetDispatchMode:    opts.PacketDispatchMode,
   289  		maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes),
   290  		writevMaxIovs:         rawfile.MaxIovs,
   291  	}
   292  	if e.maxSyscallHeaderBytes != 0 {
   293  		if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs {
   294  			e.writevMaxIovs = max
   295  		}
   296  	}
   297  
   298  	// Increment fanoutID to ensure that we don't re-use the same fanoutID
   299  	// for the next endpoint.
   300  	fid := fanoutID.Add(1)
   301  
   302  	// Create per channel dispatchers.
   303  	for _, fd := range opts.FDs {
   304  		if err := unix.SetNonblock(fd, true); err != nil {
   305  			return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err)
   306  		}
   307  
   308  		isSocket, err := isSocketFD(fd)
   309  		if err != nil {
   310  			return nil, err
   311  		}
   312  		e.fds = append(e.fds, fdInfo{fd: fd, isSocket: isSocket})
   313  		if isSocket {
   314  			if opts.GSOMaxSize != 0 {
   315  				if opts.GvisorGSOEnabled {
   316  					e.gsoKind = stack.GvisorGSOSupported
   317  				} else {
   318  					e.gsoKind = stack.HostGSOSupported
   319  				}
   320  				e.gsoMaxSize = opts.GSOMaxSize
   321  			}
   322  		}
   323  
   324  		inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid)
   325  		if err != nil {
   326  			return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err)
   327  		}
   328  		e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
   329  	}
   330  
   331  	return e, nil
   332  }
   333  
   334  func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32) (linkDispatcher, error) {
   335  	// By default use the readv() dispatcher as it works with all kinds of
   336  	// FDs (tap/tun/unix domain sockets and af_packet).
   337  	inboundDispatcher, err := newReadVDispatcher(fd, e)
   338  	if err != nil {
   339  		return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
   340  	}
   341  
   342  	if isSocket {
   343  		sa, err := unix.Getsockname(fd)
   344  		if err != nil {
   345  			return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
   346  		}
   347  		switch sa.(type) {
   348  		case *unix.SockaddrLinklayer:
   349  			// Enable PACKET_FANOUT mode if the underlying socket is of type
   350  			// AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will
   351  			// prevent gvisor from receiving fragmented packets and the host does the
   352  			// reassembly on our behalf before delivering the fragments. This makes it
   353  			// hard to test fragmentation reassembly code in Netstack.
   354  			//
   355  			// See: include/uapi/linux/if_packet.h (struct fanout_args).
   356  			//
   357  			// NOTE: We are using SetSockOptInt here even though the underlying
   358  			// option is actually a struct. The code follows the example in the
   359  			// kernel documentation as described at the link below:
   360  			//
   361  			// See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
   362  			//
   363  			// This works out because the actual implementation for the option zero
   364  			// initializes the structure and will initialize the max_members field
   365  			// to a proper value if zero.
   366  			//
   367  			// See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881
   368  			const fanoutType = unix.PACKET_FANOUT_HASH
   369  			fanoutArg := (int(fID) & 0xffff) | fanoutType<<16
   370  			if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
   371  				return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
   372  			}
   373  		}
   374  
   375  		switch e.packetDispatchMode {
   376  		case PacketMMap:
   377  			inboundDispatcher, err = newPacketMMapDispatcher(fd, e)
   378  			if err != nil {
   379  				return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
   380  			}
   381  		case RecvMMsg:
   382  			// If the provided FD is a socket then we optimize
   383  			// packet reads by using recvmmsg() instead of read() to
   384  			// read packets in a batch.
   385  			inboundDispatcher, err = newRecvMMsgDispatcher(fd, e)
   386  			if err != nil {
   387  				return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
   388  			}
   389  		case Readv:
   390  		default:
   391  			return nil, fmt.Errorf("unknown dispatch mode %d", e.packetDispatchMode)
   392  		}
   393  	}
   394  	return inboundDispatcher, nil
   395  }
   396  
   397  func isSocketFD(fd int) (bool, error) {
   398  	var stat unix.Stat_t
   399  	if err := unix.Fstat(fd, &stat); err != nil {
   400  		return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err)
   401  	}
   402  	return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil
   403  }
   404  
   405  // Attach launches the goroutine that reads packets from the file descriptor and
   406  // dispatches them via the provided dispatcher. If one is already attached,
   407  // then nothing happens.
   408  //
   409  // Attach implements stack.LinkEndpoint.Attach.
   410  func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
   411  	e.mu.Lock()
   412  	defer e.mu.Unlock()
   413  	// nil means the NIC is being removed.
   414  	if dispatcher == nil && e.dispatcher != nil {
   415  		for _, dispatcher := range e.inboundDispatchers {
   416  			dispatcher.Stop()
   417  		}
   418  		e.Wait()
   419  		e.dispatcher = nil
   420  		return
   421  	}
   422  	if dispatcher != nil && e.dispatcher == nil {
   423  		e.dispatcher = dispatcher
   424  		// Link endpoints are not savable. When transportation endpoints are
   425  		// saved, they stop sending outgoing packets and all incoming packets
   426  		// are rejected.
   427  		for i := range e.inboundDispatchers {
   428  			e.wg.Add(1)
   429  			go func(i int) { // S/R-SAFE: See above.
   430  				e.dispatchLoop(e.inboundDispatchers[i])
   431  				e.wg.Done()
   432  			}(i)
   433  		}
   434  	}
   435  }
   436  
// IsAttached implements stack.LinkEndpoint.IsAttached. It reports whether a
// non-nil dispatcher is currently stored on the endpoint.
func (e *endpoint) IsAttached() bool {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.dispatcher != nil
}
   443  
// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
// during construction (Options.MTU for endpoints created by New).
func (e *endpoint) MTU() uint32 {
	return e.mtu
}
   449  
// Capabilities implements stack.LinkEndpoint.Capabilities. It returns the
// capability set fixed at construction time.
func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
	return e.caps
}
   454  
// MaxHeaderLength returns the maximum size of the link-layer header, i.e.
// e.hdrSize: zero when no link header is used, otherwise the ethernet header
// size set by New.
func (e *endpoint) MaxHeaderLength() uint16 {
	return uint16(e.hdrSize)
}
   459  
// LinkAddress returns the link address of this endpoint, as supplied at
// construction time.
func (e *endpoint) LinkAddress() tcpip.LinkAddress {
	return e.addr
}
   464  
// Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop
// reading from its FD, i.e. for every dispatch goroutine started by Attach to
// return.
func (e *endpoint) Wait() {
	e.wg.Wait()
}
   470  
// virtioNetHdr is declared in linux/virtio_net.h.
//
// writePacket and sendBatch prepend a marshalled virtioNetHdr to every
// outbound packet when host GSO is in use.
type virtioNetHdr struct {
	// flags is either zero or _VIRTIO_NET_HDR_F_NEEDS_CSUM.
	flags uint8
	// gsoType is either zero or one of the _VIRTIO_NET_HDR_GSO_* values.
	gsoType uint8
	// hdrLen is set from the packet's header size.
	hdrLen uint16
	// gsoSize is set from the packet's GSO MSS.
	gsoSize uint16
	// csumStart and csumOffset are only set when a checksum is needed.
	csumStart  uint16
	csumOffset uint16
}
   480  
   481  // marshal serializes h to a newly-allocated byte slice, in little-endian byte
   482  // order.
   483  //
   484  // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used
   485  // for general serialization. This makes it difficult to use go-marshal for
   486  // virtio types, as go-marshal implicitly uses the native byte ordering.
   487  func (h *virtioNetHdr) marshal() []byte {
   488  	buf := [virtioNetHdrSize]byte{
   489  		0: byte(h.flags),
   490  		1: byte(h.gsoType),
   491  
   492  		// Manually lay out the fields in little-endian byte order. Little endian =>
   493  		// least significant bit goes to the lower address.
   494  
   495  		2: byte(h.hdrLen),
   496  		3: byte(h.hdrLen >> 8),
   497  
   498  		4: byte(h.gsoSize),
   499  		5: byte(h.gsoSize >> 8),
   500  
   501  		6: byte(h.csumStart),
   502  		7: byte(h.csumStart >> 8),
   503  
   504  		8: byte(h.csumOffset),
   505  		9: byte(h.csumOffset >> 8),
   506  	}
   507  	return buf[:]
   508  }
   509  
// These constants are declared in linux/virtio_net.h.
const (
	// _VIRTIO_NET_HDR_F_NEEDS_CSUM is stored in virtioNetHdr.flags when the
	// packet's GSO options ask for a checksum.
	_VIRTIO_NET_HDR_F_NEEDS_CSUM = 1

	// GSO types stored in virtioNetHdr.gsoType for TCP over IPv4 and IPv6.
	_VIRTIO_NET_HDR_GSO_TCPV4 = 1
	_VIRTIO_NET_HDR_GSO_TCPV6 = 4
)
   517  
   518  // AddHeader implements stack.LinkEndpoint.AddHeader.
   519  func (e *endpoint) AddHeader(pkt stack.PacketBufferPtr) {
   520  	if e.hdrSize > 0 {
   521  		// Add ethernet header if needed.
   522  		eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize))
   523  		eth.Encode(&header.EthernetFields{
   524  			SrcAddr: pkt.EgressRoute.LocalLinkAddress,
   525  			DstAddr: pkt.EgressRoute.RemoteLinkAddress,
   526  			Type:    pkt.NetworkProtocolNumber,
   527  		})
   528  	}
   529  }
   530  
   531  func (e *endpoint) parseHeader(pkt stack.PacketBufferPtr) bool {
   532  	_, ok := pkt.LinkHeader().Consume(e.hdrSize)
   533  	return ok
   534  
   535  }
   536  
   537  // ParseHeader implements stack.LinkEndpoint.ParseHeader.
   538  func (e *endpoint) ParseHeader(pkt stack.PacketBufferPtr) bool {
   539  	if e.hdrSize > 0 {
   540  		return e.parseHeader(pkt)
   541  	}
   542  	return true
   543  }
   544  
   545  // writePacket writes outbound packets to the file descriptor. If it is not
   546  // currently writable, the packet is dropped.
   547  func (e *endpoint) writePacket(pkt stack.PacketBufferPtr) tcpip.Error {
   548  	fdInfo := e.fds[pkt.Hash%uint32(len(e.fds))]
   549  	fd := fdInfo.fd
   550  	var vnetHdrBuf []byte
   551  	if e.gsoKind == stack.HostGSOSupported {
   552  		vnetHdr := virtioNetHdr{}
   553  		if pkt.GSOOptions.Type != stack.GSONone {
   554  			vnetHdr.hdrLen = uint16(pkt.HeaderSize())
   555  			if pkt.GSOOptions.NeedsCsum {
   556  				vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
   557  				vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
   558  				vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
   559  			}
   560  			if uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS {
   561  				switch pkt.GSOOptions.Type {
   562  				case stack.GSOTCPv4:
   563  					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
   564  				case stack.GSOTCPv6:
   565  					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
   566  				default:
   567  					panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
   568  				}
   569  				vnetHdr.gsoSize = pkt.GSOOptions.MSS
   570  			}
   571  		}
   572  		vnetHdrBuf = vnetHdr.marshal()
   573  	}
   574  
   575  	views := pkt.AsSlices()
   576  	numIovecs := len(views)
   577  	if len(vnetHdrBuf) != 0 {
   578  		numIovecs++
   579  	}
   580  	if numIovecs > e.writevMaxIovs {
   581  		numIovecs = e.writevMaxIovs
   582  	}
   583  
   584  	// Allocate small iovec arrays on the stack.
   585  	var iovecsArr [8]unix.Iovec
   586  	iovecs := iovecsArr[:0]
   587  	if numIovecs > len(iovecsArr) {
   588  		iovecs = make([]unix.Iovec, 0, numIovecs)
   589  	}
   590  	iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs)
   591  	for _, v := range views {
   592  		iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs)
   593  	}
   594  	return rawfile.NonBlockingWriteIovec(fd, iovecs)
   595  }
   596  
   597  func (e *endpoint) sendBatch(batchFDInfo fdInfo, pkts []stack.PacketBufferPtr) (int, tcpip.Error) {
   598  	// Degrade to writePacket if underlying fd is not a socket.
   599  	if !batchFDInfo.isSocket {
   600  		var written int
   601  		var err tcpip.Error
   602  		for written < len(pkts) {
   603  			if err = e.writePacket(pkts[written]); err != nil {
   604  				break
   605  			}
   606  			written++
   607  		}
   608  		return written, err
   609  	}
   610  
   611  	// Send a batch of packets through batchFD.
   612  	batchFD := batchFDInfo.fd
   613  	mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts))
   614  	packets := 0
   615  	for packets < len(pkts) {
   616  		mmsgHdrs := mmsgHdrsStorage
   617  		batch := pkts[packets:]
   618  		syscallHeaderBytes := uintptr(0)
   619  		for _, pkt := range batch {
   620  			var vnetHdrBuf []byte
   621  			if e.gsoKind == stack.HostGSOSupported {
   622  				vnetHdr := virtioNetHdr{}
   623  				if pkt.GSOOptions.Type != stack.GSONone {
   624  					vnetHdr.hdrLen = uint16(pkt.HeaderSize())
   625  					if pkt.GSOOptions.NeedsCsum {
   626  						vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
   627  						vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
   628  						vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
   629  					}
   630  					if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS {
   631  						switch pkt.GSOOptions.Type {
   632  						case stack.GSOTCPv4:
   633  							vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
   634  						case stack.GSOTCPv6:
   635  							vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
   636  						default:
   637  							panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
   638  						}
   639  						vnetHdr.gsoSize = pkt.GSOOptions.MSS
   640  					}
   641  				}
   642  				vnetHdrBuf = vnetHdr.marshal()
   643  			}
   644  
   645  			views := pkt.AsSlices()
   646  			numIovecs := len(views)
   647  			if len(vnetHdrBuf) != 0 {
   648  				numIovecs++
   649  			}
   650  			if numIovecs > rawfile.MaxIovs {
   651  				numIovecs = rawfile.MaxIovs
   652  			}
   653  			if e.maxSyscallHeaderBytes != 0 {
   654  				syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec
   655  				if syscallHeaderBytes > e.maxSyscallHeaderBytes {
   656  					// We can't fit this packet into this call to sendmmsg().
   657  					// We could potentially do so if we reduced numIovecs
   658  					// further, but this might incur considerable extra
   659  					// copying. Leave it to the next batch instead.
   660  					break
   661  				}
   662  			}
   663  
   664  			// We can't easily allocate iovec arrays on the stack here since
   665  			// they will escape this loop iteration via mmsgHdrs.
   666  			iovecs := make([]unix.Iovec, 0, numIovecs)
   667  			iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs)
   668  			for _, v := range views {
   669  				iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs)
   670  			}
   671  
   672  			var mmsgHdr rawfile.MMsgHdr
   673  			mmsgHdr.Msg.Iov = &iovecs[0]
   674  			mmsgHdr.Msg.SetIovlen(len(iovecs))
   675  			mmsgHdrs = append(mmsgHdrs, mmsgHdr)
   676  		}
   677  
   678  		if len(mmsgHdrs) == 0 {
   679  			// We can't fit batch[0] into a mmsghdr while staying under
   680  			// e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the
   681  			// mmsghdr (by using writev) and re-buffer iovecs more aggressively
   682  			// if necessary (by using e.writevMaxIovs instead of
   683  			// rawfile.MaxIovs).
   684  			pkt := batch[0]
   685  			if err := e.writePacket(pkt); err != nil {
   686  				return packets, err
   687  			}
   688  			packets++
   689  		} else {
   690  			for len(mmsgHdrs) > 0 {
   691  				sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs)
   692  				if err != nil {
   693  					return packets, err
   694  				}
   695  				packets += sent
   696  				mmsgHdrs = mmsgHdrs[sent:]
   697  			}
   698  		}
   699  	}
   700  
   701  	return packets, nil
   702  }
   703  
   704  // WritePackets writes outbound packets to the underlying file descriptors. If
   705  // one is not currently writable, the packet is dropped.
   706  //
   707  // Being a batch API, each packet in pkts should have the following
   708  // fields populated:
   709  //   - pkt.EgressRoute
   710  //   - pkt.GSOOptions
   711  //   - pkt.NetworkProtocolNumber
   712  func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) {
   713  	// Preallocate to avoid repeated reallocation as we append to batch.
   714  	batch := make([]stack.PacketBufferPtr, 0, BatchSize)
   715  	batchFDInfo := fdInfo{fd: -1, isSocket: false}
   716  	sentPackets := 0
   717  	for _, pkt := range pkts.AsSlice() {
   718  		if len(batch) == 0 {
   719  			batchFDInfo = e.fds[pkt.Hash%uint32(len(e.fds))]
   720  		}
   721  		pktFDInfo := e.fds[pkt.Hash%uint32(len(e.fds))]
   722  		if sendNow := pktFDInfo != batchFDInfo; !sendNow {
   723  			batch = append(batch, pkt)
   724  			continue
   725  		}
   726  		n, err := e.sendBatch(batchFDInfo, batch)
   727  		sentPackets += n
   728  		if err != nil {
   729  			return sentPackets, err
   730  		}
   731  		batch = batch[:0]
   732  		batch = append(batch, pkt)
   733  		batchFDInfo = pktFDInfo
   734  	}
   735  
   736  	if len(batch) != 0 {
   737  		n, err := e.sendBatch(batchFDInfo, batch)
   738  		sentPackets += n
   739  		if err != nil {
   740  			return sentPackets, err
   741  		}
   742  	}
   743  	return sentPackets, nil
   744  }
   745  
// InjectOutbound implements stack.InjectableEndpoint.InjectOutbound. It
// writes packet as-is to the first FD with a non-blocking write; dest is not
// used.
func (e *endpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error {
	return rawfile.NonBlockingWrite(e.fds[0].fd, packet.AsSlice())
}
   750  
   751  // dispatchLoop reads packets from the file descriptor in a loop and dispatches
   752  // them to the network stack.
   753  func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error {
   754  	for {
   755  		cont, err := inboundDispatcher.dispatch()
   756  		if err != nil || !cont {
   757  			if e.closed != nil {
   758  				e.closed(err)
   759  			}
   760  			inboundDispatcher.release()
   761  			return err
   762  		}
   763  	}
   764  }
   765  
// GSOMaxSize implements stack.GSOEndpoint. It returns the maximum GSO packet
// size recorded at construction, which is zero if GSO is disabled.
func (e *endpoint) GSOMaxSize() uint32 {
	return e.gsoMaxSize
}
   770  
// SupportedGSO implements stack.GSOEndpoint. It returns the kind of GSO
// selected at construction time.
func (e *endpoint) SupportedGSO() stack.SupportedGSO {
	return e.gsoKind
}
   775  
   776  // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
   777  func (e *endpoint) ARPHardwareType() header.ARPHardwareType {
   778  	if e.hdrSize > 0 {
   779  		return header.ARPHardwareEther
   780  	}
   781  	return header.ARPHardwareNone
   782  }
   783  
// InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes
// to the FD, but does not read from it. All reads come from injected packets.
type InjectableEndpoint struct {
	endpoint

	// mu protects dispatcher. Note that mu and dispatcher are separate from
	// (and shadow) the identically named fields of the embedded endpoint.
	mu sync.RWMutex
	// dispatcher is the dispatcher that injected inbound packets are
	// delivered to, or nil if none has been attached.
	// +checklocks:mu
	dispatcher stack.NetworkDispatcher
}
   793  
   794  // Attach saves the stack network-layer dispatcher for use later when packets
   795  // are injected.
   796  func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
   797  	e.mu.Lock()
   798  	defer e.mu.Unlock()
   799  	e.dispatcher = dispatcher
   800  }
   801  
// InjectInbound injects an inbound packet. If the endpoint is not attached, the
// packet is not delivered.
func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBufferPtr) {
	// Snapshot the dispatcher under the lock, then deliver without holding
	// it.
	e.mu.RLock()
	d := e.dispatcher
	e.mu.RUnlock()
	if d != nil {
		d.DeliverNetworkPacket(protocol, pkt)
	}
}
   812  
   813  // NewInjectable creates a new fd-based InjectableEndpoint.
   814  func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (*InjectableEndpoint, error) {
   815  	unix.SetNonblock(fd, true)
   816  	isSocket, err := isSocketFD(fd)
   817  	if err != nil {
   818  		return nil, err
   819  	}
   820  
   821  	return &InjectableEndpoint{endpoint: endpoint{
   822  		fds:           []fdInfo{{fd: fd, isSocket: isSocket}},
   823  		mtu:           mtu,
   824  		caps:          capabilities,
   825  		writevMaxIovs: rawfile.MaxIovs,
   826  	}}, nil
   827  }