github.com/flowerwrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/link/fdbased/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
// Package fdbased provides the implementation of data-link layer endpoints
// backed by boundary-preserving file descriptors (e.g., TUN devices,
// seqpacket/datagram sockets).
//
// FD based endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
//
// FD based endpoints can use more than one file descriptor to read incoming
// packets. If more than one FD is specified and the underlying FDs are
// AF_PACKET sockets, the endpoint is meant to enable FANOUT mode on the
// sockets so that the host kernel consistently hashes packets to them. This
// ensures that packets for the same TCP stream are not reordered.
//
// NOTE: in this version the PACKET_FANOUT setup in createInboundDispatcher is
// commented out, so FANOUT mode is not actually enabled by this package.
//
// Similarly, if more than one FD is specified and the underlying FDs are not
// AF_PACKET sockets, it is the caller's responsibility to ensure that all
// inbound packets on the descriptors are consistently 5-tuple hashed to one of
// the descriptors to prevent TCP reordering.
//
// Since netstack today does not compute 5-tuple hashes for outgoing packets,
// only the first FD is used to write outbound packets. Once 5-tuple hashes for
// all outbound packets are available, all underlying FDs will be used to write
// outbound packets.
    38  package fdbased
    39  
    40  import (
    41  	"fmt"
    42  	"sync"
    43  	"syscall"
    44  
    45  	"github.com/FlowerWrong/netstack/tcpip"
    46  	"github.com/FlowerWrong/netstack/tcpip/buffer"
    47  	"github.com/FlowerWrong/netstack/tcpip/header"
    48  	"github.com/FlowerWrong/netstack/tcpip/link/rawfile"
    49  	"github.com/FlowerWrong/netstack/tcpip/stack"
    50  	"golang.org/x/sys/unix"
    51  )
    52  
// linkDispatcher reads packets from the link FD and dispatches them to the
// NetworkDispatcher.
type linkDispatcher interface {
	// dispatch is called repeatedly by the endpoint's dispatchLoop. The loop
	// keeps calling it for as long as it returns true with a nil error; on a
	// false result or a non-nil error the loop stops and the error is passed
	// to the endpoint's closed callback, if one is set.
	dispatch() (bool, *tcpip.Error)
}
    58  
    59  // PacketDispatchMode are the various supported methods of receiving and
    60  // dispatching packets from the underlying FD.
    61  type PacketDispatchMode int
    62  
    63  const (
    64  	// Readv is the default dispatch mode and is the least performant of the
    65  	// dispatch options but the one that is supported by all underlying FD
    66  	// types.
    67  	Readv PacketDispatchMode = iota
    68  	// RecvMMsg enables use of recvmmsg() syscall instead of readv() to
    69  	// read inbound packets. This reduces # of syscalls needed to process
    70  	// packets.
    71  	//
    72  	// NOTE: recvmmsg() is only supported for sockets, so if the underlying
    73  	// FD is not a socket then the code will still fall back to the readv()
    74  	// path.
    75  	RecvMMsg
    76  	// PacketMMap enables use of PACKET_RX_RING to receive packets from the
    77  	// NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
    78  	// primary use-case for this is runsc which uses an AF_PACKET FD to
    79  	// receive packets from the veth device.
    80  	PacketMMap
    81  )
    82  
    83  func (p PacketDispatchMode) String() string {
    84  	switch p {
    85  	case Readv:
    86  		return "Readv"
    87  	case RecvMMsg:
    88  		return "RecvMMsg"
    89  	case PacketMMap:
    90  		return "PacketMMap"
    91  	default:
    92  		return fmt.Sprintf("unknown packet dispatch mode %v", p)
    93  	}
    94  }
    95  
// endpoint is the fd-based link endpoint created by New. It reads inbound
// packets from its file descriptors through per-FD dispatchers and writes
// outbound packets to the first file descriptor.
type endpoint struct {
	// fds is the set of file descriptors each identifying one inbound/outbound
	// channel. The endpoint dispatches from all inbound channels; outbound
	// packets are currently always written to fds[0] (see WritePacket and
	// WriteRawPacket).
	fds []int

	// mtu (maximum transmission unit) is the maximum size of a packet.
	mtu uint32

	// hdrSize specifies the link-layer header size. If set to 0, no header
	// is added/removed; otherwise an ethernet header is used.
	hdrSize int

	// addr is the address of the endpoint.
	addr tcpip.LinkAddress

	// caps holds the endpoint capabilities.
	caps stack.LinkEndpointCapabilities

	// closed is a function to be called when the FD's peer (if any) closes
	// its end of the communication pipe.
	closed func(*tcpip.Error)

	// inboundDispatchers holds one dispatcher per FD, appended in New;
	// dispatcher is the network dispatcher stored by Attach.
	inboundDispatchers []linkDispatcher
	dispatcher         stack.NetworkDispatcher

	// packetDispatchMode controls the packet dispatcher used by this
	// endpoint.
	packetDispatchMode PacketDispatchMode

	// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled.
	gsoMaxSize uint32

	// wg keeps track of running goroutines.
	wg sync.WaitGroup
}
   133  
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
	// FDs is a set of FDs used to read/write packets. New rejects an empty
	// set.
	FDs []int

	// MTU is the mtu to use for this endpoint.
	MTU uint32

	// EthernetHeader if true, indicates that the endpoint should read/write
	// ethernet frames instead of IP packets.
	EthernetHeader bool

	// ClosedFunc is a function to be called when an endpoint's peer (if
	// any) closes its end of the communication pipe.
	ClosedFunc func(*tcpip.Error)

	// Address is the link address for this endpoint. Only used if
	// EthernetHeader is true.
	Address tcpip.LinkAddress

	// SaveRestore if true, indicates that this NIC capability set should
	// include CapabilitySaveRestore.
	SaveRestore bool

	// DisconnectOk if true, indicates that this NIC capability set should
	// include CapabilityDisconnectOk.
	DisconnectOk bool

	// GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled. New only applies it when at least one FD is a socket.
	GSOMaxSize uint32

	// PacketDispatchMode specifies the type of inbound dispatcher to be
	// used for this endpoint.
	PacketDispatchMode PacketDispatchMode

	// TXChecksumOffload if true, indicates that this endpoint's capability
	// set should include CapabilityTXChecksumOffload.
	TXChecksumOffload bool

	// RXChecksumOffload if true, indicates that this endpoint's capability
	// set should include CapabilityRXChecksumOffload.
	RXChecksumOffload bool
}
   178  
   179  // New creates a new fd-based endpoint.
   180  //
   181  // Makes fd non-blocking, but does not take ownership of fd, which must remain
   182  // open for the lifetime of the returned endpoint (until after the endpoint has
   183  // stopped being using and Wait returns).
   184  func New(opts *Options) (stack.LinkEndpoint, error) {
   185  	caps := stack.LinkEndpointCapabilities(0)
   186  	if opts.RXChecksumOffload {
   187  		caps |= stack.CapabilityRXChecksumOffload
   188  	}
   189  
   190  	if opts.TXChecksumOffload {
   191  		caps |= stack.CapabilityTXChecksumOffload
   192  	}
   193  
   194  	hdrSize := 0
   195  	if opts.EthernetHeader {
   196  		hdrSize = header.EthernetMinimumSize
   197  		caps |= stack.CapabilityResolutionRequired
   198  	}
   199  
   200  	if opts.SaveRestore {
   201  		caps |= stack.CapabilitySaveRestore
   202  	}
   203  
   204  	if opts.DisconnectOk {
   205  		caps |= stack.CapabilityDisconnectOk
   206  	}
   207  
   208  	if len(opts.FDs) == 0 {
   209  		return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified")
   210  	}
   211  
   212  	e := &endpoint{
   213  		fds:                opts.FDs,
   214  		mtu:                opts.MTU,
   215  		caps:               caps,
   216  		closed:             opts.ClosedFunc,
   217  		addr:               opts.Address,
   218  		hdrSize:            hdrSize,
   219  		packetDispatchMode: opts.PacketDispatchMode,
   220  	}
   221  
   222  	// Create per channel dispatchers.
   223  	for i := 0; i < len(e.fds); i++ {
   224  		fd := e.fds[i]
   225  		if err := syscall.SetNonblock(fd, true); err != nil {
   226  			return nil, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err)
   227  		}
   228  
   229  		isSocket, err := isSocketFD(fd)
   230  		if err != nil {
   231  			return nil, err
   232  		}
   233  		if isSocket {
   234  			if opts.GSOMaxSize != 0 {
   235  				e.caps |= stack.CapabilityGSO
   236  				e.gsoMaxSize = opts.GSOMaxSize
   237  			}
   238  		}
   239  		inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket)
   240  		if err != nil {
   241  			return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err)
   242  		}
   243  		e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
   244  	}
   245  
   246  	return e, nil
   247  }
   248  
   249  func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) {
   250  	// By default use the readv() dispatcher as it works with all kinds of
   251  	// FDs (tap/tun/unix domain sockets and af_packet).
   252  	inboundDispatcher, err := newReadVDispatcher(fd, e)
   253  	if err != nil {
   254  		return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
   255  	}
   256  
   257  	if isSocket {
   258  		sa, err := unix.Getsockname(fd)
   259  		if err != nil {
   260  			return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
   261  		}
   262  		switch sa.(type) {
   263  		//case *unix.SockaddrLinklayer:
   264  		//	// enable PACKET_FANOUT mode is the underlying socket is
   265  		//	// of type AF_PACKET.
   266  		//	const fanoutID = 1
   267  		//	const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG
   268  		//	fanoutArg := fanoutID | fanoutType<<16
   269  		//	if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
   270  		//		return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
   271  		//	}
   272  		}
   273  
   274  		switch e.packetDispatchMode {
   275  		case PacketMMap:
   276  			inboundDispatcher, err = newPacketMMapDispatcher(fd, e)
   277  			if err != nil {
   278  				return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
   279  			}
   280  		case RecvMMsg:
   281  			// If the provided FD is a socket then we optimize
   282  			// packet reads by using recvmmsg() instead of read() to
   283  			// read packets in a batch.
   284  			inboundDispatcher, err = newRecvMMsgDispatcher(fd, e)
   285  			if err != nil {
   286  				return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
   287  			}
   288  		}
   289  	}
   290  	return inboundDispatcher, nil
   291  }
   292  
   293  func isSocketFD(fd int) (bool, error) {
   294  	var stat syscall.Stat_t
   295  	if err := syscall.Fstat(fd, &stat); err != nil {
   296  		return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err)
   297  	}
   298  	return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK, nil
   299  }
   300  
   301  // Attach launches the goroutine that reads packets from the file descriptor and
   302  // dispatches them via the provided dispatcher.
   303  func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
   304  	e.dispatcher = dispatcher
   305  	// Link endpoints are not savable. When transportation endpoints are
   306  	// saved, they stop sending outgoing packets and all incoming packets
   307  	// are rejected.
   308  	for i := range e.inboundDispatchers {
   309  		e.wg.Add(1)
   310  		go func(i int) {
   311  			e.dispatchLoop(e.inboundDispatchers[i])
   312  			e.wg.Done()
   313  		}(i)
   314  	}
   315  }
   316  
// IsAttached implements stack.LinkEndpoint.IsAttached. It reports whether the
// endpoint's dispatcher field is non-nil, i.e. whether Attach has stored a
// network dispatcher on this endpoint.
func (e *endpoint) IsAttached() bool {
	return e.dispatcher != nil
}
   321  
// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
// during construction (Options.MTU for endpoints created by New).
func (e *endpoint) MTU() uint32 {
	return e.mtu
}
   327  
// Capabilities implements stack.LinkEndpoint.Capabilities. It returns the
// capability bit set stored on the endpoint.
func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
	return e.caps
}
   332  
// MaxHeaderLength returns the maximum size of the link-layer header, which is
// the endpoint's hdrSize converted to uint16. For endpoints created by New
// this is header.EthernetMinimumSize when ethernet framing is enabled and 0
// otherwise.
func (e *endpoint) MaxHeaderLength() uint16 {
	return uint16(e.hdrSize)
}
   337  
// LinkAddress returns the link address of this endpoint, as stored in the
// addr field at construction time.
func (e *endpoint) LinkAddress() tcpip.LinkAddress {
	return e.addr
}
   342  
// Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop
// reading from its FDs by blocking on the WaitGroup that tracks the dispatch
// goroutines started by Attach.
func (e *endpoint) Wait() {
	e.wg.Wait()
}
   348  
// virtioNetHdr is declared in linux/virtio_net.h. WritePacket places it in
// front of every outbound packet when the endpoint has CapabilityGSO.
type virtioNetHdr struct {
	// flags is set to _VIRTIO_NET_HDR_F_NEEDS_CSUM when checksumming is
	// requested by the GSO descriptor.
	flags uint8
	// gsoType is one of the _VIRTIO_NET_HDR_GSO_* values, or zero.
	gsoType uint8
	// hdrLen is filled from the used length of the packet's header buffer.
	hdrLen uint16
	// gsoSize is filled from the GSO descriptor's MSS.
	gsoSize uint16
	// csumStart and csumOffset are filled in together with the NEEDS_CSUM
	// flag.
	csumStart  uint16
	csumOffset uint16
}
   358  
// These constants are declared in linux/virtio_net.h.
const (
	// _VIRTIO_NET_HDR_F_NEEDS_CSUM is the virtioNetHdr.flags value that
	// WritePacket sets when the GSO descriptor has NeedsCsum set.
	_VIRTIO_NET_HDR_F_NEEDS_CSUM = 1

	// virtioNetHdr.gsoType values used by WritePacket for stack.GSOTCPv4 and
	// stack.GSOTCPv6 respectively.
	_VIRTIO_NET_HDR_GSO_TCPV4 = 1
	_VIRTIO_NET_HDR_GSO_TCPV6 = 4
)
   366  
   367  // WritePacket writes outbound packets to the file descriptor. If it is not
   368  // currently writable, the packet is dropped.
   369  func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
   370  	if e.hdrSize > 0 {
   371  		// Add ethernet header if needed.
   372  		eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize))
   373  		ethHdr := &header.EthernetFields{
   374  			DstAddr: r.RemoteLinkAddress,
   375  			Type:    protocol,
   376  		}
   377  
   378  		// Preserve the src address if it's set in the route.
   379  		if r.LocalLinkAddress != "" {
   380  			ethHdr.SrcAddr = r.LocalLinkAddress
   381  		} else {
   382  			ethHdr.SrcAddr = e.addr
   383  		}
   384  		eth.Encode(ethHdr)
   385  	}
   386  
   387  	if e.Capabilities()&stack.CapabilityGSO != 0 {
   388  		vnetHdr := virtioNetHdr{}
   389  		vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
   390  		if gso != nil {
   391  			vnetHdr.hdrLen = uint16(hdr.UsedLength())
   392  			if gso.NeedsCsum {
   393  				vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
   394  				vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen
   395  				vnetHdr.csumOffset = gso.CsumOffset
   396  			}
   397  			if gso.Type != stack.GSONone && uint16(payload.Size()) > gso.MSS {
   398  				switch gso.Type {
   399  				case stack.GSOTCPv4:
   400  					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
   401  				case stack.GSOTCPv6:
   402  					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
   403  				default:
   404  					panic(fmt.Sprintf("Unknown gso type: %v", gso.Type))
   405  				}
   406  				vnetHdr.gsoSize = gso.MSS
   407  			}
   408  		}
   409  
   410  		return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, hdr.View(), payload.ToView())
   411  	}
   412  
   413  	if payload.Size() == 0 {
   414  		return rawfile.NonBlockingWrite(e.fds[0], hdr.View())
   415  	}
   416  
   417  	return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil)
   418  }
   419  
// WriteRawPacket writes a raw packet directly to the file descriptor. The
// packet bytes are written unmodified to the first FD (e.fds[0]); dest is not
// used by this implementation.
func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
	return rawfile.NonBlockingWrite(e.fds[0], packet)
}
   424  
   425  // dispatchLoop reads packets from the file descriptor in a loop and dispatches
   426  // them to the network stack.
   427  func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error {
   428  	for {
   429  		cont, err := inboundDispatcher.dispatch()
   430  		if err != nil || !cont {
   431  			if e.closed != nil {
   432  				e.closed(err)
   433  			}
   434  			return err
   435  		}
   436  	}
   437  }
   438  
// GSOMaxSize returns the maximum GSO packet size stored on the endpoint. It is
// zero if GSO is disabled.
func (e *endpoint) GSOMaxSize() uint32 {
	return e.gsoMaxSize
}
   443  
// InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes
// to the FD, but does not read from it. All reads come from injected packets.
type InjectableEndpoint struct {
	endpoint

	// dispatcher is the network dispatcher used by Inject. It is a separate
	// field that shadows the dispatcher field of the embedded endpoint;
	// methods promoted from endpoint (such as IsAttached) read the embedded
	// field, not this one.
	dispatcher stack.NetworkDispatcher
}
   451  
   452  // Attach saves the stack network-layer dispatcher for use later when packets
   453  // are injected.
   454  func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
   455  	e.dispatcher = dispatcher
   456  }
   457  
// Inject injects an inbound packet by handing vv to the dispatcher saved by
// Attach, with empty remote and local link addresses. Attach must have been
// called first: the dispatcher is used without a nil check.
func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv)
}
   462  
   463  // NewInjectable creates a new fd-based InjectableEndpoint.
   464  func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) *InjectableEndpoint {
   465  	syscall.SetNonblock(fd, true)
   466  
   467  	return &InjectableEndpoint{endpoint: endpoint{
   468  		fds:  []int{fd},
   469  		mtu:  mtu,
   470  		caps: capabilities,
   471  	}}
   472  }