github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/link/xdp/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  // Package xdp provides link layer endpoints backed by AF_XDP sockets.
    19  package xdp
    20  
    21  import (
    22  	"fmt"
    23  
    24  	"golang.org/x/sys/unix"
    25  	"github.com/sagernet/gvisor/pkg/buffer"
    26  	"github.com/sagernet/gvisor/pkg/sync"
    27  	"github.com/sagernet/gvisor/pkg/tcpip"
    28  	"github.com/sagernet/gvisor/pkg/tcpip/header"
    29  	"github.com/sagernet/gvisor/pkg/tcpip/link/qdisc/fifo"
    30  	"github.com/sagernet/gvisor/pkg/tcpip/link/rawfile"
    31  	"github.com/sagernet/gvisor/pkg/tcpip/link/stopfd"
    32  	"github.com/sagernet/gvisor/pkg/tcpip/stack"
    33  	"github.com/sagernet/gvisor/pkg/xdp"
    34  )
    35  
    36  // TODO(b/240191988): Turn off GSO, GRO, and LRO. Limit veth MTU to 1500.
    37  
    38  // MTU is sized to ensure packets fit inside a 2048 byte XDP frame.
    39  const MTU = 1500
    40  
    41  var _ stack.LinkEndpoint = (*endpoint)(nil)
    42  
    43  type endpoint struct {
    44  	// fd is the underlying AF_XDP socket.
    45  	fd int
    46  
    47  	// addr is the address of the endpoint.
    48  	addr tcpip.LinkAddress
    49  
    50  	// caps holds the endpoint capabilities.
    51  	caps stack.LinkEndpointCapabilities
    52  
    53  	// closed is a function to be called when the FD's peer (if any) closes
    54  	// its end of the communication pipe.
    55  	closed func(tcpip.Error)
    56  
    57  	mu sync.RWMutex
    58  	// +checkloks:mu
    59  	networkDispatcher stack.NetworkDispatcher
    60  
    61  	// wg keeps track of running goroutines.
    62  	wg sync.WaitGroup
    63  
    64  	// control is used to control the AF_XDP socket.
    65  	control *xdp.ControlBlock
    66  
    67  	// stopFD is used to stop the dispatch loop.
    68  	stopFD stopfd.StopFD
    69  }
    70  
    71  // Options specify the details about the fd-based endpoint to be created.
    72  type Options struct {
    73  	// FD is used to read/write packets.
    74  	FD int
    75  
    76  	// ClosedFunc is a function to be called when an endpoint's peer (if
    77  	// any) closes its end of the communication pipe.
    78  	ClosedFunc func(tcpip.Error)
    79  
    80  	// Address is the link address for this endpoint.
    81  	Address tcpip.LinkAddress
    82  
    83  	// SaveRestore if true, indicates that this NIC capability set should
    84  	// include CapabilitySaveRestore
    85  	SaveRestore bool
    86  
    87  	// DisconnectOk if true, indicates that this NIC capability set should
    88  	// include CapabilityDisconnectOk.
    89  	DisconnectOk bool
    90  
    91  	// TXChecksumOffload if true, indicates that this endpoints capability
    92  	// set should include CapabilityTXChecksumOffload.
    93  	TXChecksumOffload bool
    94  
    95  	// RXChecksumOffload if true, indicates that this endpoints capability
    96  	// set should include CapabilityRXChecksumOffload.
    97  	RXChecksumOffload bool
    98  
    99  	// InterfaceIndex is the interface index of the underlying device.
   100  	InterfaceIndex int
   101  
   102  	// Bind is true when we're responsible for binding the AF_XDP socket to
   103  	// a device. When false, another process is expected to bind for us.
   104  	Bind bool
   105  
   106  	// GRO enables generic receive offload.
   107  	GRO bool
   108  }
   109  
   110  // New creates a new endpoint from an AF_XDP socket.
   111  func New(opts *Options) (stack.LinkEndpoint, error) {
   112  	caps := stack.CapabilityResolutionRequired
   113  	if opts.RXChecksumOffload {
   114  		caps |= stack.CapabilityRXChecksumOffload
   115  	}
   116  
   117  	if opts.TXChecksumOffload {
   118  		caps |= stack.CapabilityTXChecksumOffload
   119  	}
   120  
   121  	if opts.SaveRestore {
   122  		caps |= stack.CapabilitySaveRestore
   123  	}
   124  
   125  	if opts.DisconnectOk {
   126  		caps |= stack.CapabilityDisconnectOk
   127  	}
   128  
   129  	if err := unix.SetNonblock(opts.FD, true); err != nil {
   130  		return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", opts.FD, err)
   131  	}
   132  
   133  	ep := &endpoint{
   134  		fd:     opts.FD,
   135  		caps:   caps,
   136  		closed: opts.ClosedFunc,
   137  		addr:   opts.Address,
   138  	}
   139  
   140  	stopFD, err := stopfd.New()
   141  	if err != nil {
   142  		return nil, err
   143  	}
   144  	ep.stopFD = stopFD
   145  
   146  	// Use a 2MB UMEM to match the PACKET_MMAP dispatcher. There will be
   147  	// 1024 UMEM frames, and each queue will have 512 descriptors. Having
   148  	// fewer descriptors than frames prevents RX and TX from starving each
   149  	// other.
   150  	// TODO(b/240191988): Consider different numbers of descriptors for
   151  	// different queues.
   152  	const (
   153  		frameSize = 2048
   154  		umemSize  = 1 << 21
   155  		nFrames   = umemSize / frameSize
   156  	)
   157  	xdpOpts := xdp.Opts{
   158  		NFrames:      nFrames,
   159  		FrameSize:    frameSize,
   160  		NDescriptors: nFrames / 2,
   161  		Bind:         opts.Bind,
   162  	}
   163  	ep.control, err = xdp.NewFromSocket(opts.FD, uint32(opts.InterfaceIndex), 0 /* queueID */, xdpOpts)
   164  	if err != nil {
   165  		return nil, fmt.Errorf("failed to create AF_XDP dispatcher: %v", err)
   166  	}
   167  
   168  	ep.control.UMEM.Lock()
   169  	defer ep.control.UMEM.Unlock()
   170  
   171  	ep.control.Fill.FillAll(&ep.control.UMEM)
   172  
   173  	return ep, nil
   174  }
   175  
   176  // Attach launches the goroutine that reads packets from the file descriptor and
   177  // dispatches them via the provided dispatcher. If one is already attached,
   178  // then nothing happens.
   179  //
   180  // Attach implements stack.LinkEndpoint.Attach.
   181  func (ep *endpoint) Attach(networkDispatcher stack.NetworkDispatcher) {
   182  	ep.mu.Lock()
   183  	defer ep.mu.Unlock()
   184  	// nil means the NIC is being removed.
   185  	if networkDispatcher == nil && ep.IsAttached() {
   186  		ep.stopFD.Stop()
   187  		ep.Wait()
   188  		ep.networkDispatcher = nil
   189  		return
   190  	}
   191  	if networkDispatcher != nil && ep.networkDispatcher == nil {
   192  		ep.networkDispatcher = networkDispatcher
   193  		// Link endpoints are not savable. When transportation endpoints are
   194  		// saved, they stop sending outgoing packets and all incoming packets
   195  		// are rejected.
   196  		ep.wg.Add(1)
   197  		go func() { // S/R-SAFE: See above.
   198  			defer ep.wg.Done()
   199  			for {
   200  				cont, err := ep.dispatch()
   201  				if err != nil || !cont {
   202  					if ep.closed != nil {
   203  						ep.closed(err)
   204  					}
   205  					return
   206  				}
   207  			}
   208  		}()
   209  	}
   210  }
   211  
   212  // IsAttached implements stack.LinkEndpoint.IsAttached.
   213  func (ep *endpoint) IsAttached() bool {
   214  	ep.mu.RLock()
   215  	defer ep.mu.RUnlock()
   216  	return ep.networkDispatcher != nil
   217  }
   218  
   219  // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
   220  // during construction.
   221  func (ep *endpoint) MTU() uint32 {
   222  	return MTU
   223  }
   224  
   225  // Capabilities implements stack.LinkEndpoint.Capabilities.
   226  func (ep *endpoint) Capabilities() stack.LinkEndpointCapabilities {
   227  	return ep.caps
   228  }
   229  
   230  // MaxHeaderLength returns the maximum size of the link-layer header.
   231  func (ep *endpoint) MaxHeaderLength() uint16 {
   232  	return uint16(header.EthernetMinimumSize)
   233  }
   234  
   235  // LinkAddress returns the link address of this endpoint.
   236  func (ep *endpoint) LinkAddress() tcpip.LinkAddress {
   237  	return ep.addr
   238  }
   239  
   240  // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop
   241  // reading from its FD.
   242  func (ep *endpoint) Wait() {
   243  	ep.wg.Wait()
   244  }
   245  
   246  // AddHeader implements stack.LinkEndpoint.AddHeader.
   247  func (ep *endpoint) AddHeader(pkt *stack.PacketBuffer) {
   248  	// Add ethernet header if needed.
   249  	eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize))
   250  	eth.Encode(&header.EthernetFields{
   251  		SrcAddr: pkt.EgressRoute.LocalLinkAddress,
   252  		DstAddr: pkt.EgressRoute.RemoteLinkAddress,
   253  		Type:    pkt.NetworkProtocolNumber,
   254  	})
   255  }
   256  
   257  // ParseHeader implements stack.LinkEndpoint.ParseHeader.
   258  func (ep *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool {
   259  	_, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize)
   260  	return ok
   261  }
   262  
   263  // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
   264  func (ep *endpoint) ARPHardwareType() header.ARPHardwareType {
   265  	return header.ARPHardwareEther
   266  }
   267  
   268  // WritePackets writes outbound packets to the underlying file descriptors. If
   269  // one is not currently writable, the packet is dropped.
   270  //
   271  // Each packet in pkts should have the following fields populated:
   272  //   - pkt.EgressRoute
   273  //   - pkt.NetworkProtocolNumber
   274  //
   275  // The following should not be populated, as GSO is not supported with XDP.
   276  //   - pkt.GSOOptions
   277  func (ep *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) {
   278  	// We expect to be called via fifo, which imposes a limit of
   279  	// fifo.BatchSize.
   280  	var preallocatedBatch [fifo.BatchSize]unix.XDPDesc
   281  	batch := preallocatedBatch[:0]
   282  
   283  	ep.control.UMEM.Lock()
   284  
   285  	ep.control.Completion.FreeAll(&ep.control.UMEM)
   286  
   287  	// Reserve TX queue descriptors and umem buffers
   288  	nReserved, index := ep.control.TX.Reserve(&ep.control.UMEM, uint32(pkts.Len()))
   289  	if nReserved == 0 {
   290  		ep.control.UMEM.Unlock()
   291  		return 0, &tcpip.ErrNoBufferSpace{}
   292  	}
   293  
   294  	// Allocate UMEM space. In order to release the UMEM lock as soon as
   295  	// possible we allocate up-front.
   296  	for _, pkt := range pkts.AsSlice() {
   297  		batch = append(batch, unix.XDPDesc{
   298  			Addr: ep.control.UMEM.AllocFrame(),
   299  			Len:  uint32(pkt.Size()),
   300  		})
   301  	}
   302  
   303  	for i, pkt := range pkts.AsSlice() {
   304  		// Copy packets into UMEM frame.
   305  		frame := ep.control.UMEM.Get(batch[i])
   306  		offset := 0
   307  		var view *buffer.View
   308  		views, pktOffset := pkt.AsViewList()
   309  		for view = views.Front(); view != nil && pktOffset >= view.Size(); view = view.Next() {
   310  			pktOffset -= view.Size()
   311  		}
   312  		offset += copy(frame[offset:], view.AsSlice()[pktOffset:])
   313  		for view = view.Next(); view != nil; view = view.Next() {
   314  			offset += copy(frame[offset:], view.AsSlice())
   315  		}
   316  		ep.control.TX.Set(index+uint32(i), batch[i])
   317  	}
   318  
   319  	// Notify the kernel that there're packets to write.
   320  	ep.control.TX.Notify()
   321  
   322  	// TODO(b/240191988): Explore more fine-grained locking. We shouldn't
   323  	// need to hold the UMEM lock for the whole duration of packet copying.
   324  	ep.control.UMEM.Unlock()
   325  
   326  	return pkts.Len(), nil
   327  }
   328  
   329  func (ep *endpoint) dispatch() (bool, tcpip.Error) {
   330  	var views []*buffer.View
   331  
   332  	for {
   333  		stopped, errno := rawfile.BlockingPollUntilStopped(ep.stopFD.EFD, ep.fd, unix.POLLIN|unix.POLLERR)
   334  		if errno != 0 {
   335  			if errno == unix.EINTR {
   336  				continue
   337  			}
   338  			return !stopped, rawfile.TranslateErrno(errno)
   339  		}
   340  		if stopped {
   341  			return true, nil
   342  		}
   343  
   344  		// Avoid the cost of the poll syscall if possible by peeking
   345  		// until there are no packets left.
   346  		for {
   347  			// We can receive multiple packets at once.
   348  			nReceived, rxIndex := ep.control.RX.Peek()
   349  
   350  			if nReceived == 0 {
   351  				break
   352  			}
   353  
   354  			// Reuse views to avoid allocating.
   355  			views = views[:0]
   356  
   357  			// Populate views quickly so that we can release frames
   358  			// back to the kernel.
   359  			ep.control.UMEM.Lock()
   360  			for i := uint32(0); i < nReceived; i++ {
   361  				// Copy packet bytes into a view and free up the
   362  				// buffer.
   363  				descriptor := ep.control.RX.Get(rxIndex + i)
   364  				data := ep.control.UMEM.Get(descriptor)
   365  				view := buffer.NewView(len(data))
   366  				view.Write(data)
   367  				views = append(views, view)
   368  				ep.control.UMEM.FreeFrame(descriptor.Addr)
   369  			}
   370  			ep.control.Fill.FillAll(&ep.control.UMEM)
   371  			ep.control.UMEM.Unlock()
   372  
   373  			// Process each packet.
   374  			ep.mu.RLock()
   375  			d := ep.networkDispatcher
   376  			ep.mu.RUnlock()
   377  			for i := uint32(0); i < nReceived; i++ {
   378  				view := views[i]
   379  				data := view.AsSlice()
   380  
   381  				netProto := header.Ethernet(data).Type()
   382  
   383  				// Wrap the packet in a PacketBuffer and send it up the stack.
   384  				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   385  					Payload: buffer.MakeWithView(view),
   386  				})
   387  				// AF_XDP packets always have a link header.
   388  				if !ep.ParseHeader(pkt) {
   389  					panic("ParseHeader(_) must succeed")
   390  				}
   391  				d.DeliverNetworkPacket(netProto, pkt)
   392  				pkt.DecRef()
   393  			}
   394  			// Tell the kernel that we're done with these
   395  			// descriptors in the RX queue.
   396  			ep.control.RX.Release(nReceived)
   397  		}
   398  	}
   399  }