github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/xdp/xdp.go (about)

     1  // Copyright 2022 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build amd64 || arm64
    16  // +build amd64 arm64
    17  
    18  // Package xdp provides tools for working with AF_XDP sockets.
    19  //
    20  // AF_XDP shares a memory area (UMEM) with the kernel to pass packets
    21  // back and forth. Communication is done via a number of queues.
    22  // Briefly, the queues work as follows:
    23  //
    24  //   - Receive: Userspace adds a descriptor to the fill queue. The
    25  //     descriptor points to an area of the UMEM that the kernel should fill
    26  //     with an incoming packet. The packet is filled by the kernel, which
    27  //     places a descriptor to the same UMEM area in the RX queue, signifying
    28  //     that userspace may read the packet.
//   - Transmit: Userspace adds a descriptor to the TX queue. The kernel
    30  //     sends the packet (stored in UMEM) pointed to by the descriptor.
    31  //     Upon completion, the kernel places a descriptor in the completion
    32  //     queue to notify userspace that the packet is sent and the UMEM
    33  //     area can be reused.
    34  //
    35  // So in short: RX packets move from the fill to RX queue, and TX
    36  // packets move from the TX to completion queue.
    37  //
    38  // Note that the shared UMEM for RX and TX means that packet forwarding
    39  // can be done without copying; only the queues need to be updated to point to
    40  // the packet in UMEM.
    41  package xdp
    42  
    43  import (
    44  	"fmt"
    45  	"math/bits"
    46  
    47  	"golang.org/x/sys/unix"
    48  	"github.com/sagernet/gvisor/pkg/cleanup"
    49  	"github.com/sagernet/gvisor/pkg/log"
    50  	"github.com/sagernet/gvisor/pkg/memutil"
    51  )
    52  
// A ControlBlock contains all the control structures necessary to use an
// AF_XDP socket.
//
// The ControlBlock and the structures it contains are meant to be used with a
// single RX goroutine and a single TX goroutine.
type ControlBlock struct {
	// UMEM is the memory area shared with the kernel in which packets are
	// stored.
	UMEM UMEM
	// Fill is the queue on which userspace places descriptors of UMEM areas
	// that the kernel should fill with incoming packets.
	Fill FillQueue
	// RX is the queue on which the kernel places descriptors of UMEM areas
	// holding received packets that userspace may read.
	RX RXQueue
	// TX is the queue on which userspace places descriptors of packets
	// (stored in UMEM) for the kernel to send.
	TX TXQueue
	// Completion is the queue on which the kernel places descriptors of
	// packets that have been sent and whose UMEM area can be reused.
	Completion CompletionQueue
}
    65  
// Opts configure an AF_XDP socket.
type Opts struct {
	// NFrames is the number of frames in the UMEM. The UMEM is
	// NFrames*FrameSize bytes long.
	NFrames uint32
	// FrameSize is the size in bytes of each UMEM frame. NewFromSocket
	// rejects any value other than 2048 or 4096.
	FrameSize uint32
	// NDescriptors is the number of descriptors in each of the fill,
	// completion, RX and TX queues. NewFromSocket requires a power of 2.
	NDescriptors uint32
	// Bind controls whether NewFromSocket binds the socket to the interface
	// and queue itself. When false, binding is left to someone else.
	Bind bool
	// UseNeedWakeup requests the XDP_USE_NEED_WAKEUP flag when binding. It is
	// only consulted when Bind is set.
	UseNeedWakeup bool
}
    74  
    75  // DefaultOpts provides recommended default options for initializing an AF_XDP
    76  // socket. AF_XDP setup is extremely finnicky and can fail if incorrect values
    77  // are used.
    78  func DefaultOpts() Opts {
    79  	return Opts{
    80  		NFrames: 4096,
    81  		// Frames must be 2048 or 4096 bytes, although not all drivers support
    82  		// both.
    83  		FrameSize:    4096,
    84  		NDescriptors: 2048,
    85  	}
    86  }
    87  
    88  // New returns an initialized AF_XDP socket bound to a particular interface and
    89  // queue.
    90  func New(ifaceIdx, queueID uint32, opts Opts) (*ControlBlock, error) {
    91  	sockfd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0)
    92  	if err != nil {
    93  		return nil, fmt.Errorf("failed to create AF_XDP socket: %v", err)
    94  	}
    95  	return NewFromSocket(sockfd, ifaceIdx, queueID, opts)
    96  }
    97  
    98  // NewFromSocket takes an AF_XDP socket, initializes it, and binds it to a
    99  // particular interface and queue.
   100  func NewFromSocket(sockfd int, ifaceIdx, queueID uint32, opts Opts) (*ControlBlock, error) {
   101  	if opts.FrameSize != 2048 && opts.FrameSize != 4096 {
   102  		return nil, fmt.Errorf("invalid frame size %d: must be either 2048 or 4096", opts.FrameSize)
   103  	}
   104  	if bits.OnesCount32(opts.NDescriptors) != 1 {
   105  		return nil, fmt.Errorf("invalid number of descriptors %d: must be a power of 2", opts.NDescriptors)
   106  	}
   107  
   108  	var cb ControlBlock
   109  
   110  	// Create the UMEM area. Use mmap instead of make([[]byte) to ensure
   111  	// that the UMEM is page-aligned. Aligning the UMEM keeps individual
   112  	// packets from spilling over between pages.
   113  	var zerofd uintptr
   114  	umemMemory, err := memutil.MapSlice(
   115  		0,
   116  		uintptr(opts.NFrames*opts.FrameSize),
   117  		unix.PROT_READ|unix.PROT_WRITE,
   118  		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS,
   119  		zerofd-1,
   120  		0,
   121  	)
   122  	if err != nil {
   123  		return nil, fmt.Errorf("failed to mmap umem: %v", err)
   124  	}
   125  	cleanup := cleanup.Make(func() {
   126  		memutil.UnmapSlice(umemMemory)
   127  	})
   128  
   129  	if sliceBackingPointer(umemMemory)%uintptr(unix.Getpagesize()) != 0 {
   130  		return nil, fmt.Errorf("UMEM is not page aligned (address 0x%x)", sliceBackingPointer(umemMemory))
   131  	}
   132  
   133  	cb.UMEM = UMEM{
   134  		mem:            umemMemory,
   135  		sockfd:         uint32(sockfd),
   136  		frameAddresses: make([]uint64, opts.NFrames),
   137  		nFreeFrames:    opts.NFrames,
   138  		frameMask:      ^(uint64(opts.FrameSize) - 1),
   139  	}
   140  
   141  	// Fill in each frame address.
   142  	for i := range cb.UMEM.frameAddresses {
   143  		cb.UMEM.frameAddresses[i] = uint64(i) * uint64(opts.FrameSize)
   144  	}
   145  
   146  	// Check whether we're likely to fail due to RLIMIT_MEMLOCK.
   147  	var rlimit unix.Rlimit
   148  	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlimit); err != nil {
   149  		return nil, fmt.Errorf("failed to get rlimit for memlock: %v", err)
   150  	}
   151  	if rlimit.Cur < uint64(len(cb.UMEM.mem)) {
   152  		log.Infof("UMEM size (%d) may exceed RLIMIT_MEMLOCK (%+v) and cause registration to fail", len(cb.UMEM.mem), rlimit)
   153  	}
   154  
   155  	reg := unix.XDPUmemReg{
   156  		Addr: uint64(sliceBackingPointer(umemMemory)),
   157  		Len:  uint64(len(umemMemory)),
   158  		Size: opts.FrameSize,
   159  		// Not useful in the RX path.
   160  		Headroom: 0,
   161  		// TODO(b/240191988): Investigate use of SHARED flag.
   162  		Flags: 0,
   163  	}
   164  	if err := registerUMEM(sockfd, reg); err != nil {
   165  		return nil, fmt.Errorf("failed to register UMEM: %v", err)
   166  	}
   167  
   168  	// Set the number of descriptors in the fill queue.
   169  	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_FILL_RING, int(opts.NDescriptors)); err != nil {
   170  		return nil, fmt.Errorf("failed to register fill ring: %v", err)
   171  	}
   172  	// Set the number of descriptors in the completion queue.
   173  	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_COMPLETION_RING, int(opts.NDescriptors)); err != nil {
   174  		return nil, fmt.Errorf("failed to register completion ring: %v", err)
   175  	}
   176  	// Set the number of descriptors in the RX queue.
   177  	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_RX_RING, int(opts.NDescriptors)); err != nil {
   178  		return nil, fmt.Errorf("failed to register RX queue: %v", err)
   179  	}
   180  	// Set the number of descriptors in the TX queue.
   181  	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_TX_RING, int(opts.NDescriptors)); err != nil {
   182  		return nil, fmt.Errorf("failed to register TX queue: %v", err)
   183  	}
   184  
   185  	// Get offset information for the queues. Offsets indicate where, once
   186  	// we mmap space for each queue, values in the queue are. They give
   187  	// offsets for the shared pointers, a shared flags value, and the
   188  	// beginning of the ring of descriptors.
   189  	off, err := getOffsets(sockfd)
   190  	if err != nil {
   191  		return nil, fmt.Errorf("failed to get offsets: %v", err)
   192  	}
   193  
   194  	// Allocate space for the fill queue.
   195  	fillQueueMem, err := memutil.MapSlice(
   196  		0,
   197  		uintptr(off.Fr.Desc+uint64(opts.NDescriptors)*sizeOfFillQueueDesc()),
   198  		unix.PROT_READ|unix.PROT_WRITE,
   199  		unix.MAP_SHARED|unix.MAP_POPULATE,
   200  		uintptr(sockfd),
   201  		unix.XDP_UMEM_PGOFF_FILL_RING,
   202  	)
   203  	if err != nil {
   204  		return nil, fmt.Errorf("failed to mmap fill queue: %v", err)
   205  	}
   206  	cleanup.Add(func() {
   207  		memutil.UnmapSlice(fillQueueMem)
   208  	})
   209  	// Setup the fillQueue with offsets into allocated memory.
   210  	cb.Fill = FillQueue{
   211  		mem:            fillQueueMem,
   212  		mask:           opts.NDescriptors - 1,
   213  		cachedConsumer: opts.NDescriptors,
   214  	}
   215  	cb.Fill.init(off, opts)
   216  
   217  	// Allocate space for the completion queue.
   218  	completionQueueMem, err := memutil.MapSlice(
   219  		0,
   220  		uintptr(off.Cr.Desc+uint64(opts.NDescriptors)*sizeOfCompletionQueueDesc()),
   221  		unix.PROT_READ|unix.PROT_WRITE,
   222  		unix.MAP_SHARED|unix.MAP_POPULATE,
   223  		uintptr(sockfd),
   224  		unix.XDP_UMEM_PGOFF_COMPLETION_RING,
   225  	)
   226  	if err != nil {
   227  		return nil, fmt.Errorf("failed to mmap completion queue: %v", err)
   228  	}
   229  	cleanup.Add(func() {
   230  		memutil.UnmapSlice(completionQueueMem)
   231  	})
   232  	// Setup the completionQueue with offsets into allocated memory.
   233  	cb.Completion = CompletionQueue{
   234  		mem:  completionQueueMem,
   235  		mask: opts.NDescriptors - 1,
   236  	}
   237  	cb.Completion.init(off, opts)
   238  
   239  	// Allocate space for the RX queue.
   240  	rxQueueMem, err := memutil.MapSlice(
   241  		0,
   242  		uintptr(off.Rx.Desc+uint64(opts.NDescriptors)*sizeOfRXQueueDesc()),
   243  		unix.PROT_READ|unix.PROT_WRITE,
   244  		unix.MAP_SHARED|unix.MAP_POPULATE,
   245  		uintptr(sockfd),
   246  		unix.XDP_PGOFF_RX_RING,
   247  	)
   248  	if err != nil {
   249  		return nil, fmt.Errorf("failed to mmap RX queue: %v", err)
   250  	}
   251  	cleanup.Add(func() {
   252  		memutil.UnmapSlice(rxQueueMem)
   253  	})
   254  	// Setup the rxQueue with offsets into allocated memory.
   255  	cb.RX = RXQueue{
   256  		mem:  rxQueueMem,
   257  		mask: opts.NDescriptors - 1,
   258  	}
   259  	cb.RX.init(off, opts)
   260  
   261  	// Allocate space for the TX queue.
   262  	txQueueMem, err := memutil.MapSlice(
   263  		0,
   264  		uintptr(off.Tx.Desc+uint64(opts.NDescriptors)*sizeOfTXQueueDesc()),
   265  		unix.PROT_READ|unix.PROT_WRITE,
   266  		unix.MAP_SHARED|unix.MAP_POPULATE,
   267  		uintptr(sockfd),
   268  		unix.XDP_PGOFF_TX_RING,
   269  	)
   270  	if err != nil {
   271  		return nil, fmt.Errorf("failed to mmap tx queue: %v", err)
   272  	}
   273  	cleanup.Add(func() {
   274  		memutil.UnmapSlice(txQueueMem)
   275  	})
   276  	// Setup the txQueue with offsets into allocated memory.
   277  	cb.TX = TXQueue{
   278  		sockfd:         uint32(sockfd),
   279  		mem:            txQueueMem,
   280  		mask:           opts.NDescriptors - 1,
   281  		cachedConsumer: opts.NDescriptors,
   282  	}
   283  	cb.TX.init(off, opts)
   284  
   285  	// In some cases we don't call bind, as we're not in the netns with the
   286  	// device. In those cases, another process with the same socket will
   287  	// bind for us.
   288  	if opts.Bind {
   289  		if err := Bind(sockfd, ifaceIdx, queueID, opts.UseNeedWakeup); err != nil {
   290  			return nil, fmt.Errorf("failed to bind to interface %d: %v", ifaceIdx, err)
   291  		}
   292  	}
   293  
   294  	cleanup.Release()
   295  	return &cb, nil
   296  }
   297  
   298  // Bind binds a socket to a particular network interface and queue.
   299  func Bind(sockfd int, ifindex, queueID uint32, useNeedWakeup bool) error {
   300  	var flags uint16
   301  	if useNeedWakeup {
   302  		flags |= unix.XDP_USE_NEED_WAKEUP
   303  	}
   304  	addr := unix.SockaddrXDP{
   305  		// XDP_USE_NEED_WAKEUP lets the driver sleep if there is no
   306  		// work to do. It will need to be woken by poll. It is expected
   307  		// that this improves performance by preventing the driver from
   308  		// burning cycles.
   309  		//
   310  		// By not setting either XDP_COPY or XDP_ZEROCOPY, we instruct
   311  		// the kernel to use zerocopy if available and then fallback to
   312  		// copy mode.
   313  		Flags:   flags,
   314  		Ifindex: ifindex,
   315  		// AF_XDP sockets are per device RX queue, although multiple
   316  		// sockets on multiple queues (or devices) can share a single
   317  		// UMEM.
   318  		QueueID: queueID,
   319  		// We're not using shared mode, so the value here is irrelevant.
   320  		SharedUmemFD: 0,
   321  	}
   322  	return unix.Bind(sockfd, &addr)
   323  }