github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/xdp/xdp.go

// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64 || arm64
// +build amd64 arm64

// Package xdp provides tools for working with AF_XDP sockets.
//
// AF_XDP shares a memory area (UMEM) with the kernel to pass packets
// back and forth. Communication is done via a number of queues.
// Briefly, the queues work as follows:
//
//   - Receive: Userspace adds a descriptor to the fill queue. The
//     descriptor points to an area of the UMEM that the kernel should fill
//     with an incoming packet. The packet is filled by the kernel, which
//     places a descriptor to the same UMEM area in the RX queue, signifying
//     that userspace may read the packet.
//   - Transmit: Userspace adds a descriptor to the TX queue. The kernel
//     sends the packet (stored in UMEM) pointed to by the descriptor.
//     Upon completion, the kernel places a descriptor in the completion
//     queue to notify userspace that the packet is sent and the UMEM
//     area can be reused.
//
// So in short: RX packets move from the fill queue to the RX queue, and TX
// packets move from the TX queue to the completion queue.
//
// Note that the shared UMEM for RX and TX means that packet forwarding
// can be done without copying; only the queues need to be updated to point to
// the packet in UMEM.
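//
// A minimal usage sketch, using only functions defined in this file
// (ifaceIdx and queueID are caller-supplied placeholders identifying the
// target device and its RX queue):
//
//	opts := xdp.DefaultReadOnlyOpts()
//	cb, err := xdp.ReadOnlySocket(ifaceIdx, queueID, opts)
//	if err != nil {
//		// Handle the error.
//	}
//	// cb.Fill, cb.RX, cb.TX, and cb.Completion then carry packets as
//	// described above.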
package xdp

import (
	"fmt"
	"math/bits"

	"github.com/ttpreport/gvisor-ligolo/pkg/cleanup"
	"github.com/ttpreport/gvisor-ligolo/pkg/log"
	"github.com/ttpreport/gvisor-ligolo/pkg/memutil"
	"golang.org/x/sys/unix"
)

// A ControlBlock contains all the control structures necessary to use an
// AF_XDP socket.
//
// The ControlBlock and the structures it contains are meant to be used with a
// single RX goroutine and a single TX goroutine.
type ControlBlock struct {
	UMEM       UMEM
	Fill       FillQueue
	RX         RXQueue
	TX         TXQueue
	Completion CompletionQueue
}

// ReadOnlySocketOpts configure a read-only AF_XDP socket.
type ReadOnlySocketOpts struct {
	// NFrames is the number of UMEM frames to allocate.
	NFrames uint32
	// FrameSize is the size of each UMEM frame in bytes.
	FrameSize uint32
	// NDescriptors is the number of descriptors in each queue.
	NDescriptors uint32
}

// DefaultReadOnlyOpts provides recommended default options for initializing a
// read-only AF_XDP socket. AF_XDP setup is extremely finicky and can fail if
// incorrect values are used.
func DefaultReadOnlyOpts() ReadOnlySocketOpts {
	return ReadOnlySocketOpts{
		NFrames: 4096,
		// Frames must be 2048 or 4096 bytes, although not all drivers support
		// both.
		FrameSize:    4096,
		NDescriptors: 2048,
	}
}
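
// For example, a caller that wants smaller frames might start from these
// defaults and override only the frame size before creating the socket
// (a sketch; 2048 is the other frame size accepted by ReadOnlyFromSocket
// below):
//
//	opts := xdp.DefaultReadOnlyOpts()
//	opts.FrameSize = 2048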

// ReadOnlySocket returns an initialized read-only AF_XDP socket bound to a
// particular interface and queue.
func ReadOnlySocket(ifaceIdx, queueID uint32, opts ReadOnlySocketOpts) (*ControlBlock, error) {
	sockfd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0)
	if err != nil {
		return nil, fmt.Errorf("failed to create AF_XDP socket: %v", err)
	}
	return ReadOnlyFromSocket(sockfd, ifaceIdx, queueID, opts)
}

// ReadOnlyFromSocket takes an AF_XDP socket, initializes it, and binds it to a
// particular interface and queue.
func ReadOnlyFromSocket(sockfd int, ifaceIdx, queueID uint32, opts ReadOnlySocketOpts) (*ControlBlock, error) {
	if opts.FrameSize != 2048 && opts.FrameSize != 4096 {
		return nil, fmt.Errorf("invalid frame size %d: must be either 2048 or 4096", opts.FrameSize)
	}
	if bits.OnesCount32(opts.NDescriptors) != 1 {
		return nil, fmt.Errorf("invalid number of descriptors %d: must be a power of 2", opts.NDescriptors)
	}

	var cb ControlBlock

	// Create the UMEM area. Use mmap instead of make([]byte) to ensure
	// that the UMEM is page-aligned. Aligning the UMEM keeps individual
	// packets from spilling over between pages.
	var zerofd uintptr
	umemMemory, err := memutil.MapSlice(
		0,
		uintptr(opts.NFrames*opts.FrameSize),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS,
		zerofd-1, // Pass fd -1 (as a uintptr), as anonymous mappings require.
		0,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap umem: %v", err)
	}
	cleanup := cleanup.Make(func() {
		memutil.UnmapSlice(umemMemory)
	})
	// Run the cleaners on any error return; Release below disarms them on
	// success.
	defer cleanup.Clean()

	if sliceBackingPointer(umemMemory)%uintptr(unix.Getpagesize()) != 0 {
		return nil, fmt.Errorf("UMEM is not page aligned (address 0x%x)", sliceBackingPointer(umemMemory))
	}

	cb.UMEM = UMEM{
		mem:            umemMemory,
		sockfd:         uint32(sockfd),
		frameAddresses: make([]uint64, opts.NFrames),
		nFreeFrames:    opts.NFrames,
		// frameMask rounds a UMEM address down to the start of its
		// frame; this works because FrameSize is a power of two.
		frameMask: ^(uint64(opts.FrameSize) - 1),
	}

	// Fill in each frame address.
	for i := range cb.UMEM.frameAddresses {
		cb.UMEM.frameAddresses[i] = uint64(i) * uint64(opts.FrameSize)
	}
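
	// For example, with the default 4096-byte frames, the frame addresses
	// are 0, 4096, 8192, and so on: the byte offset of each frame from
	// the start of the UMEM.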

	// Check whether we're likely to fail due to RLIMIT_MEMLOCK.
	var rlimit unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlimit); err != nil {
		return nil, fmt.Errorf("failed to get rlimit for memlock: %v", err)
	}
	if rlimit.Cur < uint64(len(cb.UMEM.mem)) {
		log.Infof("UMEM size (%d) may exceed RLIMIT_MEMLOCK (%+v) and cause registration to fail", len(cb.UMEM.mem), rlimit)
	}
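
	// If the UMEM is too large for the limit, raising RLIMIT_MEMLOCK
	// (e.g. via ulimit -l or a setrlimit call) before calling this
	// function is the usual fix.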

	reg := unix.XDPUmemReg{
		Addr: uint64(sliceBackingPointer(umemMemory)),
		Len:  uint64(len(umemMemory)),
		Size: opts.FrameSize,
		// Not useful in the RX path.
		Headroom: 0,
		// TODO(b/240191988): Investigate use of SHARED flag.
		Flags: 0,
	}
	if err := registerUMEM(sockfd, reg); err != nil {
		return nil, fmt.Errorf("failed to register UMEM: %v", err)
	}

	// Set the number of descriptors in the fill queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_FILL_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register fill ring: %v", err)
	}
	// Set the number of descriptors in the completion queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_COMPLETION_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register completion ring: %v", err)
	}
	// Set the number of descriptors in the RX queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_RX_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register RX queue: %v", err)
	}
	// Set the number of descriptors in the TX queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_TX_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register TX queue: %v", err)
	}
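
	// Note that the rings need not share a size; the kernel only requires
	// that each ring size be a power of two. Using opts.NDescriptors for
	// all four keeps this setup simple.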

	// Get offset information for the queues. Offsets indicate where, once
	// we mmap space for each queue, the queue's values are located. They
	// give offsets for the shared pointers, a shared flags value, and the
	// beginning of the ring of descriptors.
	off, err := getOffsets(sockfd)
	if err != nil {
		return nil, fmt.Errorf("failed to get offsets: %v", err)
	}

	// Allocate space for the fill queue.
	fillQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Fr.Desc+uint64(opts.NDescriptors)*sizeOfFillQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_UMEM_PGOFF_FILL_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap fill queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(fillQueueMem)
	})
	// Set up the fill queue with offsets into the allocated memory.
	cb.Fill = FillQueue{
		mem:            fillQueueMem,
		mask:           opts.NDescriptors - 1,
		cachedConsumer: opts.NDescriptors,
	}
	cb.Fill.init(off, opts)

	// Allocate space for the completion queue.
	completionQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Cr.Desc+uint64(opts.NDescriptors)*sizeOfCompletionQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_UMEM_PGOFF_COMPLETION_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap completion queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(completionQueueMem)
	})
	// Set up the completion queue with offsets into the allocated memory.
	cb.Completion = CompletionQueue{
		mem:  completionQueueMem,
		mask: opts.NDescriptors - 1,
	}
	cb.Completion.init(off, opts)

	// Allocate space for the RX queue.
	rxQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Rx.Desc+uint64(opts.NDescriptors)*sizeOfRXQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_PGOFF_RX_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap RX queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(rxQueueMem)
	})
	// Set up the RX queue with offsets into the allocated memory.
	cb.RX = RXQueue{
		mem:  rxQueueMem,
		mask: opts.NDescriptors - 1,
	}
	cb.RX.init(off, opts)

	// Allocate space for the TX queue.
	txQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Tx.Desc+uint64(opts.NDescriptors)*sizeOfTXQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_PGOFF_TX_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap TX queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(txQueueMem)
	})
	// Set up the TX queue with offsets into the allocated memory.
	cb.TX = TXQueue{
		sockfd:         uint32(sockfd),
		mem:            txQueueMem,
		mask:           opts.NDescriptors - 1,
		cachedConsumer: opts.NDescriptors,
	}
	cb.TX.init(off, opts)

	addr := unix.SockaddrXDP{
		// XDP_USE_NEED_WAKEUP lets the driver sleep if there is no
		// work to do. It will need to be woken by poll. It is expected
		// that this improves performance by preventing the driver from
		// burning cycles.
		//
		// By not setting either XDP_COPY or XDP_ZEROCOPY, we instruct
		// the kernel to use zero-copy mode if available and then fall
		// back to copy mode.
		Flags:   unix.XDP_USE_NEED_WAKEUP,
		Ifindex: ifaceIdx,
		// Each AF_XDP socket is bound to a single device RX queue,
		// although multiple sockets on multiple queues (or devices)
		// can share a single UMEM.
		QueueID: queueID,
		// We're not using shared mode, so the value here is irrelevant.
		SharedUmemFD: 0,
	}
	if err := unix.Bind(sockfd, &addr); err != nil {
		return nil, fmt.Errorf("failed to bind with addr %+v: %v", addr, err)
	}

	cleanup.Release()
	return &cb, nil
}