github.com/dylandreimerink/gobpfld@v0.6.1-0.20220205171531-e79c330ad608/xsk.go (about)

     1  package gobpfld
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"strings"
     9  	"sync"
    10  	"syscall"
    11  	"time"
    12  	"unsafe"
    13  
    14  	bpfSyscall "github.com/dylandreimerink/gobpfld/internal/syscall"
    15  	"github.com/dylandreimerink/gobpfld/kernelsupport"
    16  	"golang.org/x/sys/unix"
    17  )
    18  
// A FrameReader can read whole or partial ethernet frames. Every time ReadFrame is called, p will be filled with up
// to len(p) bytes from a single frame. These bytes include both the header and body of the ethernet frame.
// If p is too small to fit the whole frame, the remaining bytes of the frame are discarded. The next call to ReadFrame
// will start at the next frame.
//
// n will be set to the number of bytes read from the frame. err is non nil if any error has occurred during the
// process. If both n is 0 and err is nil nothing was read for an expected reason like a timeout or external interrupt.
type FrameReader interface {
	ReadFrame(p []byte) (n int, err error)
}
    29  
// A FrameWriter can write whole ethernet frames. WriteFrame transmits the frame contained in p,
// n is set to the number of bytes written and err is non nil if any error occurred while writing.
type FrameWriter interface {
	WriteFrame(p []byte) (n int, err error)
}
    33  
// A FrameLeaser hands out XSKLeases over umem frame memory: ReadLease returns a lease over a
// received frame, WriteLease returns a lease over free memory in which a frame can be built
// for transmission.
type FrameLeaser interface {
	ReadLease() (*XSKLease, error)
	WriteLease() (*XSKLease, error)
}
    38  
// Compile-time assertions that XSKMultiSocket implements the frame interfaces and io.Closer.
var (
	_ FrameReader = (*XSKMultiSocket)(nil)
	_ FrameWriter = (*XSKMultiSocket)(nil)
	_ FrameLeaser = (*XSKMultiSocket)(nil)
	_ io.Closer   = (*XSKMultiSocket)(nil)
)
    45  
// XSKMultiSocket is a collection of XSKSockets. The multi socket balances reads and writes between all XSKSockets.
// This is useful for multi queue netdevices since a XSKSocket can only read or write from one rx/tx queue pair
// at a time. A multi queue allows you to bundle all of these sockets so you get a socket for the whole netdevice.
//
// An alternative use for the multi socket is to add sockets from multiple netdevices.
//
// TODO look into using epoll for multi sockets. Using poll for single sockets still makes sense since there is always
//  1 fd, but for multi sockets we can have much more. For high-end NICs with ~40 rx/tx queues(Mellanox for example)
//  it makes sense to start using epoll since it is supposed to scale better. Should make it configurable when adding
//  support in case freeBSD or other unix-like os adds XSK support since epoll is non-POSIX
//
// TODO dynamic socket adding/removing. Should not be to hard, the main edge case to solve is dealing with
//  pending/blocking syscalls for read/write. But presumably epoll can allow us to dynamically add/remove
//  fds without interrupting the reads/writes. Otherwise adding/removing sockets will have to request both the
//  rmu and wmu.
type XSKMultiSocket struct {
	// the sockets between which reads and writes are balanced
	sockets []*XSKSocket

	// rmu serializes readers (ReadFrame/ReadLease), wmu serializes writers (WriteFrame)
	rmu sync.Mutex
	wmu sync.Mutex

	// round-robin cursors into sockets; they keep their value between calls so all
	// sockets get an equal share of the reads/writes
	readIter  int
	writeIter int

	// poll timeouts in milliseconds (0 = never block, -1 = block forever), see
	// SetReadTimeout/SetWriteTimeout
	readTimeout  int
	writeTimeout int
}
    73  
    74  func NewXSKMultiSocket(xskSockets ...*XSKSocket) (*XSKMultiSocket, error) {
    75  	if len(xskSockets) == 0 {
    76  		return nil, fmt.Errorf("need at least one socket")
    77  	}
    78  
    79  	for _, sock := range xskSockets {
    80  		if sock == nil {
    81  			return nil, fmt.Errorf("socket value can't be nil")
    82  		}
    83  	}
    84  
    85  	return &XSKMultiSocket{
    86  		sockets: xskSockets,
    87  	}, nil
    88  }
    89  
    90  // SetWriteTimeout sets the timeout for Write and XSKLease.WriteBack calls.
    91  // If ms == 0 (default), we will never block/wait and error if we can't write at once.
    92  // If ms == -1, we will block forever until we can write.
    93  // If ms > 0, we will wait for x miliseconds for an oppurunity to write or error afterwards.
    94  func (xms *XSKMultiSocket) SetWriteTimeout(ms int) error {
    95  	if ms < -1 {
    96  		return fmt.Errorf("timeout must be -1, 0, or positive amount of miliseconds")
    97  	}
    98  
    99  	xms.writeTimeout = ms
   100  
   101  	return nil
   102  }
   103  
   104  // SetReadTimeout sets the timeout for Read and ReadLease calls.
   105  // If ms == 0 (default), we will never block/wait and return no data if there isn't any ready.
   106  // If ms == -1, we will block forever until we can read.
   107  // If ms > 0, we will wait for x miliseconds for an oppurunity to read or return no data.
   108  func (xms *XSKMultiSocket) SetReadTimeout(ms int) error {
   109  	if ms < -1 {
   110  		return fmt.Errorf("timeout must be -1, 0, or positive amount of miliseconds")
   111  	}
   112  
   113  	xms.readTimeout = ms
   114  
   115  	return nil
   116  }
   117  
   118  func (xms *XSKMultiSocket) ReadFrame(p []byte) (n int, err error) {
   119  	xms.rmu.Lock()
   120  	defer xms.rmu.Unlock()
   121  
   122  	var (
   123  		desc *descriptor
   124  		sock *XSKSocket
   125  	)
   126  	pollFds := make([]unix.PollFd, len(xms.sockets))
   127  
   128  	// Save the current iter value, we need it during poll resolving
   129  	curItr := xms.readIter
   130  
   131  	// Check every socket in case there is a frame ready
   132  	for i := 0; i < len(xms.sockets); i++ {
   133  		// Use readIter which keeps it value between calls to Read this ensures that we attempt to read from
   134  		// all sockets equally
   135  		sock = xms.sockets[xms.readIter]
   136  		desc = sock.rx.Dequeue()
   137  
   138  		xms.readIter++
   139  		if xms.readIter >= len(xms.sockets) {
   140  			xms.readIter = 0
   141  		}
   142  
   143  		if desc != nil {
   144  			break
   145  		}
   146  
   147  		pollFds[i] = unix.PollFd{
   148  			Fd:     int32(xms.sockets[xms.readIter].fd),
   149  			Events: unix.POLLIN,
   150  		}
   151  	}
   152  
   153  	// If none of the sockets have frames ready at the moment, we poll to wait
   154  	if desc == nil {
   155  		n, err := unix.Poll(pollFds, xms.readTimeout)
   156  		if err != nil {
   157  			// Sometimes a poll is interrupted by a signal, no a real error
   158  			// lets treat it like a timeout
   159  			if err == syscall.EINTR {
   160  				return 0, nil
   161  			}
   162  
   163  			return 0, fmt.Errorf("poll: %w", err)
   164  		}
   165  
   166  		// If n == 0, the timeout was reached
   167  		if n == 0 {
   168  			return 0, nil
   169  		}
   170  
   171  		// now that there is at least one socket with a frame, we need to find it
   172  		for i := 0; i < len(xms.sockets); i++ {
   173  			if pollFds[i].Revents&unix.POLLIN > 0 {
   174  				sock = xms.sockets[curItr]
   175  				desc = sock.rx.Dequeue()
   176  				if desc != nil {
   177  					break
   178  				}
   179  			}
   180  
   181  			curItr++
   182  			if curItr >= len(xms.sockets) {
   183  				curItr = 0
   184  			}
   185  		}
   186  
   187  		// If poll returned n>0 but dequeueing still failed
   188  		if desc == nil {
   189  			return 0, nil
   190  		}
   191  	}
   192  
   193  	len := copy(p, sock.umem[desc.addr:desc.addr+uint64(desc.len)])
   194  	err = sock.fill.Enqueue(addrToFrameStart(desc.addr, sock.settings.FrameSize))
   195  	if err != nil {
   196  		return len, fmt.Errorf("fill enqueue: %w", err)
   197  	}
   198  
   199  	err = sock.wakeupFill()
   200  	if err != nil {
   201  		return len, err
   202  	}
   203  
   204  	return len, nil
   205  }
   206  
   207  func (xms *XSKMultiSocket) WriteFrame(p []byte) (n int, err error) {
   208  	xms.wmu.Lock()
   209  	defer xms.wmu.Unlock()
   210  
   211  	pollFds := make([]unix.PollFd, len(xms.sockets))
   212  
   213  	// Check every socket in case there is a frame ready
   214  	for i := 0; i < len(xms.sockets); i++ {
   215  		pollFds[i] = unix.PollFd{
   216  			Fd:     int32(xms.sockets[xms.writeIter].fd),
   217  			Events: unix.POLLOUT,
   218  		}
   219  
   220  		xms.writeIter++
   221  		if xms.writeIter >= len(xms.sockets) {
   222  			xms.writeIter = 0
   223  		}
   224  	}
   225  
   226  	n, err = unix.Poll(pollFds, xms.writeTimeout)
   227  	if err != nil {
   228  		return 0, fmt.Errorf("poll: %w", err)
   229  	}
   230  
   231  	if n == 0 {
   232  		return 0, fmt.Errorf("timeout")
   233  	}
   234  
   235  	var (
   236  		addr uint64
   237  		sock *XSKSocket
   238  	)
   239  	for i := 0; i < len(xms.sockets); i++ {
   240  		sock = xms.sockets[xms.writeIter]
   241  
   242  		xms.writeIter++
   243  		if xms.writeIter >= len(xms.sockets) {
   244  			xms.writeIter = 0
   245  		}
   246  
   247  		if pollFds[i].Revents&unix.POLLOUT > 0 {
   248  			addr = <-sock.txAddrs
   249  			break
   250  		}
   251  	}
   252  
   253  	len := copy(sock.umem[addr:addr+uint64(len(p))], p)
   254  
   255  	err = sock.enqueueTx(descriptor{
   256  		addr: addr,
   257  		len:  uint32(len),
   258  	})
   259  	if err != nil {
   260  		sock.txAddrs <- addr
   261  		return 0, err
   262  	}
   263  
   264  	err = sock.wakeupTx()
   265  	if err != nil {
   266  		return 0, err
   267  	}
   268  
   269  	return len, nil
   270  }
   271  
   272  func (xms *XSKMultiSocket) Close() error {
   273  	for _, sock := range xms.sockets {
   274  		err := sock.Close()
   275  		if err != nil {
   276  			return err
   277  		}
   278  	}
   279  
   280  	return nil
   281  }
   282  
   283  // WriteLease creates a XSKLease which points to a piece of preallocated memory. This memory can be used to
   284  // build packets for writing. Unlike XSKLeases gotten from ReadLease, write leases have no Headroom.
   285  // The Data slice of the lease is the full length of the usable frame, this length should not be exceeded.
   286  // Any memory held by the lease can't be reused until released or written.
   287  //
   288  // This function blocks until a frame for transmission is available and is not subject to the write timeout.
   289  func (xms *XSKMultiSocket) WriteLease() (lease *XSKLease, err error) {
   290  	sock := xms.sockets[xms.writeIter]
   291  	xms.writeIter++
   292  	if xms.writeIter >= len(xms.sockets) {
   293  		xms.writeIter = 0
   294  	}
   295  
   296  	addr := <-sock.txAddrs
   297  	return &XSKLease{
   298  		Headroom: 0,
   299  		Data:     sock.umem[addr : addr+uint64(sock.settings.FrameSize)],
   300  		dataAddr: addr,
   301  		sock:     sock,
   302  		fromTx:   true,
   303  	}, nil
   304  }
   305  
   306  // ReadLease reads a frame from the socket and returns its memory in a XSKLease. After reading the contents of the
   307  // frame it can be released or written, both will allow the memory to be reused. Calling Write on the lease will
   308  // cause the contents of Data to be written back to the network interface. The contents of Data can be modified
   309  // before calling Write thus allowing a program to implement zero-copy/zero-allocation encaptulation or
   310  // request/response protocols.
   311  func (xms *XSKMultiSocket) ReadLease() (lease *XSKLease, err error) {
   312  	xms.rmu.Lock()
   313  	defer xms.rmu.Unlock()
   314  
   315  	var (
   316  		desc *descriptor
   317  		sock *XSKSocket
   318  	)
   319  	pollFds := make([]unix.PollFd, len(xms.sockets))
   320  
   321  	// Save the current iter value, we need it during poll resolving
   322  	curItr := xms.readIter
   323  
   324  	// Check every socket in case there is a frame ready
   325  	for i := 0; i < len(xms.sockets); i++ {
   326  		// Use readIter which keeps it value between calls to Read this ensures that we attempt to read from
   327  		// all sockets equally
   328  		sock = xms.sockets[xms.readIter]
   329  		desc = sock.rx.Dequeue()
   330  
   331  		xms.readIter++
   332  		if xms.readIter >= len(xms.sockets) {
   333  			xms.readIter = 0
   334  		}
   335  
   336  		if desc != nil {
   337  			break
   338  		}
   339  
   340  		pollFds[i] = unix.PollFd{
   341  			Fd:     int32(xms.sockets[xms.readIter].fd),
   342  			Events: unix.POLLIN,
   343  		}
   344  	}
   345  
   346  	// If none of the sockets have frames ready at the moment, we poll to wait
   347  	if desc == nil {
   348  		n, err := unix.Poll(pollFds, xms.readTimeout)
   349  		if err != nil {
   350  			// Sometimes a poll is interrupted by a signal, no a real error
   351  			// lets treat it like a timeout
   352  			if err == syscall.EINTR {
   353  				return nil, nil
   354  			}
   355  
   356  			return nil, fmt.Errorf("poll: %w", err)
   357  		}
   358  
   359  		// If n == 0, the timeout was reached
   360  		if n == 0 {
   361  			return nil, nil
   362  		}
   363  
   364  		// now that there is at least one socket with a frame, we need to find it
   365  		for i := 0; i < len(xms.sockets); i++ {
   366  			if pollFds[i].Revents&unix.POLLIN > 0 {
   367  				sock = xms.sockets[curItr]
   368  				desc = sock.rx.Dequeue()
   369  				if desc != nil {
   370  					break
   371  				}
   372  			}
   373  
   374  			curItr++
   375  			if curItr >= len(xms.sockets) {
   376  				curItr = 0
   377  			}
   378  		}
   379  
   380  		// If poll returned n>0 but dequeueing still failed
   381  		if desc == nil {
   382  			return nil, nil
   383  		}
   384  	}
   385  
   386  	return &XSKLease{
   387  		Headroom: sock.settings.Headroom,
   388  		Data:     sock.umem[desc.addr-uint64(sock.settings.Headroom) : desc.addr+uint64(desc.len)],
   389  		dataAddr: desc.addr,
   390  		sock:     sock,
   391  	}, nil
   392  }
   393  
// XSKLease is used to "lease" a piece of buffer memory from the socket and return it after the user
// is done using it. This allows us to implement true zero copy packet access.
// After a XSKLease is released or written the underlying array of Data will be repurposed, to avoid strange bugs
// users must not use Data or sub-slices of Data after the lease has been released.
type XSKLease struct {
	// The leased frame memory, a sub-slice of the socket's umem.
	Data []byte
	// The amount of bytes which are prefixed at the start which don't contain frame data.
	// This headroom can be used to add an extra header(encapsulation) without having to
	// copy or move the existing packet data.
	Headroom int
	// dataAddr is the memory address at the start of the headroom.
	dataAddr uint64
	// the socket whose umem Data points into; its rings/channels receive the frame back on release
	sock *XSKSocket
	// If true the frame address originates from the txAddrs chan
	fromTx bool
}
   410  
   411  // Release releases the leased memory so the kernel can fill it with new data.
   412  func (xl *XSKLease) Release() error {
   413  	// Remove reference to Data since it is invalid from now
   414  	xl.Data = nil
   415  
   416  	frameAddr := addrToFrameStart(xl.dataAddr, xl.sock.settings.FrameSize)
   417  
   418  	// If the this is a tx lease, we can just return the unused address to the txAddrs buffer
   419  	if xl.fromTx {
   420  		xl.sock.txAddrs <- frameAddr
   421  	} else {
   422  		// else, this lease was a rx lease in which case it must be returned to the fill ring
   423  
   424  		xl.sock.fmu.Lock()
   425  		defer xl.sock.fmu.Unlock()
   426  
   427  		// Enqueue the address of the frame on the fill queue so it can be reused
   428  		err := xl.sock.fill.Enqueue(frameAddr)
   429  		if err != nil {
   430  			return fmt.Errorf("enqueue fill: %w", err)
   431  		}
   432  
   433  		err = xl.sock.wakeupFill()
   434  		if err != nil {
   435  			return err
   436  		}
   437  	}
   438  
   439  	return nil
   440  }
   441  
// Write writes a lease to the network interface. The len property of the 'Data' slice - 'Headroom' is the length of
// the packet. Make sure to resize the Data to the size of the data to be transmitted.
// The headroom should always be included(never resize the start of the slice). The 'Headroom' should be used
// to indicate from which byte the headroom starts.
// After Write has been called the lease will be released and the Data slice or its subslices should not
// be used anymore.
func (xl *XSKLease) Write() error {
	xl.sock.wmu.Lock()
	defer xl.sock.wmu.Unlock()

	// Data may have been resized by the caller; it must never exceed a single umem frame
	if len(xl.Data) > xl.sock.settings.FrameSize {
		return fmt.Errorf("lease has been expanded beyond framesize, can't transmit")
	}

	err := xl.sock.enqueueTx(descriptor{
		// When enqueueing, we don't want to send the headroom bytes
		addr: xl.dataAddr + uint64(xl.Headroom),
		// Data should contain headroom + packet, since we will not be sending headroom
		// we need to subtract the amount of headroom from the length of Data to get the correct packet length
		len: uint32(len(xl.Data) - xl.Headroom),
	})
	if err != nil {
		return fmt.Errorf("tx enqueue: %w", err)
	}

	// Notify the kernel of the newly enqueued tx frame
	err = xl.sock.wakeupTx()
	if err != nil {
		return err
	}

	// If the lease was from the fill->rx lifecycle
	if !xl.fromTx {
		// Since a frame from the fill->rx lifecycle was used to transmit, we will now get a frame from
		// the tx->completion lifecycle and insert it into the fill ring so we end up with the same
		// amount of frames available for both cycles. If we don't do this the fill->rx cycle will run
		// out of frames.
		// NOTE(review): this receive blocks until a tx frame address becomes available — presumably a
		// completed transmission always replenishes txAddrs; confirm against the completion consumer.
		addr := <-xl.sock.txAddrs

		err := xl.sock.fill.Enqueue(addr)
		if err != nil {
			return fmt.Errorf("fill enqueue: %w", err)
		}

		err = xl.sock.wakeupFill()
		if err != nil {
			return err
		}
	}

	// Set data to nil to indicate that it is no longer valid to use
	xl.Data = nil

	return nil
}
   499  
   500  // The addresses we get back from the rx ring have offsets due to headspacing, both user configured
   501  // and default headspacing created by the network driver. This function round the address
   502  // to the nearest start of a frame in umem when re-enqueueing the frame address
   503  // https://www.spinics.net/lists/xdp-newbies/msg01479.html
   504  func addrToFrameStart(addr uint64, frameSize int) uint64 {
   505  	return (addr / uint64(frameSize)) * uint64(frameSize)
   506  }
   507  
// xskDescRing is a ring buffer containing descriptors, used for the rx and tx rings
type xskDescRing struct {
	xskRing
}
   512  
   513  func (dr *xskDescRing) Dequeue() *descriptor {
   514  	producer := (*uint32)(dr.producer)
   515  	consumer := (*uint32)(dr.consumer)
   516  
   517  	if (*producer - *consumer) == 0 {
   518  		return nil
   519  	}
   520  
   521  	// The linux kernel uses the wraparound of an integer to reset the consumer and
   522  	// producer. And since ring buffers are always a factor of 2 we can just throw away
   523  	// all bits which fall outsize of this size to get a always increasing offset
   524  	// between 0 and dr.elemCount
   525  	off := *consumer & (dr.elemCount - 1)
   526  	desc := (*descriptor)(unsafe.Pointer(uintptr(dr.ring) + uintptr(off)*descSize))
   527  
   528  	*consumer++
   529  
   530  	return desc
   531  }
   532  
// Enqueue writes desc into the slot at the current producer position and advances the producer.
// Returns errBufferFull when there is no room left in the ring.
func (dr *xskDescRing) Enqueue(desc descriptor) error {
	producer := (*uint32)(dr.producer)
	consumer := (*uint32)(dr.consumer)

	// NOTE(review): with free-running counters a full ring is normally diff == elemCount;
	// this check rejects at elemCount-1, permanently leaving one slot unused — confirm
	// whether that off-by-one is intentional.
	if (*producer - *consumer) == dr.elemCount-1 {
		return errBufferFull
	}

	// The linux kernel uses the wraparound of an integer to reset the consumer and
	// producer. And since ring buffer sizes are always a power of 2 we can just throw away
	// all bits which fall outside of this size to get an always increasing offset
	// between 0 and dr.elemCount
	off := *producer & (dr.elemCount - 1)

	// Write the descriptor to the slot at the current producer pos
	*(*descriptor)(unsafe.Pointer(uintptr(dr.ring) + uintptr(off)*descSize)) = desc

	*producer++

	return nil
}
   555  
// xskAddrRing is a ring buffer containing addresses (uint64), used for the fill and completion rings
type xskAddrRing struct {
	xskRing
}
   560  
   561  const addrSize = unsafe.Sizeof(uint64(0))
   562  
   563  func (ar *xskAddrRing) Dequeue() *uint64 {
   564  	producer := (*uint32)(ar.producer)
   565  	consumer := (*uint32)(ar.consumer)
   566  
   567  	if (*producer - *consumer) == 0 {
   568  		return nil
   569  	}
   570  
   571  	// The linux kernel uses the wraparound of an integer to reset the consumer and
   572  	// producer. And since ring buffers are always a factor of 2 we can just throw away
   573  	// all bits which fall outsize of this size to get a always increasing offset
   574  	// between 0 and ar.elemCount
   575  	off := *consumer & (ar.elemCount - 1)
   576  	addr := (*uint64)(unsafe.Pointer(uintptr(ar.ring) + uintptr(off)*addrSize))
   577  
   578  	*consumer++
   579  
   580  	return addr
   581  }
   582  
   583  var errBufferFull = errors.New("ring buffer is full")
   584  
// Enqueue writes addr into the slot at the current producer position and advances the producer.
// Returns errBufferFull when there is no room left in the ring.
func (ar *xskAddrRing) Enqueue(addr uint64) error {
	producer := (*uint32)(ar.producer)
	consumer := (*uint32)(ar.consumer)

	// NOTE(review): with free-running counters a full ring is normally diff == elemCount;
	// this check rejects at elemCount-1, permanently leaving one slot unused — confirm
	// whether that off-by-one is intentional (same pattern as xskDescRing.Enqueue).
	if (*producer - *consumer) == ar.elemCount-1 {
		return errBufferFull
	}

	// The linux kernel uses the wraparound of an integer to reset the consumer and
	// producer. And since ring buffer sizes are always a power of 2 we can just throw away
	// all bits which fall outside of this size to get an always increasing offset
	// between 0 and ar.elemCount
	off := *producer & (ar.elemCount - 1)

	// Write the address to the slot at the current producer pos
	*(*uint64)(unsafe.Pointer(uintptr(ar.ring) + uintptr(off)*addrSize)) = addr

	*producer++

	return nil
}
   607  
// xskRing is the common layout shared by the rx/tx/fill/completion rings: a mmap'd region
// containing a producer counter, a consumer counter, the ring slots and a flags field.
type xskRing struct {
	// Hold a reference to the mmap so we can unmap it later
	mmap      []byte
	elemCount uint32
	// producer points at the ring's free-running producer counter (a uint32 inside the mmap),
	// advanced by whichever side writes entries into the ring
	producer unsafe.Pointer
	// consumer points at the ring's free-running consumer counter (a uint32 inside the mmap),
	// advanced by whichever side reads entries from the ring
	consumer unsafe.Pointer
	// A pointer to the start of the ring buffer slots
	ring unsafe.Pointer
	// A pointer to the ring's flags field inside the mmap
	flags unsafe.Pointer
}
   620  
   621  func (xr *xskRing) Close() error {
   622  	if xr.mmap != nil {
   623  		return syscall.Munmap(xr.mmap)
   624  	}
   625  	xr.mmap = nil
   626  
   627  	return nil
   628  }
   629  
   630  func newXskRing(mmap []byte, off ringOffset, elemCount uint32) xskRing {
   631  	return xskRing{
   632  		mmap:      mmap,
   633  		consumer:  unsafe.Pointer(&mmap[off.consumer]),
   634  		producer:  unsafe.Pointer(&mmap[off.producer]),
   635  		ring:      unsafe.Pointer(&mmap[off.desc]),
   636  		flags:     unsafe.Pointer(&mmap[off.flags]),
   637  		elemCount: elemCount,
   638  	}
   639  }
   640  
// https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/if_xdp.h

// struct xdp_umem_reg {
// 	__u64 addr; /* Start of packet data area */
// 	__u64 len; /* Length of packet data area */
// 	__u32 chunk_size;
// 	__u32 headroom;
// 	__u32 flags;
// };

// umemReg mirrors struct xdp_umem_reg; it is passed to setsockopt(XDP_UMEM_REG) to
// register the umem buffer with the kernel.
type umemReg struct {
	addr      uint64
	len       uint64
	chunkSize uint32
	headroom  uint32
	flags     uint32 //nolint:structcheck // unused reserved for future use
}
   657  
// struct xdp_ring_offset {
// 	__u64 producer;
// 	__u64 consumer;
// 	__u64 desc;
// 	__u64 flags;
// };

// ringOffset mirrors struct xdp_ring_offset: the byte offsets of a ring's fields
// within its mmap'd region.
type ringOffset struct {
	producer uint64
	consumer uint64
	desc     uint64
	flags    uint64
}
   670  
// ringOffsetNoFlags is ringOffset without the flags field — presumably the layout reported
// by kernels that predate the flags offset; confirm against the code reading mmap offsets.
type ringOffsetNoFlags struct {
	producer uint64
	consumer uint64
	desc     uint64
}
   676  
// struct xdp_mmap_offsets {
// 	struct xdp_ring_offset rx;
// 	struct xdp_ring_offset tx;
// 	struct xdp_ring_offset fr; /* Fill */
// 	struct xdp_ring_offset cr; /* Completion */
// };

// mmapOffsets mirrors struct xdp_mmap_offsets, holding the ring offsets for all four rings.
type mmapOffsets struct {
	rx ringOffset
	tx ringOffset
	fr ringOffset
	cr ringOffset
}
   689  
// struct xdp_desc {
// 	__u64 addr;
// 	__u32 len;
// 	__u32 options;
// };

// descriptor mirrors struct xdp_desc, one rx/tx ring entry describing a frame in umem.
type descriptor struct {
	// umem offset of the frame data
	addr uint64
	// length of the frame data in bytes
	len uint32
	// options is reserved and not used, setting it to anything other than 0 is invalid in 5.12.2
	// https://elixir.bootlin.com/linux/v5.12.2/source/net/xdp/xsk_queue.h#L141
	options uint32 //nolint:structcheck // not used but reserved for future use (also for descSize)
}

// descSize is the size in bytes of one rx/tx ring slot.
var descSize = unsafe.Sizeof(descriptor{})
   704  
// struct sockaddr_xdp {
// 	__u16 sxdp_family;
// 	__u16 sxdp_flags;
// 	__u32 sxdp_ifindex;
// 	__u32 sxdp_queue_id;
// 	__u32 sxdp_shared_umem_fd;
// };

// xdpSockAddr mirrors struct sockaddr_xdp, the address used to bind an AF_XDP socket.
type xdpSockAddr struct {
	sxdpFamily       uint16
	sxdpFlags        uint16
	sxdpIfIndex      uint32
	sxdpQueueID      uint32
	sxdpSharedUmemFD uint32
}
   719  
// XSKSettings holds the configuration options for a XSKSocket.
type XSKSettings struct {
	// Size of the umem frames/packet buffers (2048 or 4096)
	FrameSize int
	// Amount of frames/packets which can be used, must be a power of 2
	FrameCount int
	// The index of the network device on which XSK will be used
	NetDevIfIndex int
	// The id of the Queue on which this XSK will be used
	QueueID int
	// How much unused space should be left at the start of each buffer.
	// This can be used to for example encapsulate a packet without having to move or copy memory
	Headroom int
	// Is Tx disabled for this socket?
	DisableTx bool
	// Is Rx disabled for this socket?
	DisableRx bool
	// If true, XDP_USE_NEED_WAKEUP is not used. Should be on by default
	// unless there is a reason it doesn't work (like on older kernels)
	DisableNeedWakeup bool
	// If true, zero copy mode is forced. By default zero copy mode is attempted and if not available
	// in the driver will automatically fallback to copy mode.
	ForceZeroCopy bool
	// If true, copy mode is always used and zero copy mode never attempted.
	ForceCopy bool
	// The minimum time between two checks of the completion queue. A lower value allows for more transmitted
	// packets per second at the cost of higher CPU usage, even when not transmitting.
	// By default this value is 10ms which seems a sane value, it means that there is a theoretical max TX rate of
	// (1000/10) * (tx ring size) which is 100 * 2048 = 204,800 packets per second when DisableRx = false
	// or 100 * 4096 = 409,600 when DisableRx = true at the default FrameCount of 4096.
	// Setting this setting to 0 will cause one goroutine to busy poll(use 100% CPU) per socket.
	CQConsumeInterval *time.Duration
}
   752  
// Same defaults as libbpf https://elixir.bootlin.com/linux/latest/source/tools/lib/bpf/xsk.h#L192
// Used when XSKSettings.FrameCount / XSKSettings.FrameSize are left at 0.
const (
	defaultFrameCount = 4096
	defaultFrameSize  = 4096
)
   758  
// Compile-time assertions that XSKSocket implements the frame interfaces and io.Closer.
var (
	_ FrameReader = (*XSKSocket)(nil)
	_ FrameWriter = (*XSKSocket)(nil)
	_ FrameLeaser = (*XSKSocket)(nil)
	_ io.Closer   = (*XSKSocket)(nil)
)
   765  
// A XSKSocket can bind to one queue on one netdev
type XSKSocket struct {
	// file descriptor of the AF_XDP socket
	fd int

	// memory region where frames are exchanged with kernel
	umem     []byte
	settings XSKSettings

	// Buffered channel containing addresses of frames which can be used
	// for transmission
	txAddrs chan uint64
	// ticker driving periodic consumption of the completion ring
	// (interval configured via XSKSettings.CQConsumeInterval)
	completionTicker *time.Ticker

	// rmu serializes readers, wmu serializes writers, fmu guards the fill ring
	rmu sync.Mutex
	wmu sync.Mutex
	fmu sync.Mutex

	// rx/tx carry packet descriptors; fill/completion carry frame addresses
	rx         xskDescRing
	tx         xskDescRing
	fill       xskAddrRing
	completion xskAddrRing

	// poll timeouts in milliseconds (0 = never block, -1 = block forever)
	readTimeout  int
	writeTimeout int
}
   791  
   792  func NewXSKSocket(settings XSKSettings) (_ *XSKSocket, err error) {
   793  	if !kernelsupport.CurrentFeatures.Map.Has(kernelsupport.KFeatMapAFXDP) {
   794  		return nil, fmt.Errorf("XSK/AF_XDP is not supported by the current kernel version")
   795  	}
   796  
   797  	if settings.FrameCount == 0 {
   798  		settings.FrameCount = defaultFrameCount
   799  	}
   800  
   801  	if settings.FrameSize == 0 {
   802  		settings.FrameSize = defaultFrameSize
   803  	}
   804  
   805  	if !isPowerOfTwo(settings.FrameCount) {
   806  		return nil, fmt.Errorf("frame count must be a power of 2")
   807  	}
   808  
   809  	if settings.FrameSize != 2048 && settings.FrameSize != 4096 {
   810  		// TODO allow frame sizes which are not aligned to 2k but enable
   811  		// XDP_UMEM_UNALIGNED_CHUNK_FLAG when this happens
   812  		return nil, fmt.Errorf("frame size must be 2048 or 4096")
   813  	}
   814  
   815  	if settings.DisableTx && settings.DisableRx {
   816  		return nil, fmt.Errorf("tx and rx can't both be disabled")
   817  	}
   818  
   819  	if settings.ForceCopy && settings.ForceZeroCopy {
   820  		return nil, fmt.Errorf("can't force both zero-copy and copy mode")
   821  	}
   822  
   823  	umemSize := settings.FrameSize * settings.FrameCount
   824  	xskSock := &XSKSocket{
   825  		umem:     make([]byte, umemSize),
   826  		settings: settings,
   827  	}
   828  
   829  	xskSock.fd, err = syscall.Socket(unix.AF_XDP, syscall.SOCK_RAW, 0)
   830  	if err != nil {
   831  		return nil, fmt.Errorf("syscall socket: %w", err)
   832  	}
   833  	// If we return with an error, close the socket so we don't leak resources
   834  	defer func() {
   835  		if err != nil {
   836  			xskSock.Close()
   837  		}
   838  	}()
   839  
   840  	reg := umemReg{
   841  		addr:      uint64(uintptr(unsafe.Pointer(&xskSock.umem[0]))),
   842  		len:       uint64(len(xskSock.umem)),
   843  		chunkSize: uint32(settings.FrameSize),
   844  		headroom:  uint32(settings.Headroom),
   845  		// TODO flags
   846  	}
   847  	// Register the umem
   848  	err = bpfSyscall.Setsockopt(
   849  		xskSock.fd,
   850  		unix.SOL_XDP,
   851  		unix.XDP_UMEM_REG,
   852  		unsafe.Pointer(&reg),
   853  		unsafe.Sizeof(reg),
   854  	)
   855  	if err != nil {
   856  		return nil, fmt.Errorf("set sockopt UMEM_REG: %w", err)
   857  	}
   858  
   859  	// Assume both are enabled
   860  	rxCount := settings.FrameCount / 2
   861  	txCount := rxCount
   862  
   863  	// If tx is disabled
   864  	if settings.DisableTx {
   865  		txCount = 0
   866  		rxCount = settings.FrameCount
   867  	} else if settings.DisableRx {
   868  		txCount = settings.FrameCount
   869  		rxCount = 0
   870  	}
   871  
   872  	// Tell the kernel how large the fill ring should be
   873  	err = bpfSyscall.Setsockopt(
   874  		xskSock.fd,
   875  		unix.SOL_XDP,
   876  		unix.XDP_UMEM_FILL_RING,
   877  		unsafe.Pointer(&rxCount),
   878  		unsafe.Sizeof(rxCount),
   879  	)
   880  	if err != nil {
   881  		return nil, fmt.Errorf("set sockopt XDP_UMEM_FILL_RING: %w", err)
   882  	}
   883  
   884  	// Tell the kernel how large the completion ring should be
   885  	err = bpfSyscall.Setsockopt(
   886  		xskSock.fd,
   887  		unix.SOL_XDP,
   888  		unix.XDP_UMEM_COMPLETION_RING,
   889  		unsafe.Pointer(&txCount),
   890  		unsafe.Sizeof(txCount),
   891  	)
   892  	if err != nil {
   893  		return nil, fmt.Errorf("set sockopt XDP_UMEM_COMPLETION_RING: %w", err)
   894  	}
   895  
   896  	offsets, err := getMMapOffsets(xskSock.fd)
   897  	if err != nil {
   898  		return nil, fmt.Errorf("get mmap offsets: %w", err)
   899  	}
   900  
   901  	mmap, err := syscall.Mmap(
   902  		xskSock.fd,
   903  		unix.XDP_UMEM_PGOFF_FILL_RING,
   904  		int(offsets.fr.desc)+rxCount*int(unsafe.Sizeof(uint64(0))),
   905  		unix.PROT_READ|unix.PROT_WRITE,
   906  		unix.MAP_SHARED|unix.MAP_POPULATE,
   907  	)
   908  	if err != nil {
   909  		return nil, fmt.Errorf("mmap fill ring: %w", err)
   910  	}
   911  	xskSock.fill = xskAddrRing{
   912  		xskRing: newXskRing(mmap, offsets.fr, uint32(rxCount)),
   913  	}
   914  
   915  	mmap, err = syscall.Mmap(
   916  		xskSock.fd,
   917  		unix.XDP_UMEM_PGOFF_COMPLETION_RING,
   918  		int(offsets.cr.desc)+txCount*int(unsafe.Sizeof(uint64(0))),
   919  		unix.PROT_READ|unix.PROT_WRITE,
   920  		unix.MAP_SHARED|unix.MAP_POPULATE,
   921  	)
   922  	if err != nil {
   923  		return nil, fmt.Errorf("mmap completion ring: %w", err)
   924  	}
   925  
   926  	xskSock.completion = xskAddrRing{
   927  		xskRing: newXskRing(mmap, offsets.cr, uint32(txCount)),
   928  	}
   929  
   930  	xskSock.txAddrs = make(chan uint64, txCount+1)
   931  	txOffset := rxCount * settings.FrameSize
   932  	// Fill the txAddrs channel with available addresses to use during transmisstion
   933  	for i := 0; i < txCount; i++ {
   934  		xskSock.txAddrs <- uint64(txOffset + i*settings.FrameSize)
   935  	}
   936  
   937  	// TODO allow for completion worker pooling (having one worker check multiple sockets)
   938  	//  this would allow a user to dedicate 1 or 2 CPU cores to busy polling all sockets of a
   939  	//  particular netdev or even the whole host.
   940  
   941  	interval := 10 * time.Millisecond
   942  	if settings.CQConsumeInterval != nil {
   943  		interval = *settings.CQConsumeInterval
   944  	}
   945  	xskSock.completionTicker = time.NewTicker(interval)
   946  	go xskSock.completionWorker()
   947  
   948  	// Tell the kernel how large the rx ring should be
   949  	err = bpfSyscall.Setsockopt(
   950  		xskSock.fd,
   951  		unix.SOL_XDP,
   952  		unix.XDP_RX_RING,
   953  		unsafe.Pointer(&rxCount),
   954  		unsafe.Sizeof(rxCount),
   955  	)
   956  	if err != nil {
   957  		return nil, fmt.Errorf("set sockopt XDP_RX_RING: %w", err)
   958  	}
   959  
   960  	// Tell the kernel how large the tx ring should be
   961  	err = bpfSyscall.Setsockopt(
   962  		xskSock.fd,
   963  		unix.SOL_XDP,
   964  		unix.XDP_TX_RING,
   965  		unsafe.Pointer(&txCount),
   966  		unsafe.Sizeof(txCount),
   967  	)
   968  	if err != nil {
   969  		return nil, fmt.Errorf("set sockopt XDP_TX_RING: %w", err)
   970  	}
   971  
   972  	mmap, err = syscall.Mmap(
   973  		xskSock.fd,
   974  		unix.XDP_PGOFF_RX_RING,
   975  		int(offsets.rx.desc)+rxCount*int(unsafe.Sizeof(descriptor{})),
   976  		unix.PROT_READ|unix.PROT_WRITE,
   977  		unix.MAP_SHARED|unix.MAP_POPULATE,
   978  	)
   979  	if err != nil {
   980  		return nil, fmt.Errorf("mmap rx ring: %w", err)
   981  	}
   982  	xskSock.rx = xskDescRing{
   983  		xskRing: newXskRing(mmap, offsets.rx, uint32(rxCount)),
   984  	}
   985  
   986  	mmap, err = syscall.Mmap(
   987  		xskSock.fd,
   988  		unix.XDP_PGOFF_TX_RING,
   989  		int(offsets.tx.desc)+txCount*int(unsafe.Sizeof(descriptor{})),
   990  		unix.PROT_READ|unix.PROT_WRITE,
   991  		unix.MAP_SHARED|unix.MAP_POPULATE,
   992  	)
   993  	if err != nil {
   994  		return nil, fmt.Errorf("mmap tx ring: %w", err)
   995  	}
   996  	xskSock.tx = xskDescRing{
   997  		xskRing: newXskRing(mmap, offsets.tx, uint32(txCount)),
   998  	}
   999  
  1000  	var flags uint16
  1001  	if !settings.DisableNeedWakeup {
  1002  		flags |= unix.XDP_USE_NEED_WAKEUP
  1003  	}
  1004  
  1005  	if settings.ForceCopy {
  1006  		flags |= unix.XDP_COPY
  1007  	}
  1008  
  1009  	if settings.ForceZeroCopy {
  1010  		flags |= unix.XDP_ZEROCOPY
  1011  	}
  1012  
  1013  	sockAddr := xdpSockAddr{
  1014  		sxdpFamily:       unix.AF_XDP,
  1015  		sxdpIfIndex:      uint32(settings.NetDevIfIndex),
  1016  		sxdpQueueID:      uint32(settings.QueueID),
  1017  		sxdpSharedUmemFD: uint32(xskSock.fd),
  1018  		sxdpFlags:        flags,
  1019  	}
  1020  	err = bpfSyscall.Bind(xskSock.fd, unsafe.Pointer(&sockAddr), bpfSyscall.Socklen(unsafe.Sizeof(sockAddr)))
  1021  	if err != nil {
  1022  		return nil, fmt.Errorf("bind: %w", err)
  1023  	}
  1024  
  1025  	// Give all Rx frames to the kernel
  1026  	for i := 0; i < rxCount-1; i++ {
  1027  		err = xskSock.fill.Enqueue(uint64(i * settings.FrameSize))
  1028  		if err != nil {
  1029  			return nil, fmt.Errorf("fill enqueue: %w", err)
  1030  		}
  1031  	}
  1032  	err = xskSock.wakeupFill()
  1033  	if err != nil {
  1034  		return nil, fmt.Errorf("wakeupFill: %w", err)
  1035  	}
  1036  	// NOTE Tx frames are enqueued after they have been filled as a signal to transmit them
  1037  
  1038  	return xskSock, nil
  1039  }
  1040  
// Fd returns the file descriptor of the underlying AF_XDP socket.
// The descriptor can be used for raw syscalls or polling by the caller.
func (xs *XSKSocket) Fd() int {
	return xs.fd
}
  1045  
  1046  // SetWriteTimeout sets the timeout for Write and XSKLease.WriteBack calls.
  1047  // If ms == 0 (default), we will never block/wait and error if we can't write at once.
  1048  // If ms == -1, we will block forever until we can write.
  1049  // If ms > 0, we will wait for x miliseconds for an oppurunity to write or error afterwards.
  1050  func (xs *XSKSocket) SetWriteTimeout(ms int) error {
  1051  	if ms < -1 {
  1052  		return fmt.Errorf("timeout must be -1, 0, or positive amount of miliseconds")
  1053  	}
  1054  
  1055  	xs.writeTimeout = ms
  1056  
  1057  	return nil
  1058  }
  1059  
  1060  // SetReadTimeout sets the timeout for Read and ReadLease calls.
  1061  // If ms == 0 (default), we will never block/wait and return no data if there isn't any ready.
  1062  // If ms == -1, we will block forever until we can read.
  1063  // If ms > 0, we will wait for x miliseconds for an oppurunity to read or return no data.
  1064  func (xs *XSKSocket) SetReadTimeout(ms int) error {
  1065  	if ms < -1 {
  1066  		return fmt.Errorf("timeout must be -1, 0, or positive amount of miliseconds")
  1067  	}
  1068  
  1069  	xs.readTimeout = ms
  1070  
  1071  	return nil
  1072  }
  1073  
  1074  // If the need wakeup flag is set on the ring the kernel requests that we
  1075  // wakeup the fill ring with a poll syscall
  1076  // https://patchwork.ozlabs.org/project/netdev/patch/1560411450-29121-3-git-send-email-magnus.karlsson@intel.com/
  1077  func (xs *XSKSocket) wakeupFill() error {
  1078  	if *(*uint32)(xs.fill.flags)&unix.XDP_RING_NEED_WAKEUP == 1 {
  1079  		_, err := unix.Poll([]unix.PollFd{{Fd: int32(xs.fd), Events: unix.POLLOUT}}, 0)
  1080  		if err != nil {
  1081  			return fmt.Errorf("poll fill: %w", err)
  1082  		}
  1083  	}
  1084  
  1085  	return nil
  1086  }
  1087  
  1088  // If the need wakeup flag is set on the ring the kernel requests that we
  1089  // wakeup the fill ring with a poll syscall
  1090  // https://patchwork.ozlabs.org/project/netdev/patch/1560411450-29121-3-git-send-email-magnus.karlsson@intel.com/
  1091  func (xs *XSKSocket) wakeupTx() error {
  1092  	if *(*uint32)(xs.tx.flags)&unix.XDP_RING_NEED_WAKEUP == 1 {
  1093  		err := bpfSyscall.Sendto(
  1094  			xs.fd,
  1095  			nil,
  1096  			syscall.MSG_DONTWAIT,
  1097  			unsafe.Pointer(&bpfSyscall.Zero),
  1098  			bpfSyscall.Socklen(0),
  1099  		)
  1100  		if err != nil {
  1101  			if sysErr, ok := err.(*bpfSyscall.Error); ok {
  1102  				switch sysErr.Errno {
  1103  				// These errors occur regulairly when load is high, ignore these errors, the next time
  1104  				// wakeupTx is called it will trigger the kernel to read the full ring anyway.
  1105  				// https://github.com/torvalds/linux/blob/b741596468b010af2846b75f5e75a842ce344a6e/samples/bpf/xdpsock_user.c#L1095
  1106  				//nolint:lll
  1107  				case syscall.EBUSY,
  1108  					syscall.EAGAIN,
  1109  					syscall.ENOBUFS,
  1110  					syscall.ENETDOWN:
  1111  					return nil
  1112  				}
  1113  			}
  1114  
  1115  			return fmt.Errorf("syscall sendto: %w", err)
  1116  		}
  1117  	}
  1118  
  1119  	return nil
  1120  }
  1121  
  1122  func (xs *XSKSocket) dequeueRx() (*descriptor, error) {
  1123  	desc := xs.rx.Dequeue()
  1124  	// there is nothing to dequeue
  1125  	if desc == nil {
  1126  		// Return at once if blocking is disabled
  1127  		if xs.readTimeout == 0 {
  1128  			return nil, nil
  1129  		}
  1130  
  1131  		n, err := unix.Poll([]unix.PollFd{{Fd: int32(xs.fd), Events: unix.POLLIN}}, xs.readTimeout)
  1132  		if err != nil {
  1133  			// Sometimes a poll is interrupted by a signal, no a real error
  1134  			// lets treat it like a timeout
  1135  			if err == syscall.EINTR {
  1136  				return nil, nil
  1137  			}
  1138  
  1139  			return nil, fmt.Errorf("poll: %w", err)
  1140  		}
  1141  
  1142  		// If n == 0, the timeout was reached
  1143  		if n == 0 {
  1144  			return nil, nil
  1145  		}
  1146  
  1147  		desc = xs.rx.Dequeue()
  1148  		if desc == nil {
  1149  			return desc, fmt.Errorf("no desc after poll")
  1150  		}
  1151  	}
  1152  
  1153  	return desc, nil
  1154  }
  1155  
  1156  // ReadFrame implements FrameReader, however we have to implement this with a memory copy which is not ideal
  1157  // for efficiency. For zero copy packet access ReadLease should be used.
  1158  func (xs *XSKSocket) ReadFrame(p []byte) (n int, err error) {
  1159  	xs.rmu.Lock()
  1160  	defer xs.rmu.Unlock()
  1161  
  1162  	desc, err := xs.dequeueRx()
  1163  	if err != nil {
  1164  		return 0, fmt.Errorf("dequeue rx: %w", err)
  1165  	}
  1166  	if desc == nil {
  1167  		return 0, nil
  1168  	}
  1169  
  1170  	// unlike the ReadLease function, we ignore headspace since any benefit is lost
  1171  	// during the copy.
  1172  	len := copy(p, xs.umem[desc.addr:desc.addr+uint64(desc.len)])
  1173  
  1174  	err = xs.fill.Enqueue(addrToFrameStart(desc.addr, xs.settings.FrameSize))
  1175  	if err != nil {
  1176  		return len, fmt.Errorf("fill enqueue: %w", err)
  1177  	}
  1178  
  1179  	err = xs.wakeupFill()
  1180  	if err != nil {
  1181  		return len, err
  1182  	}
  1183  
  1184  	return len, nil
  1185  }
  1186  
  1187  // WriteLease creates a XSKLease which points to a piece of preallocated memory. This memory can be used to
  1188  // build packets for writing. Unlike XSKLeases gotten from ReadLease, write leases have no Headroom.
  1189  // The Data slice of the lease is the full length of the usable frame, this length should not be exceeded.
  1190  // Any memory held by the lease can't be reused until released or written.
  1191  //
  1192  // This function blocks until a frame for transmission is available and is not subject to the write timeout.
  1193  func (xs *XSKSocket) WriteLease() (lease *XSKLease, err error) {
  1194  	addr := <-xs.txAddrs
  1195  	return &XSKLease{
  1196  		Headroom: 0,
  1197  		Data:     xs.umem[addr : addr+uint64(xs.settings.FrameSize)],
  1198  		dataAddr: addr,
  1199  		sock:     xs,
  1200  		fromTx:   true,
  1201  	}, nil
  1202  }
  1203  
  1204  // ReadLease reads a frame from the socket and returns its memory in a XSKLease. After reading the contents of the
  1205  // frame it can be released or written, both will allow the memory to be reused. Calling Write on the lease will
  1206  // cause the contents of Data to be written back to the network interface. The contents of Data can be modified
  1207  // before calling Write thus allowing a program to implement zero-copy/zero-allocation encaptulation or
  1208  // request/response protocols.
  1209  func (xs *XSKSocket) ReadLease() (lease *XSKLease, err error) {
  1210  	xs.rmu.Lock()
  1211  	defer xs.rmu.Unlock()
  1212  
  1213  	desc, err := xs.dequeueRx()
  1214  	if err != nil {
  1215  		return nil, fmt.Errorf("dequeue rx: %w", err)
  1216  	}
  1217  	if desc == nil {
  1218  		return nil, nil
  1219  	}
  1220  
  1221  	return &XSKLease{
  1222  		Headroom: xs.settings.Headroom,
  1223  		Data:     xs.umem[desc.addr-uint64(xs.settings.Headroom) : desc.addr+uint64(desc.len)],
  1224  		dataAddr: desc.addr,
  1225  		sock:     xs,
  1226  	}, nil
  1227  }
  1228  
  1229  func (xs *XSKSocket) enqueueTx(desc descriptor) error {
  1230  	err := xs.tx.Enqueue(desc)
  1231  	if err != nil {
  1232  		if err != errBufferFull {
  1233  			// Put the frame address back in the chan so we don't lose it
  1234  			xs.txAddrs <- desc.addr
  1235  
  1236  			return fmt.Errorf("tx enqueue: %w", err)
  1237  		}
  1238  
  1239  		_, err := unix.Poll([]unix.PollFd{{Fd: int32(xs.fd), Events: unix.POLLOUT}}, xs.writeTimeout)
  1240  		if err != nil {
  1241  			return fmt.Errorf("poll: %w", err)
  1242  		}
  1243  
  1244  		err = xs.tx.Enqueue(desc)
  1245  		if err != nil {
  1246  			// Put the frame address back in the chan so we don't lose it
  1247  			xs.txAddrs <- desc.addr
  1248  
  1249  			return fmt.Errorf("tx enqueue: %w", err)
  1250  		}
  1251  	}
  1252  
  1253  	return nil
  1254  }
  1255  
  1256  // WriteFrame implements FrameWriter. The interface requires us to copy p into umem which is not
  1257  // optimal for speed. For maximum performance use WriteLease instead.
  1258  func (xs *XSKSocket) WriteFrame(p []byte) (n int, err error) {
  1259  	xs.wmu.Lock()
  1260  	defer xs.wmu.Unlock()
  1261  
  1262  	if len(p) > xs.settings.FrameSize {
  1263  		return 0, fmt.Errorf("data is larget than frame size of %d", xs.settings.FrameSize)
  1264  	}
  1265  
  1266  	// We assume we will never be blocking here for long
  1267  	addr := <-xs.txAddrs
  1268  
  1269  	len := copy(xs.umem[addr:addr+uint64(len(p))], p)
  1270  
  1271  	err = xs.enqueueTx(descriptor{
  1272  		addr: addr,
  1273  		len:  uint32(len),
  1274  	})
  1275  	if err != nil {
  1276  		xs.txAddrs <- addr
  1277  		return 0, err
  1278  	}
  1279  
  1280  	err = xs.wakeupTx()
  1281  	if err != nil {
  1282  		return 0, err
  1283  	}
  1284  
  1285  	return len, nil
  1286  }
  1287  
// Close releases the resources held by the socket: the rx, tx, fill and
// completion rings, the completion-worker ticker, and the socket file
// descriptor itself. The first error encountered aborts the teardown and is
// returned, so a failed Close may leave some resources open.
func (xs *XSKSocket) Close() error {
	err := xs.rx.Close()
	if err != nil {
		return fmt.Errorf("rx close: %w", err)
	}

	err = xs.tx.Close()
	if err != nil {
		return fmt.Errorf("tx close: %w", err)
	}

	err = xs.fill.Close()
	if err != nil {
		return fmt.Errorf("fill close: %w", err)
	}

	// Stop the periodic wakeups of the completion worker. The worker itself
	// loops on xs.completion.mmap != nil (see completionWorker), so it is
	// presumably unblocked for exit by completion.Close below — verify.
	if xs.completionTicker != nil {
		xs.completionTicker.Stop()
	}

	err = xs.completion.Close()
	if err != nil {
		return fmt.Errorf("completion close: %w", err)
	}

	// fd == 0 acts as the "already closed" sentinel, making Close safe to call
	// more than once. NOTE(review): fd 0 is technically a valid descriptor
	// (stdin); confirm a freshly created socket can never be assigned it.
	if xs.fd != 0 {
		err = syscall.Close(xs.fd)
		if err != nil {
			return fmt.Errorf("socket close: %w", err)
		}

		xs.fd = 0
	}

	return nil
}
  1324  
// completionWorker is started when a socket is created and is responsible for dequeueing the completion ring
// and transferring the free addresses to the txAddrs chan so they can be re-used.
// It runs as its own goroutine and exits once the completion ring is unmapped.
func (xs *XSKSocket) completionWorker() {
	// As long as the completion ring still is mapped.
	// NOTE(review): this read of xs.completion.mmap is not synchronized with
	// Close; confirm that this race is acceptable here.
	for xs.completion.mmap != nil {
		// Every tick of the completion ticker, dequeue the whole completion queue
		// and put the frame addresses on the txAddrs list
		for xs.completion.mmap != nil {
			addr := xs.completion.Dequeue()
			if addr == nil {
				break
			}

			// Completed addresses may point into the middle of a frame;
			// normalize to the frame start before recycling.
			xs.txAddrs <- addrToFrameStart(*addr, xs.settings.FrameSize)
		}

		// TODO auto adjust completion ticker (slow down using idle time and speed up during high tx rate)

		<-xs.completionTicker.C
	}
}
  1346  
  1347  func getMMapOffsets(fd int) (offsets mmapOffsets, err error) {
  1348  	if kernelsupport.CurrentFeatures.Misc.Has(kernelsupport.KFeatMiscXSKRingFlags) {
  1349  		len := bpfSyscall.Socklen(unsafe.Sizeof(offsets))
  1350  		err = bpfSyscall.Getsockopt(
  1351  			fd,
  1352  			unix.SOL_XDP,
  1353  			unix.XDP_MMAP_OFFSETS,
  1354  			unsafe.Pointer(&offsets),
  1355  			&len,
  1356  		)
  1357  		if err != nil {
  1358  			return offsets, fmt.Errorf("get sockopt XDP_MMAP_OFFSETS: %w", err)
  1359  		}
  1360  	} else {
  1361  		nfOff, err := getMMapOffsetsNoFlags(fd)
  1362  		if err != nil {
  1363  			return offsets, fmt.Errorf("no flag offsets: %w", err)
  1364  		}
  1365  		offsets.rx = ringOffset{
  1366  			consumer: nfOff[0].consumer,
  1367  			producer: nfOff[0].producer,
  1368  			desc:     nfOff[0].desc,
  1369  		}
  1370  		offsets.tx = ringOffset{
  1371  			consumer: nfOff[1].consumer,
  1372  			producer: nfOff[1].producer,
  1373  			desc:     nfOff[1].desc,
  1374  		}
  1375  		offsets.cr = ringOffset{
  1376  			consumer: nfOff[2].consumer,
  1377  			producer: nfOff[2].producer,
  1378  			desc:     nfOff[2].desc,
  1379  		}
  1380  		offsets.fr = ringOffset{
  1381  			consumer: nfOff[3].consumer,
  1382  			producer: nfOff[3].producer,
  1383  			desc:     nfOff[3].desc,
  1384  		}
  1385  	}
  1386  
  1387  	return offsets, nil
  1388  }
  1389  
  1390  func getMMapOffsetsNoFlags(fd int) (offsets [4]ringOffsetNoFlags, err error) {
  1391  	len := bpfSyscall.Socklen(unsafe.Sizeof(offsets))
  1392  	err = bpfSyscall.Getsockopt(
  1393  		fd,
  1394  		unix.SOL_XDP,
  1395  		unix.XDP_MMAP_OFFSETS,
  1396  		unsafe.Pointer(&offsets),
  1397  		&len,
  1398  	)
  1399  	if err != nil {
  1400  		return offsets, fmt.Errorf("get sockopt XDP_MMAP_OFFSETS: %w", err)
  1401  	}
  1402  
  1403  	return offsets, nil
  1404  }
  1405  
  1406  func isPowerOfTwo(x int) bool {
  1407  	return (x != 0) && ((x & (x - 1)) == 0)
  1408  }
  1409  
  1410  // GetNetDevQueueCount uses the /sys/class/net/<dev>/queues/ directory to figure out how many queues a network
  1411  // device has. Knowing the number of queues is critical when binding XSK sockets to a network device.
  1412  func GetNetDevQueueCount(netdev string) (int, error) {
  1413  	if strings.ContainsAny(netdev, "/") {
  1414  		return 0, fmt.Errorf("network device name should not contain slashes")
  1415  	}
  1416  
  1417  	entries, err := os.ReadDir(fmt.Sprintf("/sys/class/net/%s/queues", netdev))
  1418  	if err != nil {
  1419  		return 0, fmt.Errorf("os.Lstat: %w", err)
  1420  	}
  1421  
  1422  	// Just count the RX queues, we can assume there are as much TX queues as RX queues
  1423  	count := 0
  1424  	for _, entry := range entries {
  1425  		if strings.HasPrefix(entry.Name(), "rx-") {
  1426  			count++
  1427  		}
  1428  	}
  1429  
  1430  	return count, nil
  1431  }