package gobpfld

import (
	"errors"
	"fmt"
	"io"
	"os"
	"strings"
	"sync"
	"syscall"
	"time"
	"unsafe"

	bpfSyscall "github.com/dylandreimerink/gobpfld/internal/syscall"
	"github.com/dylandreimerink/gobpfld/kernelsupport"
	"golang.org/x/sys/unix"
)

// A FrameReader can read whole or partial ethernet frames. Every time ReadFrame is called, p will be filled with up
// to len(p) bytes from a single frame. These bytes include both the header and body of the ethernet frame.
// If p is too small to fit the whole frame, the remaining bytes of the frame are discarded. The next call to
// ReadFrame will start at the next frame.
//
// n will be set to the number of bytes read from the frame. err is non-nil if any error has occurred during the
// process. If both n is 0 and err is nil nothing was read for an expected reason like a timeout or external
// interrupt.
type FrameReader interface {
	ReadFrame(p []byte) (n int, err error)
}

// A FrameWriter can write whole ethernet frames. WriteFrame transmits the frame contained in p; n is set to the
// number of bytes written and err is non-nil if any error occurred during the process.
type FrameWriter interface {
	WriteFrame(p []byte) (n int, err error)
}

// A FrameLeaser hands out XSKLeases for reading and writing, giving callers zero-copy access to frame memory.
type FrameLeaser interface {
	ReadLease() (*XSKLease, error)
	WriteLease() (*XSKLease, error)
}

// Compile-time checks that XSKMultiSocket implements the frame interfaces and io.Closer.
var (
	_ FrameReader = (*XSKMultiSocket)(nil)
	_ FrameWriter = (*XSKMultiSocket)(nil)
	_ FrameLeaser = (*XSKMultiSocket)(nil)
	_ io.Closer   = (*XSKMultiSocket)(nil)
)

// XSKMultiSocket is a collection of XSKSockets. The multi socket balances reads and writes between all XSKSockets.
// This is useful for multi queue netdevices since a XSKSocket can only read or write from one rx/tx queue pair
// at a time. A multi queue allows you to bundle all of these sockets so you get a socket for the whole netdevice.
//
// An alternative use for the multi socket is to add sockets from multiple netdevices.
//
// TODO look into using epoll for multi sockets.
Using poll for single sockets still makes sense since there is always 53 // 1 fd, but for multi sockets we can have much more. For high-end NICs with ~40 rx/tx queues(mallanox for example) 54 // it makes sense to start using epoll since it is supposed to scale better. Should make it configurable when adding 55 // support in case freeBSD or other unix-like os adds XSK support since epoll is non-POSIX 56 // 57 // TODO dynamic socket adding/removing. Should not be to hard, the main edge case to solve is dealing with 58 // pending/blocking syscalls for read/write. But presumably epoll can allow us to dynamically add/remove 59 // fds without interrupting the reads/writes. Otherwise adding/removing sockets will have to request both the 60 // rmu and wmu. 61 type XSKMultiSocket struct { 62 sockets []*XSKSocket 63 64 rmu sync.Mutex 65 wmu sync.Mutex 66 67 readIter int 68 writeIter int 69 70 readTimeout int 71 writeTimeout int 72 } 73 74 func NewXSKMultiSocket(xskSockets ...*XSKSocket) (*XSKMultiSocket, error) { 75 if len(xskSockets) == 0 { 76 return nil, fmt.Errorf("need at least one socket") 77 } 78 79 for _, sock := range xskSockets { 80 if sock == nil { 81 return nil, fmt.Errorf("socket value can't be nil") 82 } 83 } 84 85 return &XSKMultiSocket{ 86 sockets: xskSockets, 87 }, nil 88 } 89 90 // SetWriteTimeout sets the timeout for Write and XSKLease.WriteBack calls. 91 // If ms == 0 (default), we will never block/wait and error if we can't write at once. 92 // If ms == -1, we will block forever until we can write. 93 // If ms > 0, we will wait for x miliseconds for an oppurunity to write or error afterwards. 94 func (xms *XSKMultiSocket) SetWriteTimeout(ms int) error { 95 if ms < -1 { 96 return fmt.Errorf("timeout must be -1, 0, or positive amount of miliseconds") 97 } 98 99 xms.writeTimeout = ms 100 101 return nil 102 } 103 104 // SetReadTimeout sets the timeout for Read and ReadLease calls. 
105 // If ms == 0 (default), we will never block/wait and return no data if there isn't any ready. 106 // If ms == -1, we will block forever until we can read. 107 // If ms > 0, we will wait for x miliseconds for an oppurunity to read or return no data. 108 func (xms *XSKMultiSocket) SetReadTimeout(ms int) error { 109 if ms < -1 { 110 return fmt.Errorf("timeout must be -1, 0, or positive amount of miliseconds") 111 } 112 113 xms.readTimeout = ms 114 115 return nil 116 } 117 118 func (xms *XSKMultiSocket) ReadFrame(p []byte) (n int, err error) { 119 xms.rmu.Lock() 120 defer xms.rmu.Unlock() 121 122 var ( 123 desc *descriptor 124 sock *XSKSocket 125 ) 126 pollFds := make([]unix.PollFd, len(xms.sockets)) 127 128 // Save the current iter value, we need it during poll resolving 129 curItr := xms.readIter 130 131 // Check every socket in case there is a frame ready 132 for i := 0; i < len(xms.sockets); i++ { 133 // Use readIter which keeps it value between calls to Read this ensures that we attempt to read from 134 // all sockets equally 135 sock = xms.sockets[xms.readIter] 136 desc = sock.rx.Dequeue() 137 138 xms.readIter++ 139 if xms.readIter >= len(xms.sockets) { 140 xms.readIter = 0 141 } 142 143 if desc != nil { 144 break 145 } 146 147 pollFds[i] = unix.PollFd{ 148 Fd: int32(xms.sockets[xms.readIter].fd), 149 Events: unix.POLLIN, 150 } 151 } 152 153 // If none of the sockets have frames ready at the moment, we poll to wait 154 if desc == nil { 155 n, err := unix.Poll(pollFds, xms.readTimeout) 156 if err != nil { 157 // Sometimes a poll is interrupted by a signal, no a real error 158 // lets treat it like a timeout 159 if err == syscall.EINTR { 160 return 0, nil 161 } 162 163 return 0, fmt.Errorf("poll: %w", err) 164 } 165 166 // If n == 0, the timeout was reached 167 if n == 0 { 168 return 0, nil 169 } 170 171 // now that there is at least one socket with a frame, we need to find it 172 for i := 0; i < len(xms.sockets); i++ { 173 if pollFds[i].Revents&unix.POLLIN > 
0 { 174 sock = xms.sockets[curItr] 175 desc = sock.rx.Dequeue() 176 if desc != nil { 177 break 178 } 179 } 180 181 curItr++ 182 if curItr >= len(xms.sockets) { 183 curItr = 0 184 } 185 } 186 187 // If poll returned n>0 but dequeueing still failed 188 if desc == nil { 189 return 0, nil 190 } 191 } 192 193 len := copy(p, sock.umem[desc.addr:desc.addr+uint64(desc.len)]) 194 err = sock.fill.Enqueue(addrToFrameStart(desc.addr, sock.settings.FrameSize)) 195 if err != nil { 196 return len, fmt.Errorf("fill enqueue: %w", err) 197 } 198 199 err = sock.wakeupFill() 200 if err != nil { 201 return len, err 202 } 203 204 return len, nil 205 } 206 207 func (xms *XSKMultiSocket) WriteFrame(p []byte) (n int, err error) { 208 xms.wmu.Lock() 209 defer xms.wmu.Unlock() 210 211 pollFds := make([]unix.PollFd, len(xms.sockets)) 212 213 // Check every socket in case there is a frame ready 214 for i := 0; i < len(xms.sockets); i++ { 215 pollFds[i] = unix.PollFd{ 216 Fd: int32(xms.sockets[xms.writeIter].fd), 217 Events: unix.POLLOUT, 218 } 219 220 xms.writeIter++ 221 if xms.writeIter >= len(xms.sockets) { 222 xms.writeIter = 0 223 } 224 } 225 226 n, err = unix.Poll(pollFds, xms.writeTimeout) 227 if err != nil { 228 return 0, fmt.Errorf("poll: %w", err) 229 } 230 231 if n == 0 { 232 return 0, fmt.Errorf("timeout") 233 } 234 235 var ( 236 addr uint64 237 sock *XSKSocket 238 ) 239 for i := 0; i < len(xms.sockets); i++ { 240 sock = xms.sockets[xms.writeIter] 241 242 xms.writeIter++ 243 if xms.writeIter >= len(xms.sockets) { 244 xms.writeIter = 0 245 } 246 247 if pollFds[i].Revents&unix.POLLOUT > 0 { 248 addr = <-sock.txAddrs 249 break 250 } 251 } 252 253 len := copy(sock.umem[addr:addr+uint64(len(p))], p) 254 255 err = sock.enqueueTx(descriptor{ 256 addr: addr, 257 len: uint32(len), 258 }) 259 if err != nil { 260 sock.txAddrs <- addr 261 return 0, err 262 } 263 264 err = sock.wakeupTx() 265 if err != nil { 266 return 0, err 267 } 268 269 return len, nil 270 } 271 272 func (xms *XSKMultiSocket) 
Close() error { 273 for _, sock := range xms.sockets { 274 err := sock.Close() 275 if err != nil { 276 return err 277 } 278 } 279 280 return nil 281 } 282 283 // WriteLease creates a XSKLease which points to a piece of preallocated memory. This memory can be used to 284 // build packets for writing. Unlike XSKLeases gotten from ReadLease, write leases have no Headroom. 285 // The Data slice of the lease is the full length of the usable frame, this length should not be exceeded. 286 // Any memory held by the lease can't be reused until released or written. 287 // 288 // This function blocks until a frame for transmission is available and is not subject to the write timeout. 289 func (xms *XSKMultiSocket) WriteLease() (lease *XSKLease, err error) { 290 sock := xms.sockets[xms.writeIter] 291 xms.writeIter++ 292 if xms.writeIter >= len(xms.sockets) { 293 xms.writeIter = 0 294 } 295 296 addr := <-sock.txAddrs 297 return &XSKLease{ 298 Headroom: 0, 299 Data: sock.umem[addr : addr+uint64(sock.settings.FrameSize)], 300 dataAddr: addr, 301 sock: sock, 302 fromTx: true, 303 }, nil 304 } 305 306 // ReadLease reads a frame from the socket and returns its memory in a XSKLease. After reading the contents of the 307 // frame it can be released or written, both will allow the memory to be reused. Calling Write on the lease will 308 // cause the contents of Data to be written back to the network interface. The contents of Data can be modified 309 // before calling Write thus allowing a program to implement zero-copy/zero-allocation encaptulation or 310 // request/response protocols. 
311 func (xms *XSKMultiSocket) ReadLease() (lease *XSKLease, err error) { 312 xms.rmu.Lock() 313 defer xms.rmu.Unlock() 314 315 var ( 316 desc *descriptor 317 sock *XSKSocket 318 ) 319 pollFds := make([]unix.PollFd, len(xms.sockets)) 320 321 // Save the current iter value, we need it during poll resolving 322 curItr := xms.readIter 323 324 // Check every socket in case there is a frame ready 325 for i := 0; i < len(xms.sockets); i++ { 326 // Use readIter which keeps it value between calls to Read this ensures that we attempt to read from 327 // all sockets equally 328 sock = xms.sockets[xms.readIter] 329 desc = sock.rx.Dequeue() 330 331 xms.readIter++ 332 if xms.readIter >= len(xms.sockets) { 333 xms.readIter = 0 334 } 335 336 if desc != nil { 337 break 338 } 339 340 pollFds[i] = unix.PollFd{ 341 Fd: int32(xms.sockets[xms.readIter].fd), 342 Events: unix.POLLIN, 343 } 344 } 345 346 // If none of the sockets have frames ready at the moment, we poll to wait 347 if desc == nil { 348 n, err := unix.Poll(pollFds, xms.readTimeout) 349 if err != nil { 350 // Sometimes a poll is interrupted by a signal, no a real error 351 // lets treat it like a timeout 352 if err == syscall.EINTR { 353 return nil, nil 354 } 355 356 return nil, fmt.Errorf("poll: %w", err) 357 } 358 359 // If n == 0, the timeout was reached 360 if n == 0 { 361 return nil, nil 362 } 363 364 // now that there is at least one socket with a frame, we need to find it 365 for i := 0; i < len(xms.sockets); i++ { 366 if pollFds[i].Revents&unix.POLLIN > 0 { 367 sock = xms.sockets[curItr] 368 desc = sock.rx.Dequeue() 369 if desc != nil { 370 break 371 } 372 } 373 374 curItr++ 375 if curItr >= len(xms.sockets) { 376 curItr = 0 377 } 378 } 379 380 // If poll returned n>0 but dequeueing still failed 381 if desc == nil { 382 return nil, nil 383 } 384 } 385 386 return &XSKLease{ 387 Headroom: sock.settings.Headroom, 388 Data: sock.umem[desc.addr-uint64(sock.settings.Headroom) : desc.addr+uint64(desc.len)], 389 dataAddr: 
desc.addr, 390 sock: sock, 391 }, nil 392 } 393 394 // XSKLease is used to "lease" a piece of buffer memory from the socket and return it after the user 395 // is done using it. This allows us to implement true zero copy packet access. 396 // After a XSKLease is released or written the underlaying array of Data will be repurposed, to avoid strage bugs 397 // users must use Data or sub-slices of Data after the lease has been released. 398 type XSKLease struct { 399 Data []byte 400 // The amount of bytes which are prefixed at the start which don't contain frame data. 401 // This headroom can be used to add an extra header(encapsulation) without having to 402 // copy or move the existing packet data. 403 Headroom int 404 // dataAddr is the memory address at the start of the headroom. 405 dataAddr uint64 406 sock *XSKSocket 407 // If true the frame address originates from the txAddrs chan 408 fromTx bool 409 } 410 411 // Release releases the leased memory so the kernel can fill it with new data. 412 func (xl *XSKLease) Release() error { 413 // Remove reference to Data since it is invalid from now 414 xl.Data = nil 415 416 frameAddr := addrToFrameStart(xl.dataAddr, xl.sock.settings.FrameSize) 417 418 // If the this is a tx lease, we can just return the unused address to the txAddrs buffer 419 if xl.fromTx { 420 xl.sock.txAddrs <- frameAddr 421 } else { 422 // else, this lease was a rx lease in which case it must be returned to the fill ring 423 424 xl.sock.fmu.Lock() 425 defer xl.sock.fmu.Unlock() 426 427 // Enqueue the address of the frame on the fill queue so it can be reused 428 err := xl.sock.fill.Enqueue(frameAddr) 429 if err != nil { 430 return fmt.Errorf("enqueue fill: %w", err) 431 } 432 433 err = xl.sock.wakeupFill() 434 if err != nil { 435 return err 436 } 437 } 438 439 return nil 440 } 441 442 // Write writes a lease to the network interface. The len property of the 'Data' slice - 'Headroom' is the length of 443 // the packet. 
Make sure to resize the Data to the size of the data to be transmitted. 444 // The headroom should always be included(never resize the start of the slice). The 'Headroom' should be used 445 // to indicate from which byte the headroom starts. 446 // After Write has been called the lease will be released and the Data slice or its subslices should not 447 // be used anymore. 448 func (xl *XSKLease) Write() error { 449 xl.sock.wmu.Lock() 450 defer xl.sock.wmu.Unlock() 451 452 if len(xl.Data) > xl.sock.settings.FrameSize { 453 return fmt.Errorf("lease has been expanded beyond framesize, can't transmit") 454 } 455 456 err := xl.sock.enqueueTx(descriptor{ 457 // When enqueueing, we don't want to send the headroom bytes 458 addr: xl.dataAddr + uint64(xl.Headroom), 459 // Data should contain headroom + packet, since we will not be sending headroom 460 // we need to subtract the amout of headroom from the length of Data to get the correct packet length 461 len: uint32(len(xl.Data) - xl.Headroom), 462 }) 463 if err != nil { 464 return fmt.Errorf("tx enqueue: %w", err) 465 } 466 467 err = xl.sock.wakeupTx() 468 if err != nil { 469 return err 470 } 471 472 // If the lease was from the fill->rx lifecycle 473 if !xl.fromTx { 474 // Since a frame from the fill->rx lifecycle was used to transmit, we will now get a frame from 475 // the tx->completion lifecycle and insert it into the fill ring so we end up with the same 476 // amount of frames available for both cycles. If we don't do this the fill->rx cycle will run 477 // out of frames. 478 // The completion queue is full at rest at max capacity, so first dequeue one frame to make 479 // room for the frame we are about to enqueue in tx, just in case the kernel can transmit 480 // faster than we can dequeue. 
481 addr := <-xl.sock.txAddrs 482 483 err := xl.sock.fill.Enqueue(addr) 484 if err != nil { 485 return fmt.Errorf("fill enqueue: %w", err) 486 } 487 488 err = xl.sock.wakeupFill() 489 if err != nil { 490 return err 491 } 492 } 493 494 // Set data to nil to indicate that it is no longer valid to use 495 xl.Data = nil 496 497 return nil 498 } 499 500 // The addresses we get back from the rx ring have offsets due to headspacing, both user configured 501 // and default headspacing created by the network driver. This function round the address 502 // to the nearest start of a frame in umem when re-enqueueing the frame address 503 // https://www.spinics.net/lists/xdp-newbies/msg01479.html 504 func addrToFrameStart(addr uint64, frameSize int) uint64 { 505 return (addr / uint64(frameSize)) * uint64(frameSize) 506 } 507 508 // xskAddrRing is a ring buffer containing decriptors used for the rx and tx rings 509 type xskDescRing struct { 510 xskRing 511 } 512 513 func (dr *xskDescRing) Dequeue() *descriptor { 514 producer := (*uint32)(dr.producer) 515 consumer := (*uint32)(dr.consumer) 516 517 if (*producer - *consumer) == 0 { 518 return nil 519 } 520 521 // The linux kernel uses the wraparound of an integer to reset the consumer and 522 // producer. 
And since ring buffers are always a factor of 2 we can just throw away 523 // all bits which fall outsize of this size to get a always increasing offset 524 // between 0 and dr.elemCount 525 off := *consumer & (dr.elemCount - 1) 526 desc := (*descriptor)(unsafe.Pointer(uintptr(dr.ring) + uintptr(off)*descSize)) 527 528 *consumer++ 529 530 return desc 531 } 532 533 func (dr *xskDescRing) Enqueue(desc descriptor) error { 534 producer := (*uint32)(dr.producer) 535 consumer := (*uint32)(dr.consumer) 536 537 // If the diff between producer and consumer is larger than the elem count the buffer is full 538 if (*producer - *consumer) == dr.elemCount-1 { 539 return errBufferFull 540 } 541 542 // The linux kernel uses the wraparound of an integer to reset the consumer and 543 // producer. And since ring buffers are always a factor of 2 we can just throw away 544 // all bits which fall outsize of this size to get a always increasing offset 545 // between 0 and dr.elemCount 546 off := *producer & (dr.elemCount - 1) 547 548 // Write the address to the current producer pos 549 *(*descriptor)(unsafe.Pointer(uintptr(dr.ring) + uintptr(off)*descSize)) = desc 550 551 *producer++ 552 553 return nil 554 } 555 556 // xskAddrRing is a ring buffer containing addresses (uint64) used for the fill and completion rings 557 type xskAddrRing struct { 558 xskRing 559 } 560 561 const addrSize = unsafe.Sizeof(uint64(0)) 562 563 func (ar *xskAddrRing) Dequeue() *uint64 { 564 producer := (*uint32)(ar.producer) 565 consumer := (*uint32)(ar.consumer) 566 567 if (*producer - *consumer) == 0 { 568 return nil 569 } 570 571 // The linux kernel uses the wraparound of an integer to reset the consumer and 572 // producer. 
And since ring buffers are always a factor of 2 we can just throw away 573 // all bits which fall outsize of this size to get a always increasing offset 574 // between 0 and ar.elemCount 575 off := *consumer & (ar.elemCount - 1) 576 addr := (*uint64)(unsafe.Pointer(uintptr(ar.ring) + uintptr(off)*addrSize)) 577 578 *consumer++ 579 580 return addr 581 } 582 583 var errBufferFull = errors.New("ring buffer is full") 584 585 func (ar *xskAddrRing) Enqueue(addr uint64) error { 586 producer := (*uint32)(ar.producer) 587 consumer := (*uint32)(ar.consumer) 588 589 // If the diff between producer and consumer is larger than the elem count the buffer is full 590 if (*producer - *consumer) == ar.elemCount-1 { 591 return errBufferFull 592 } 593 594 // The linux kernel uses the wraparound of an integer to reset the consumer and 595 // producer. And since ring buffers are always a factor of 2 we can just throw away 596 // all bits which fall outsize of this size to get a always increasing offset 597 // between 0 and dr.elemCount 598 off := *producer & (ar.elemCount - 1) 599 600 // Write the address to the current producer pos 601 *(*uint64)(unsafe.Pointer(uintptr(ar.ring) + uintptr(off)*addrSize)) = addr 602 603 *producer++ 604 605 return nil 606 } 607 608 type xskRing struct { 609 // Hold a reference to the mmap so we can unmmap it later 610 mmap []byte 611 elemCount uint32 612 // This double pointer is owned by the producer, it points to the last element in the ring buffer that was added 613 producer unsafe.Pointer 614 // This double pointer is owned by the consumer, it points to the last element in the ring buffer that was consumed 615 consumer unsafe.Pointer 616 // A pointer to the start of the ring buffer 617 ring unsafe.Pointer 618 flags unsafe.Pointer 619 } 620 621 func (xr *xskRing) Close() error { 622 if xr.mmap != nil { 623 return syscall.Munmap(xr.mmap) 624 } 625 xr.mmap = nil 626 627 return nil 628 } 629 630 func newXskRing(mmap []byte, off ringOffset, elemCount 
uint32) xskRing { 631 return xskRing{ 632 mmap: mmap, 633 consumer: unsafe.Pointer(&mmap[off.consumer]), 634 producer: unsafe.Pointer(&mmap[off.producer]), 635 ring: unsafe.Pointer(&mmap[off.desc]), 636 flags: unsafe.Pointer(&mmap[off.flags]), 637 elemCount: elemCount, 638 } 639 } 640 641 // https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/if_xdp.h 642 643 // struct xdp_umem_reg { 644 // __u64 addr; /* Start of packet data area */ 645 // __u64 len; /* Length of packet data area */ 646 // __u32 chunk_size; 647 // __u32 headroom; 648 // __u32 flags; 649 // }; 650 type umemReg struct { 651 addr uint64 652 len uint64 653 chunkSize uint32 654 headroom uint32 655 flags uint32 //nolint:structcheck // unused reserved for future use 656 } 657 658 // struct xdp_ring_offset { 659 // __u64 producer; 660 // __u64 consumer; 661 // __u64 desc; 662 // __u64 flags; 663 // }; 664 type ringOffset struct { 665 producer uint64 666 consumer uint64 667 desc uint64 668 flags uint64 669 } 670 671 type ringOffsetNoFlags struct { 672 producer uint64 673 consumer uint64 674 desc uint64 675 } 676 677 // struct xdp_mmap_offsets { 678 // struct xdp_ring_offset rx; 679 // struct xdp_ring_offset tx; 680 // struct xdp_ring_offset fr; /* Fill */ 681 // struct xdp_ring_offset cr; /* Completion */ 682 // }; 683 type mmapOffsets struct { 684 rx ringOffset 685 tx ringOffset 686 fr ringOffset 687 cr ringOffset 688 } 689 690 // struct xdp_desc { 691 // __u64 addr; 692 // __u32 len; 693 // __u32 options; 694 // }; 695 type descriptor struct { 696 addr uint64 697 len uint32 698 // options is reserved and not used, setting it to anything other than 0 is invalid in 5.12.2 699 // https://elixir.bootlin.com/linux/v5.12.2/source/net/xdp/xsk_queue.h#L141 700 options uint32 //nolint:structcheck // not used but reserved for future use (also for descSize) 701 } 702 703 var descSize = unsafe.Sizeof(descriptor{}) 704 705 // struct sockaddr_xdp { 706 // __u16 sxdp_family; 707 // __u16 sxdp_flags; 708 
//	__u32 sxdp_ifindex;
//	__u32 sxdp_queue_id;
//	__u32 sxdp_shared_umem_fd;
// };
type xdpSockAddr struct {
	sxdpFamily       uint16
	sxdpFlags        uint16
	sxdpIfIndex      uint32
	sxdpQueueID      uint32
	sxdpSharedUmemFD uint32
}

// XSKSettings is the configuration for a single XSKSocket.
type XSKSettings struct {
	// Size of the umem frames/packet buffers (2048 or 4096)
	FrameSize int
	// Amount of frames/packets which can be used, must be a power of 2
	FrameCount int
	// The index of the network device on which XSK will be used
	NetDevIfIndex int
	// The id of the Queue on which this XSK will be used
	QueueID int
	// How much unused space should be left at the start of each buffer.
	// This can be used to for example encapsulate a packet without having to move or copy memory
	Headroom int
	// Is Tx disabled for this socket?
	DisableTx bool
	// Is Rx disabled for this socket?
	DisableRx bool
	// If true, XDP_USE_NEED_WAKEUP is not used. Should be on by default
	// unless there is a reason it doesn't work (like on older kernels)
	DisableNeedWakeup bool
	// If true, zero copy mode is forced. By default zero copy mode is attempted and if not available
	// in the driver will automatically fallback to copy mode.
	ForceZeroCopy bool
	// If true, copy mode is always used and zero copy mode never attempted.
	ForceCopy bool
	// The minimum time between two checks of the completion queue. A lower value allows for more transmitted
	// packets per seconds at the cost of higher CPU usage, even when not transmitting.
	// By default this value is 10ms which seems a sane value, it means that there is a theoretical max TX rate of
	// (1000/10) * (tx ring size) which is 100 * 2048 = 204,800 packets per second when DisableRx = false
	// or 100 * 4096 = 409,600 when DisableRx = true at the default FrameCount of 4096.
	// Setting this setting to 0 will cause one goroutine to busy poll(use 100% CPU) per socket.
	CQConsumeInterval *time.Duration
}

// Same defaults as libbpf https://elixir.bootlin.com/linux/latest/source/tools/lib/bpf/xsk.h#L192
const (
	defaultFrameCount = 4096
	defaultFrameSize  = 4096
)

// Compile-time checks that XSKSocket implements the frame interfaces and io.Closer.
var (
	_ FrameReader = (*XSKSocket)(nil)
	_ FrameWriter = (*XSKSocket)(nil)
	_ FrameLeaser = (*XSKSocket)(nil)
	_ io.Closer   = (*XSKSocket)(nil)
)

// A XSKSocket can bind to one queue on one netdev
type XSKSocket struct {
	fd int

	// memory region where frames are exchanged with kernel
	umem     []byte
	settings XSKSettings

	// Buffered channel containing addresses of frames which can be used
	// for transmission
	txAddrs          chan uint64
	completionTicker *time.Ticker

	// rmu guards the read path, wmu the write path, fmu the fill ring.
	rmu sync.Mutex
	wmu sync.Mutex
	fmu sync.Mutex

	// The four XSK rings shared with the kernel.
	rx         xskDescRing
	tx         xskDescRing
	fill       xskAddrRing
	completion xskAddrRing

	// Poll timeouts in milliseconds: 0 = never block, -1 = block forever, >0 = wait that long.
	readTimeout  int
	writeTimeout int
}

// NewXSKSocket creates an AF_XDP socket according to settings: it registers the umem,
// sizes and mmaps the four rings, starts the completion worker, binds the socket to the
// netdev/queue, and pre-fills the fill ring with the rx frames.
func NewXSKSocket(settings XSKSettings) (_ *XSKSocket, err error) {
	if !kernelsupport.CurrentFeatures.Map.Has(kernelsupport.KFeatMapAFXDP) {
		return nil, fmt.Errorf("XSK/AF_XDP is not supported by the current kernel version")
	}

	if settings.FrameCount == 0 {
		settings.FrameCount = defaultFrameCount
	}

	if settings.FrameSize == 0 {
		settings.FrameSize = defaultFrameSize
	}

	if !isPowerOfTwo(settings.FrameCount) {
		return nil, fmt.Errorf("frame count must be a power of 2")
	}

	if settings.FrameSize != 2048 && settings.FrameSize != 4096 {
		// TODO allow frame sizes which are not aligned to 2k but enable
		// XDP_UMEM_UNALIGNED_CHUNK_FLAG when this happens
		return nil, fmt.Errorf("frame size must be 2048 or 4096")
	}

	if settings.DisableTx && settings.DisableRx {
		return nil, fmt.Errorf("tx and rx can't both be disabled")
	}

	if settings.ForceCopy && settings.ForceZeroCopy {
		return nil, fmt.Errorf("can't force both zero-copy and copy mode")
	}

	umemSize := settings.FrameSize * settings.FrameCount
	xskSock := &XSKSocket{
		umem:     make([]byte, umemSize),
		settings: settings,
	}

	xskSock.fd, err = syscall.Socket(unix.AF_XDP, syscall.SOCK_RAW, 0)
	if err != nil {
		return nil, fmt.Errorf("syscall socket: %w", err)
	}
	// If we return with an error, close the socket so we don't leak resources
	defer func() {
		if err != nil {
			xskSock.Close()
		}
	}()

	reg := umemReg{
		addr:      uint64(uintptr(unsafe.Pointer(&xskSock.umem[0]))),
		len:       uint64(len(xskSock.umem)),
		chunkSize: uint32(settings.FrameSize),
		headroom:  uint32(settings.Headroom),
		// TODO flags
	}
	// Register the umem
	err = bpfSyscall.Setsockopt(
		xskSock.fd,
		unix.SOL_XDP,
		unix.XDP_UMEM_REG,
		unsafe.Pointer(&reg),
		unsafe.Sizeof(reg),
	)
	if err != nil {
		return nil, fmt.Errorf("set sockopt UMEM_REG: %w", err)
	}

	// Assume both are enabled: split the frames evenly between rx and tx.
	rxCount := settings.FrameCount / 2
	txCount := rxCount

	// If tx is disabled, all frames go to rx, and vice versa.
	if settings.DisableTx {
		txCount = 0
		rxCount = settings.FrameCount
	} else if settings.DisableRx {
		txCount = settings.FrameCount
		rxCount = 0
	}

	// Tell the kernel how large the fill ring should be
	err = bpfSyscall.Setsockopt(
		xskSock.fd,
		unix.SOL_XDP,
		unix.XDP_UMEM_FILL_RING,
		unsafe.Pointer(&rxCount),
		unsafe.Sizeof(rxCount),
	)
	if err != nil {
		return nil, fmt.Errorf("set sockopt XDP_UMEM_FILL_RING: %w", err)
	}

	// Tell the kernel how large the completion ring should be
	err = bpfSyscall.Setsockopt(
		xskSock.fd,
		unix.SOL_XDP,
		unix.XDP_UMEM_COMPLETION_RING,
		unsafe.Pointer(&txCount),
		unsafe.Sizeof(txCount),
	)
	if err != nil {
		return nil, fmt.Errorf("set sockopt XDP_UMEM_COMPLETION_RING: %w", err)
	}

	// Ask the kernel where the rings live within their mmap regions.
	offsets, err := getMMapOffsets(xskSock.fd)
	if err != nil {
		return nil, fmt.Errorf("get mmap offsets: %w", err)
	}

	mmap, err := syscall.Mmap(
		xskSock.fd,
		unix.XDP_UMEM_PGOFF_FILL_RING,
		int(offsets.fr.desc)+rxCount*int(unsafe.Sizeof(uint64(0))),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
	)
	if err != nil {
		return nil, fmt.Errorf("mmap fill ring: %w", err)
	}
	xskSock.fill = xskAddrRing{
		xskRing: newXskRing(mmap, offsets.fr, uint32(rxCount)),
	}

	mmap, err = syscall.Mmap(
		xskSock.fd,
		unix.XDP_UMEM_PGOFF_COMPLETION_RING,
		int(offsets.cr.desc)+txCount*int(unsafe.Sizeof(uint64(0))),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
	)
	if err != nil {
		return nil, fmt.Errorf("mmap completion ring: %w", err)
	}

	xskSock.completion = xskAddrRing{
		xskRing: newXskRing(mmap, offsets.cr, uint32(txCount)),
	}

	xskSock.txAddrs = make(chan uint64, txCount+1)
	// The tx frames occupy the umem region after the rx frames.
	txOffset := rxCount * settings.FrameSize
	// Fill the txAddrs channel with available addresses to use during transmission
	for i := 0; i < txCount; i++ {
		xskSock.txAddrs <- uint64(txOffset + i*settings.FrameSize)
	}

	// TODO allow for completion worker pooling (having one worker check multiple sockets)
	// this would allow a user to dedicate 1 or 2 CPU cores to busy polling all sockets of a
	// particular netdev or even the whole host.

	interval := 10 * time.Millisecond
	if settings.CQConsumeInterval != nil {
		interval = *settings.CQConsumeInterval
	}
	xskSock.completionTicker = time.NewTicker(interval)
	go xskSock.completionWorker()

	// Tell the kernel how large the rx ring should be
	err = bpfSyscall.Setsockopt(
		xskSock.fd,
		unix.SOL_XDP,
		unix.XDP_RX_RING,
		unsafe.Pointer(&rxCount),
		unsafe.Sizeof(rxCount),
	)
	if err != nil {
		return nil, fmt.Errorf("set sockopt XDP_RX_RING: %w", err)
	}

	// Tell the kernel how large the tx ring should be
	err = bpfSyscall.Setsockopt(
		xskSock.fd,
		unix.SOL_XDP,
		unix.XDP_TX_RING,
		unsafe.Pointer(&txCount),
		unsafe.Sizeof(txCount),
	)
	if err != nil {
		return nil, fmt.Errorf("set sockopt XDP_TX_RING: %w", err)
	}

	mmap, err = syscall.Mmap(
		xskSock.fd,
		unix.XDP_PGOFF_RX_RING,
		int(offsets.rx.desc)+rxCount*int(unsafe.Sizeof(descriptor{})),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
	)
	if err != nil {
		return nil, fmt.Errorf("mmap rx ring: %w", err)
	}
	xskSock.rx = xskDescRing{
		xskRing: newXskRing(mmap, offsets.rx, uint32(rxCount)),
	}

	mmap, err = syscall.Mmap(
		xskSock.fd,
		unix.XDP_PGOFF_TX_RING,
		int(offsets.tx.desc)+txCount*int(unsafe.Sizeof(descriptor{})),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
	)
	if err != nil {
		return nil, fmt.Errorf("mmap tx ring: %w", err)
	}
	xskSock.tx = xskDescRing{
		xskRing: newXskRing(mmap, offsets.tx, uint32(txCount)),
	}

	var flags uint16
	if !settings.DisableNeedWakeup {
		flags |= unix.XDP_USE_NEED_WAKEUP
	}

	if settings.ForceCopy {
		flags |= unix.XDP_COPY
	}

	if settings.ForceZeroCopy {
		flags |= unix.XDP_ZEROCOPY
	}

	sockAddr := xdpSockAddr{
		sxdpFamily:       unix.AF_XDP,
		sxdpIfIndex:      uint32(settings.NetDevIfIndex),
		sxdpQueueID:      uint32(settings.QueueID),
		sxdpSharedUmemFD: uint32(xskSock.fd),
		sxdpFlags:        flags,
	}
	err = bpfSyscall.Bind(xskSock.fd, unsafe.Pointer(&sockAddr), bpfSyscall.Socklen(unsafe.Sizeof(sockAddr)))
	if err != nil {
		return nil, fmt.Errorf("bind: %w", err)
	}

	// Give all Rx frames to the kernel.
	// NOTE(review): this enqueues rxCount-1 frames — consistent with the ring Enqueue
	// methods which treat the ring as full at elemCount-1 entries.
	for i := 0; i < rxCount-1; i++ {
		err = xskSock.fill.Enqueue(uint64(i * settings.FrameSize))
		if err != nil {
			return nil, fmt.Errorf("fill enqueue: %w", err)
		}
	}
	err = xskSock.wakeupFill()
	if err != nil {
		return nil, fmt.Errorf("wakeupFill: %w", err)
	}
	// NOTE Tx frames are enqueued after they have been filled as a signal to transmit them

	return xskSock, nil
}

// Fd returns the file descriptor of the socket.
func (xs *XSKSocket) Fd() int {
	return xs.fd
}

// SetWriteTimeout sets the timeout for Write and XSKLease.WriteBack calls.
// If ms == 0 (default), we will never block/wait and error if we can't write at once.
// If ms == -1, we will block forever until we can write.
// If ms > 0, we will wait for x milliseconds for an opportunity to write or error afterwards.
func (xs *XSKSocket) SetWriteTimeout(ms int) error {
	if ms < -1 {
		return fmt.Errorf("timeout must be -1, 0, or positive amount of miliseconds")
	}

	xs.writeTimeout = ms

	return nil
}

// SetReadTimeout sets the timeout for Read and ReadLease calls.
// If ms == 0 (default), we will never block/wait and return no data if there isn't any ready.
// If ms == -1, we will block forever until we can read.
// If ms > 0, we will wait for x milliseconds for an opportunity to read or return no data.
1064 func (xs *XSKSocket) SetReadTimeout(ms int) error { 1065 if ms < -1 { 1066 return fmt.Errorf("timeout must be -1, 0, or positive amount of miliseconds") 1067 } 1068 1069 xs.readTimeout = ms 1070 1071 return nil 1072 } 1073 1074 // If the need wakeup flag is set on the ring the kernel requests that we 1075 // wakeup the fill ring with a poll syscall 1076 // https://patchwork.ozlabs.org/project/netdev/patch/1560411450-29121-3-git-send-email-magnus.karlsson@intel.com/ 1077 func (xs *XSKSocket) wakeupFill() error { 1078 if *(*uint32)(xs.fill.flags)&unix.XDP_RING_NEED_WAKEUP == 1 { 1079 _, err := unix.Poll([]unix.PollFd{{Fd: int32(xs.fd), Events: unix.POLLOUT}}, 0) 1080 if err != nil { 1081 return fmt.Errorf("poll fill: %w", err) 1082 } 1083 } 1084 1085 return nil 1086 } 1087 1088 // If the need wakeup flag is set on the ring the kernel requests that we 1089 // wakeup the fill ring with a poll syscall 1090 // https://patchwork.ozlabs.org/project/netdev/patch/1560411450-29121-3-git-send-email-magnus.karlsson@intel.com/ 1091 func (xs *XSKSocket) wakeupTx() error { 1092 if *(*uint32)(xs.tx.flags)&unix.XDP_RING_NEED_WAKEUP == 1 { 1093 err := bpfSyscall.Sendto( 1094 xs.fd, 1095 nil, 1096 syscall.MSG_DONTWAIT, 1097 unsafe.Pointer(&bpfSyscall.Zero), 1098 bpfSyscall.Socklen(0), 1099 ) 1100 if err != nil { 1101 if sysErr, ok := err.(*bpfSyscall.Error); ok { 1102 switch sysErr.Errno { 1103 // These errors occur regulairly when load is high, ignore these errors, the next time 1104 // wakeupTx is called it will trigger the kernel to read the full ring anyway. 
1105 // https://github.com/torvalds/linux/blob/b741596468b010af2846b75f5e75a842ce344a6e/samples/bpf/xdpsock_user.c#L1095 1106 //nolint:lll 1107 case syscall.EBUSY, 1108 syscall.EAGAIN, 1109 syscall.ENOBUFS, 1110 syscall.ENETDOWN: 1111 return nil 1112 } 1113 } 1114 1115 return fmt.Errorf("syscall sendto: %w", err) 1116 } 1117 } 1118 1119 return nil 1120 } 1121 1122 func (xs *XSKSocket) dequeueRx() (*descriptor, error) { 1123 desc := xs.rx.Dequeue() 1124 // there is nothing to dequeue 1125 if desc == nil { 1126 // Return at once if blocking is disabled 1127 if xs.readTimeout == 0 { 1128 return nil, nil 1129 } 1130 1131 n, err := unix.Poll([]unix.PollFd{{Fd: int32(xs.fd), Events: unix.POLLIN}}, xs.readTimeout) 1132 if err != nil { 1133 // Sometimes a poll is interrupted by a signal, no a real error 1134 // lets treat it like a timeout 1135 if err == syscall.EINTR { 1136 return nil, nil 1137 } 1138 1139 return nil, fmt.Errorf("poll: %w", err) 1140 } 1141 1142 // If n == 0, the timeout was reached 1143 if n == 0 { 1144 return nil, nil 1145 } 1146 1147 desc = xs.rx.Dequeue() 1148 if desc == nil { 1149 return desc, fmt.Errorf("no desc after poll") 1150 } 1151 } 1152 1153 return desc, nil 1154 } 1155 1156 // ReadFrame implements FrameReader, however we have to implement this with a memory copy which is not ideal 1157 // for efficiency. For zero copy packet access ReadLease should be used. 1158 func (xs *XSKSocket) ReadFrame(p []byte) (n int, err error) { 1159 xs.rmu.Lock() 1160 defer xs.rmu.Unlock() 1161 1162 desc, err := xs.dequeueRx() 1163 if err != nil { 1164 return 0, fmt.Errorf("dequeue rx: %w", err) 1165 } 1166 if desc == nil { 1167 return 0, nil 1168 } 1169 1170 // unlike the ReadLease function, we ignore headspace since any benefit is lost 1171 // during the copy. 
1172 len := copy(p, xs.umem[desc.addr:desc.addr+uint64(desc.len)]) 1173 1174 err = xs.fill.Enqueue(addrToFrameStart(desc.addr, xs.settings.FrameSize)) 1175 if err != nil { 1176 return len, fmt.Errorf("fill enqueue: %w", err) 1177 } 1178 1179 err = xs.wakeupFill() 1180 if err != nil { 1181 return len, err 1182 } 1183 1184 return len, nil 1185 } 1186 1187 // WriteLease creates a XSKLease which points to a piece of preallocated memory. This memory can be used to 1188 // build packets for writing. Unlike XSKLeases gotten from ReadLease, write leases have no Headroom. 1189 // The Data slice of the lease is the full length of the usable frame, this length should not be exceeded. 1190 // Any memory held by the lease can't be reused until released or written. 1191 // 1192 // This function blocks until a frame for transmission is available and is not subject to the write timeout. 1193 func (xs *XSKSocket) WriteLease() (lease *XSKLease, err error) { 1194 addr := <-xs.txAddrs 1195 return &XSKLease{ 1196 Headroom: 0, 1197 Data: xs.umem[addr : addr+uint64(xs.settings.FrameSize)], 1198 dataAddr: addr, 1199 sock: xs, 1200 fromTx: true, 1201 }, nil 1202 } 1203 1204 // ReadLease reads a frame from the socket and returns its memory in a XSKLease. After reading the contents of the 1205 // frame it can be released or written, both will allow the memory to be reused. Calling Write on the lease will 1206 // cause the contents of Data to be written back to the network interface. The contents of Data can be modified 1207 // before calling Write thus allowing a program to implement zero-copy/zero-allocation encaptulation or 1208 // request/response protocols. 
1209 func (xs *XSKSocket) ReadLease() (lease *XSKLease, err error) { 1210 xs.rmu.Lock() 1211 defer xs.rmu.Unlock() 1212 1213 desc, err := xs.dequeueRx() 1214 if err != nil { 1215 return nil, fmt.Errorf("dequeue rx: %w", err) 1216 } 1217 if desc == nil { 1218 return nil, nil 1219 } 1220 1221 return &XSKLease{ 1222 Headroom: xs.settings.Headroom, 1223 Data: xs.umem[desc.addr-uint64(xs.settings.Headroom) : desc.addr+uint64(desc.len)], 1224 dataAddr: desc.addr, 1225 sock: xs, 1226 }, nil 1227 } 1228 1229 func (xs *XSKSocket) enqueueTx(desc descriptor) error { 1230 err := xs.tx.Enqueue(desc) 1231 if err != nil { 1232 if err != errBufferFull { 1233 // Put the frame address back in the chan so we don't lose it 1234 xs.txAddrs <- desc.addr 1235 1236 return fmt.Errorf("tx enqueue: %w", err) 1237 } 1238 1239 _, err := unix.Poll([]unix.PollFd{{Fd: int32(xs.fd), Events: unix.POLLOUT}}, xs.writeTimeout) 1240 if err != nil { 1241 return fmt.Errorf("poll: %w", err) 1242 } 1243 1244 err = xs.tx.Enqueue(desc) 1245 if err != nil { 1246 // Put the frame address back in the chan so we don't lose it 1247 xs.txAddrs <- desc.addr 1248 1249 return fmt.Errorf("tx enqueue: %w", err) 1250 } 1251 } 1252 1253 return nil 1254 } 1255 1256 // WriteFrame implements FrameWriter. The interface requires us to copy p into umem which is not 1257 // optimal for speed. For maximum performance use WriteLease instead. 
1258 func (xs *XSKSocket) WriteFrame(p []byte) (n int, err error) { 1259 xs.wmu.Lock() 1260 defer xs.wmu.Unlock() 1261 1262 if len(p) > xs.settings.FrameSize { 1263 return 0, fmt.Errorf("data is larget than frame size of %d", xs.settings.FrameSize) 1264 } 1265 1266 // We assume we will never be blocking here for long 1267 addr := <-xs.txAddrs 1268 1269 len := copy(xs.umem[addr:addr+uint64(len(p))], p) 1270 1271 err = xs.enqueueTx(descriptor{ 1272 addr: addr, 1273 len: uint32(len), 1274 }) 1275 if err != nil { 1276 xs.txAddrs <- addr 1277 return 0, err 1278 } 1279 1280 err = xs.wakeupTx() 1281 if err != nil { 1282 return 0, err 1283 } 1284 1285 return len, nil 1286 } 1287 1288 func (xs *XSKSocket) Close() error { 1289 err := xs.rx.Close() 1290 if err != nil { 1291 return fmt.Errorf("rx close: %w", err) 1292 } 1293 1294 err = xs.tx.Close() 1295 if err != nil { 1296 return fmt.Errorf("tx close: %w", err) 1297 } 1298 1299 err = xs.fill.Close() 1300 if err != nil { 1301 return fmt.Errorf("fill close: %w", err) 1302 } 1303 1304 if xs.completionTicker != nil { 1305 xs.completionTicker.Stop() 1306 } 1307 1308 err = xs.completion.Close() 1309 if err != nil { 1310 return fmt.Errorf("completion close: %w", err) 1311 } 1312 1313 if xs.fd != 0 { 1314 err = syscall.Close(xs.fd) 1315 if err != nil { 1316 return fmt.Errorf("socket close: %w", err) 1317 } 1318 1319 xs.fd = 0 1320 } 1321 1322 return nil 1323 } 1324 1325 // completionWorker is started when a socket is created and is responsible for dequeueing the completion ring 1326 // and transferring the free address to the txAddrs chan so they can be re-used 1327 func (xs *XSKSocket) completionWorker() { 1328 // As long as the completion ring still is mapped 1329 for xs.completion.mmap != nil { 1330 // Every tick of the completion ticket, dequeue the whole completion queue 1331 // and put the frame addrsses on the txAddrs list 1332 for xs.completion.mmap != nil { 1333 addr := xs.completion.Dequeue() 1334 if addr == nil { 1335 
break 1336 } 1337 1338 xs.txAddrs <- addrToFrameStart(*addr, xs.settings.FrameSize) 1339 } 1340 1341 // TODO auto ajust completion ticker (slow down using idle time and speed up during high tx rate) 1342 1343 <-xs.completionTicker.C 1344 } 1345 } 1346 1347 func getMMapOffsets(fd int) (offsets mmapOffsets, err error) { 1348 if kernelsupport.CurrentFeatures.Misc.Has(kernelsupport.KFeatMiscXSKRingFlags) { 1349 len := bpfSyscall.Socklen(unsafe.Sizeof(offsets)) 1350 err = bpfSyscall.Getsockopt( 1351 fd, 1352 unix.SOL_XDP, 1353 unix.XDP_MMAP_OFFSETS, 1354 unsafe.Pointer(&offsets), 1355 &len, 1356 ) 1357 if err != nil { 1358 return offsets, fmt.Errorf("get sockopt XDP_MMAP_OFFSETS: %w", err) 1359 } 1360 } else { 1361 nfOff, err := getMMapOffsetsNoFlags(fd) 1362 if err != nil { 1363 return offsets, fmt.Errorf("no flag offsets: %w", err) 1364 } 1365 offsets.rx = ringOffset{ 1366 consumer: nfOff[0].consumer, 1367 producer: nfOff[0].producer, 1368 desc: nfOff[0].desc, 1369 } 1370 offsets.tx = ringOffset{ 1371 consumer: nfOff[1].consumer, 1372 producer: nfOff[1].producer, 1373 desc: nfOff[1].desc, 1374 } 1375 offsets.cr = ringOffset{ 1376 consumer: nfOff[2].consumer, 1377 producer: nfOff[2].producer, 1378 desc: nfOff[2].desc, 1379 } 1380 offsets.fr = ringOffset{ 1381 consumer: nfOff[3].consumer, 1382 producer: nfOff[3].producer, 1383 desc: nfOff[3].desc, 1384 } 1385 } 1386 1387 return offsets, nil 1388 } 1389 1390 func getMMapOffsetsNoFlags(fd int) (offsets [4]ringOffsetNoFlags, err error) { 1391 len := bpfSyscall.Socklen(unsafe.Sizeof(offsets)) 1392 err = bpfSyscall.Getsockopt( 1393 fd, 1394 unix.SOL_XDP, 1395 unix.XDP_MMAP_OFFSETS, 1396 unsafe.Pointer(&offsets), 1397 &len, 1398 ) 1399 if err != nil { 1400 return offsets, fmt.Errorf("get sockopt XDP_MMAP_OFFSETS: %w", err) 1401 } 1402 1403 return offsets, nil 1404 } 1405 1406 func isPowerOfTwo(x int) bool { 1407 return (x != 0) && ((x & (x - 1)) == 0) 1408 } 1409 1410 // GetNetDevQueueCount uses the 
/sys/class/net/<dev>/queues/ directory to figure out how many queues a network 1411 // device has. Knowing the number of queues is critical when binding XSK sockets to a network device. 1412 func GetNetDevQueueCount(netdev string) (int, error) { 1413 if strings.ContainsAny(netdev, "/") { 1414 return 0, fmt.Errorf("network device name should not contain slashes") 1415 } 1416 1417 entries, err := os.ReadDir(fmt.Sprintf("/sys/class/net/%s/queues", netdev)) 1418 if err != nil { 1419 return 0, fmt.Errorf("os.Lstat: %w", err) 1420 } 1421 1422 // Just count the RX queues, we can assume there are as much TX queues as RX queues 1423 count := 0 1424 for _, entry := range entries { 1425 if strings.HasPrefix(entry.Name(), "rx-") { 1426 count++ 1427 } 1428 } 1429 1430 return count, nil 1431 }