github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/xdp/xdp.go

// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64 || arm64
// +build amd64 arm64

// Package xdp provides tools for working with AF_XDP sockets.
//
// AF_XDP shares a memory area (UMEM) with the kernel to pass packets
// back and forth. Communication happens via a number of queues.
// Briefly, the queues work as follows:
//
//   - Receive: Userspace adds a descriptor to the fill queue. The
//     descriptor points to an area of the UMEM that the kernel should fill
//     with an incoming packet. The kernel fills in the packet and places a
//     descriptor pointing to the same UMEM area in the RX queue, signifying
//     that userspace may read the packet.
//   - Transmit: Userspace adds a descriptor to the TX queue. The kernel
//     sends the packet (stored in UMEM) pointed to by the descriptor.
//     Upon completion, the kernel places a descriptor in the completion
//     queue to notify userspace that the packet has been sent and the
//     UMEM area can be reused.
//
// So in short: RX packets move from the fill queue to the RX queue, and TX
// packets move from the TX queue to the completion queue.
//
// Note that the shared UMEM for RX and TX means that packet forwarding
// can be done without copying; only the queues need to be updated to point to
// the packet in UMEM.
package xdp

import (
	"fmt"
	"math/bits"

	"github.com/ttpreport/gvisor-ligolo/pkg/cleanup"
	"github.com/ttpreport/gvisor-ligolo/pkg/log"
	"github.com/ttpreport/gvisor-ligolo/pkg/memutil"
	"golang.org/x/sys/unix"
)

// A ControlBlock contains all the control structures necessary to use an
// AF_XDP socket.
//
// The ControlBlock and the structures it contains are meant to be used with a
// single RX goroutine and a single TX goroutine.
type ControlBlock struct {
	UMEM       UMEM
	Fill       FillQueue
	RX         RXQueue
	TX         TXQueue
	Completion CompletionQueue
}

// ReadOnlySocketOpts configure a read-only AF_XDP socket.
type ReadOnlySocketOpts struct {
	NFrames      uint32
	FrameSize    uint32
	NDescriptors uint32
}

// DefaultReadOnlyOpts provides recommended default options for initializing a
// read-only AF_XDP socket. AF_XDP setup is extremely finicky and can fail if
// incorrect values are used.
func DefaultReadOnlyOpts() ReadOnlySocketOpts {
	return ReadOnlySocketOpts{
		NFrames: 4096,
		// Frames must be 2048 or 4096 bytes, although not all drivers
		// support both sizes.
		FrameSize:    4096,
		NDescriptors: 2048,
	}
}
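// A hypothetical usage sketch: creating a read-only socket bound to one RX
// queue of an interface. The interface name "eth0" and queue ID 0 are
// illustrative assumptions, not requirements of this package.
//
//	iface, err := net.InterfaceByName("eth0") // assumed interface name
//	if err != nil {
//		panic(err)
//	}
//	cb, err := xdp.ReadOnlySocket(uint32(iface.Index), 0 /* queueID */, xdp.DefaultReadOnlyOpts())
//	if err != nil {
//		panic(err)
//	}
//	// cb.Fill must be given free UMEM frames before packets can be
//	// received from cb.RX.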
// ReadOnlySocket returns an initialized read-only AF_XDP socket bound to a
// particular interface and queue.
func ReadOnlySocket(ifaceIdx, queueID uint32, opts ReadOnlySocketOpts) (*ControlBlock, error) {
	sockfd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0)
	if err != nil {
		return nil, fmt.Errorf("failed to create AF_XDP socket: %v", err)
	}
	return ReadOnlyFromSocket(sockfd, ifaceIdx, queueID, opts)
}

// ReadOnlyFromSocket takes an AF_XDP socket, initializes it, and binds it to a
// particular interface and queue.
func ReadOnlyFromSocket(sockfd int, ifaceIdx, queueID uint32, opts ReadOnlySocketOpts) (*ControlBlock, error) {
	if opts.FrameSize != 2048 && opts.FrameSize != 4096 {
		return nil, fmt.Errorf("invalid frame size %d: must be either 2048 or 4096", opts.FrameSize)
	}
	if bits.OnesCount32(opts.NDescriptors) != 1 {
		return nil, fmt.Errorf("invalid number of descriptors %d: must be a power of 2", opts.NDescriptors)
	}

	var cb ControlBlock

	// Create the UMEM area. Use mmap instead of make([]byte) to ensure
	// that the UMEM is page-aligned. Aligning the UMEM keeps individual
	// packets from spilling over between pages.
	var zerofd uintptr
	umemMemory, err := memutil.MapSlice(
		0,
		uintptr(opts.NFrames*opts.FrameSize),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS,
		// zerofd-1 underflows to ^uintptr(0), i.e. the fd of -1 that
		// MAP_ANONYMOUS expects.
		zerofd-1,
		0,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap umem: %v", err)
	}
	cleanup := cleanup.Make(func() {
		memutil.UnmapSlice(umemMemory)
	})

	if sliceBackingPointer(umemMemory)%uintptr(unix.Getpagesize()) != 0 {
		return nil, fmt.Errorf("UMEM is not page aligned (address 0x%x)", sliceBackingPointer(umemMemory))
	}

	cb.UMEM = UMEM{
		mem:            umemMemory,
		sockfd:         uint32(sockfd),
		frameAddresses: make([]uint64, opts.NFrames),
		nFreeFrames:    opts.NFrames,
		frameMask:      ^(uint64(opts.FrameSize) - 1),
	}

	// Fill in each frame address.
	for i := range cb.UMEM.frameAddresses {
		cb.UMEM.frameAddresses[i] = uint64(i) * uint64(opts.FrameSize)
	}

	// Check whether we're likely to fail due to RLIMIT_MEMLOCK.
	var rlimit unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlimit); err != nil {
		return nil, fmt.Errorf("failed to get rlimit for memlock: %v", err)
	}
	if rlimit.Cur < uint64(len(cb.UMEM.mem)) {
		log.Infof("UMEM size (%d) may exceed RLIMIT_MEMLOCK (%+v) and cause registration to fail", len(cb.UMEM.mem), rlimit)
	}

	reg := unix.XDPUmemReg{
		Addr: uint64(sliceBackingPointer(umemMemory)),
		Len:  uint64(len(umemMemory)),
		Size: opts.FrameSize,
		// Not useful in the RX path.
		Headroom: 0,
		// TODO(b/240191988): Investigate use of SHARED flag.
		Flags: 0,
	}
	if err := registerUMEM(sockfd, reg); err != nil {
		return nil, fmt.Errorf("failed to register UMEM: %v", err)
	}
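	// Size each of the four rings. The fill and completion rings belong
	// to the UMEM, while the RX and TX rings belong to the socket itself.
	// The kernel requires each ring size to be a power of two (validated
	// above), which allows ring indices to wrap cheaply via the mask
	// fields (NDescriptors - 1) set on each queue below.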
	// Set the number of descriptors in the fill queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_FILL_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register fill ring: %v", err)
	}
	// Set the number of descriptors in the completion queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_COMPLETION_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register completion ring: %v", err)
	}
	// Set the number of descriptors in the RX queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_RX_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register RX queue: %v", err)
	}
	// Set the number of descriptors in the TX queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_TX_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register TX queue: %v", err)
	}

	// Get offset information for the queues. Offsets indicate where,
	// within the region mmapped for each queue, the shared producer and
	// consumer pointers, the shared flags value, and the beginning of the
	// ring of descriptors live.
	off, err := getOffsets(sockfd)
	if err != nil {
		return nil, fmt.Errorf("failed to get offsets: %v", err)
	}

	// Allocate space for the fill queue.
	fillQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Fr.Desc+uint64(opts.NDescriptors)*sizeOfFillQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_UMEM_PGOFF_FILL_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap fill queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(fillQueueMem)
	})
	// Set up the fill queue with offsets into the allocated memory.
	cb.Fill = FillQueue{
		mem:            fillQueueMem,
		mask:           opts.NDescriptors - 1,
		cachedConsumer: opts.NDescriptors,
	}
	cb.Fill.init(off, opts)

	// Allocate space for the completion queue.
	completionQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Cr.Desc+uint64(opts.NDescriptors)*sizeOfCompletionQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_UMEM_PGOFF_COMPLETION_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap completion queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(completionQueueMem)
	})
	// Set up the completion queue with offsets into the allocated memory.
	cb.Completion = CompletionQueue{
		mem:  completionQueueMem,
		mask: opts.NDescriptors - 1,
	}
	cb.Completion.init(off, opts)

	// Allocate space for the RX queue.
	rxQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Rx.Desc+uint64(opts.NDescriptors)*sizeOfRXQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_PGOFF_RX_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap RX queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(rxQueueMem)
	})
	// Set up the RX queue with offsets into the allocated memory.
	cb.RX = RXQueue{
		mem:  rxQueueMem,
		mask: opts.NDescriptors - 1,
	}
	cb.RX.init(off, opts)
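	// Note on cachedConsumer: the fill and TX queues are the two rings
	// that userspace produces into, so they keep a cached copy of the
	// kernel's consumer index to avoid an atomic read on every enqueue.
	// The cached value is offset by the ring size, which is why it starts
	// at NDescriptors on an empty ring: free slots are then simply
	// cachedConsumer minus the producer index.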
	// Allocate space for the TX queue.
	txQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Tx.Desc+uint64(opts.NDescriptors)*sizeOfTXQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_PGOFF_TX_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap TX queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(txQueueMem)
	})
	// Set up the TX queue with offsets into the allocated memory.
	cb.TX = TXQueue{
		sockfd:         uint32(sockfd),
		mem:            txQueueMem,
		mask:           opts.NDescriptors - 1,
		cachedConsumer: opts.NDescriptors,
	}
	cb.TX.init(off, opts)

	addr := unix.SockaddrXDP{
		// XDP_USE_NEED_WAKEUP lets the driver sleep if there is no
		// work to do. It will need to be woken by poll. It is expected
		// that this improves performance by preventing the driver from
		// burning cycles.
		//
		// By not setting either XDP_COPY or XDP_ZEROCOPY, we instruct
		// the kernel to use zero copy if available and to fall back to
		// copy mode otherwise.
		Flags:   unix.XDP_USE_NEED_WAKEUP,
		Ifindex: ifaceIdx,
		// An AF_XDP socket is bound to a single RX queue of a single
		// device, although multiple sockets on multiple queues (or
		// devices) can share a single UMEM.
		QueueID: queueID,
		// We're not using shared mode, so the value here is irrelevant.
		SharedUmemFD: 0,
	}
	if err := unix.Bind(sockfd, &addr); err != nil {
		return nil, fmt.Errorf("failed to bind with addr %+v: %v", addr, err)
	}

	cleanup.Release()
	return &cb, nil
}
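// A hypothetical sketch of honoring XDP_USE_NEED_WAKEUP on the receive path:
// when the driver has gone to sleep, userspace must poll the socket before
// expecting new RX descriptors. The sockfd below is assumed to be the fd
// wrapped by the ControlBlock.
//
//	pfds := []unix.PollFd{{Fd: int32(sockfd), Events: unix.POLLIN}}
//	if _, err := unix.Poll(pfds, -1 /* block indefinitely */); err != nil && err != unix.EINTR {
//		// Handle the error; EINTR just means the poll was interrupted.
//	}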