github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/xdp/xdp.go

// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64 || arm64
// +build amd64 arm64

// Package xdp provides tools for working with AF_XDP sockets.
//
// AF_XDP shares a memory area (UMEM) with the kernel to pass packets
// back and forth. Communication is done via a number of queues.
// Briefly, the queues work as follows:
//
//   - Receive: Userspace adds a descriptor to the fill queue. The
//     descriptor points to an area of the UMEM that the kernel should fill
//     with an incoming packet. The kernel fills the packet and places a
//     descriptor pointing to the same UMEM area in the RX queue, signifying
//     that userspace may read the packet.
//   - Transmit: Userspace adds a descriptor to the TX queue. The kernel
//     sends the packet (stored in UMEM) pointed to by the descriptor.
//     Upon completion, the kernel places a descriptor in the completion
//     queue to notify userspace that the packet is sent and the UMEM
//     area can be reused.
//
// So in short: RX packets move from the fill to RX queue, and TX
// packets move from the TX to completion queue.
//
// Note that the shared UMEM for RX and TX means that packet forwarding
// can be done without copying; only the queues need to be updated to point to
// the packet in UMEM.
package xdp

import (
	"fmt"
	"math/bits"

	"github.com/sagernet/gvisor/pkg/cleanup"
	"github.com/sagernet/gvisor/pkg/log"
	"github.com/sagernet/gvisor/pkg/memutil"
	"golang.org/x/sys/unix"
)

// A ControlBlock contains all the control structures necessary to use an
// AF_XDP socket.
//
// The ControlBlock and the structures it contains are meant to be used with a
// single RX goroutine and a single TX goroutine.
type ControlBlock struct {
	UMEM       UMEM
	Fill       FillQueue
	RX         RXQueue
	TX         TXQueue
	Completion CompletionQueue
}

// Opts configure an AF_XDP socket.
type Opts struct {
	NFrames       uint32
	FrameSize     uint32
	NDescriptors  uint32
	Bind          bool
	UseNeedWakeup bool
}

// DefaultOpts provides recommended default options for initializing an AF_XDP
// socket. AF_XDP setup is extremely finicky and can fail if incorrect values
// are used.
func DefaultOpts() Opts {
	return Opts{
		NFrames: 4096,
		// Frames must be 2048 or 4096 bytes, although not all drivers support
		// both.
		FrameSize:    4096,
		NDescriptors: 2048,
	}
}
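// The defaults above can be tuned per deployment. As a minimal illustrative
// sketch (exampleCustomOpts is hypothetical and not part of this package's
// API), a caller wanting the smaller frame size and driver wakeups might
// adjust the defaults like so:
func exampleCustomOpts() Opts {
	opts := DefaultOpts()
	// 2048 is the only other frame size accepted by NewFromSocket below.
	opts.FrameSize = 2048
	// Bind in this process and let the driver sleep until woken by poll.
	opts.Bind = true
	opts.UseNeedWakeup = true
	return opts
}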
// New returns an initialized AF_XDP socket bound to a particular interface
// and queue.
func New(ifaceIdx, queueID uint32, opts Opts) (*ControlBlock, error) {
	sockfd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0)
	if err != nil {
		return nil, fmt.Errorf("failed to create AF_XDP socket: %v", err)
	}
	return NewFromSocket(sockfd, ifaceIdx, queueID, opts)
}

// NewFromSocket takes an AF_XDP socket, initializes it, and binds it to a
// particular interface and queue.
func NewFromSocket(sockfd int, ifaceIdx, queueID uint32, opts Opts) (*ControlBlock, error) {
	if opts.FrameSize != 2048 && opts.FrameSize != 4096 {
		return nil, fmt.Errorf("invalid frame size %d: must be either 2048 or 4096", opts.FrameSize)
	}
	if bits.OnesCount32(opts.NDescriptors) != 1 {
		return nil, fmt.Errorf("invalid number of descriptors %d: must be a power of 2", opts.NDescriptors)
	}

	var cb ControlBlock

	// Create the UMEM area. Use mmap instead of make([]byte) to ensure
	// that the UMEM is page-aligned. Aligning the UMEM keeps individual
	// packets from spilling over between pages.
	var zerofd uintptr
	umemMemory, err := memutil.MapSlice(
		0,
		uintptr(opts.NFrames*opts.FrameSize),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS,
		// zerofd-1 wraps around to ^uintptr(0), i.e. fd -1, as required for
		// an anonymous mapping.
		zerofd-1,
		0,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap umem: %v", err)
	}
	cleanup := cleanup.Make(func() {
		memutil.UnmapSlice(umemMemory)
	})
	// Run the accumulated cleanups on any error return below. On success,
	// cleanup.Release() at the end of this function keeps the mappings alive.
	defer cleanup.Clean()

	if sliceBackingPointer(umemMemory)%uintptr(unix.Getpagesize()) != 0 {
		return nil, fmt.Errorf("UMEM is not page aligned (address 0x%x)", sliceBackingPointer(umemMemory))
	}

	cb.UMEM = UMEM{
		mem:            umemMemory,
		sockfd:         uint32(sockfd),
		frameAddresses: make([]uint64, opts.NFrames),
		nFreeFrames:    opts.NFrames,
		frameMask:      ^(uint64(opts.FrameSize) - 1),
	}

	// Fill in each frame address.
	for i := range cb.UMEM.frameAddresses {
		cb.UMEM.frameAddresses[i] = uint64(i) * uint64(opts.FrameSize)
	}

	// Check whether we're likely to fail due to RLIMIT_MEMLOCK.
	var rlimit unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlimit); err != nil {
		return nil, fmt.Errorf("failed to get rlimit for memlock: %v", err)
	}
	if rlimit.Cur < uint64(len(cb.UMEM.mem)) {
		log.Infof("UMEM size (%d) may exceed RLIMIT_MEMLOCK (%+v) and cause registration to fail", len(cb.UMEM.mem), rlimit)
	}

	reg := unix.XDPUmemReg{
		Addr: uint64(sliceBackingPointer(umemMemory)),
		Len:  uint64(len(umemMemory)),
		Size: opts.FrameSize,
		// Not useful in the RX path.
		Headroom: 0,
		// TODO(b/240191988): Investigate use of SHARED flag.
		Flags: 0,
	}
	if err := registerUMEM(sockfd, reg); err != nil {
		return nil, fmt.Errorf("failed to register UMEM: %v", err)
	}

	// Set the number of descriptors in the fill queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_FILL_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register fill ring: %v", err)
	}
	// Set the number of descriptors in the completion queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_COMPLETION_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register completion ring: %v", err)
	}
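	// Note on ring ownership: the fill and completion rings configured above
	// are properties of the UMEM and are shared by any socket bound to it,
	// while the RX and TX rings configured below belong to this socket alone.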
	// Set the number of descriptors in the RX queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_RX_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register RX queue: %v", err)
	}
	// Set the number of descriptors in the TX queue.
	if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_TX_RING, int(opts.NDescriptors)); err != nil {
		return nil, fmt.Errorf("failed to register TX queue: %v", err)
	}

	// Get offset information for the queues. Offsets indicate where, once
	// we mmap space for each queue, values in the queue are. They give
	// offsets for the shared pointers, a shared flags value, and the
	// beginning of the ring of descriptors.
	off, err := getOffsets(sockfd)
	if err != nil {
		return nil, fmt.Errorf("failed to get offsets: %v", err)
	}

	// Allocate space for the fill queue.
	fillQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Fr.Desc+uint64(opts.NDescriptors)*sizeOfFillQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_UMEM_PGOFF_FILL_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap fill queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(fillQueueMem)
	})
	// Set up the fillQueue with offsets into allocated memory.
	cb.Fill = FillQueue{
		mem:            fillQueueMem,
		mask:           opts.NDescriptors - 1,
		cachedConsumer: opts.NDescriptors,
	}
	cb.Fill.init(off, opts)

	// Allocate space for the completion queue.
	completionQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Cr.Desc+uint64(opts.NDescriptors)*sizeOfCompletionQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_UMEM_PGOFF_COMPLETION_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap completion queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(completionQueueMem)
	})
	// Set up the completionQueue with offsets into allocated memory.
	cb.Completion = CompletionQueue{
		mem:  completionQueueMem,
		mask: opts.NDescriptors - 1,
	}
	cb.Completion.init(off, opts)

	// Allocate space for the RX queue.
	rxQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Rx.Desc+uint64(opts.NDescriptors)*sizeOfRXQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_PGOFF_RX_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap RX queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(rxQueueMem)
	})
	// Set up the rxQueue with offsets into allocated memory.
	cb.RX = RXQueue{
		mem:  rxQueueMem,
		mask: opts.NDescriptors - 1,
	}
	cb.RX.init(off, opts)

	// Allocate space for the TX queue.
	txQueueMem, err := memutil.MapSlice(
		0,
		uintptr(off.Tx.Desc+uint64(opts.NDescriptors)*sizeOfTXQueueDesc()),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED|unix.MAP_POPULATE,
		uintptr(sockfd),
		unix.XDP_PGOFF_TX_RING,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to mmap tx queue: %v", err)
	}
	cleanup.Add(func() {
		memutil.UnmapSlice(txQueueMem)
	})
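	// Like the fill queue, the TX queue is a producer ring, so its
	// cachedConsumer starts at NDescriptors: keeping a cached copy of the
	// kernel's consumer pointer, offset by the ring size, lets the producer
	// compute free slots without re-reading shared memory on every enqueue.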
	// Set up the txQueue with offsets into allocated memory.
	cb.TX = TXQueue{
		sockfd:         uint32(sockfd),
		mem:            txQueueMem,
		mask:           opts.NDescriptors - 1,
		cachedConsumer: opts.NDescriptors,
	}
	cb.TX.init(off, opts)

	// In some cases we don't call bind, as we're not in the netns with the
	// device. In those cases, another process with the same socket will
	// bind for us.
	if opts.Bind {
		if err := Bind(sockfd, ifaceIdx, queueID, opts.UseNeedWakeup); err != nil {
			return nil, fmt.Errorf("failed to bind to interface %d: %v", ifaceIdx, err)
		}
	}

	cleanup.Release()
	return &cb, nil
}

// Bind binds a socket to a particular network interface and queue.
func Bind(sockfd int, ifindex, queueID uint32, useNeedWakeup bool) error {
	var flags uint16
	if useNeedWakeup {
		flags |= unix.XDP_USE_NEED_WAKEUP
	}
	addr := unix.SockaddrXDP{
		// XDP_USE_NEED_WAKEUP lets the driver sleep if there is no
		// work to do. It will need to be woken by poll. It is expected
		// that this improves performance by preventing the driver from
		// burning cycles.
		//
		// By not setting either XDP_COPY or XDP_ZEROCOPY, we instruct
		// the kernel to use zero-copy if available and then fall back
		// to copy mode.
		Flags:   flags,
		Ifindex: ifindex,
		// AF_XDP sockets are per device RX queue, although multiple
		// sockets on multiple queues (or devices) can share a single
		// UMEM.
		QueueID: queueID,
		// We're not using shared mode, so the value here is irrelevant.
		SharedUmemFD: 0,
	}
	return unix.Bind(sockfd, &addr)
}
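// What follows is a minimal usage sketch, not part of this package's API:
// exampleSetup is hypothetical, and it assumes the caller has already
// resolved the device's interface index (e.g. via net.InterfaceByName) and
// wants RX queue 0. After setup, the fill queue must still be populated with
// free UMEM frames before the kernel can deliver packets; see FillQueue.
func exampleSetup(ifaceIdx uint32) (*ControlBlock, error) {
	opts := DefaultOpts()
	// Bind in this process. Set Bind to false if another process sharing the
	// socket will call Bind from the device's network namespace instead.
	opts.Bind = true
	opts.UseNeedWakeup = true
	cb, err := New(ifaceIdx, 0 /* queue ID */, opts)
	if err != nil {
		return nil, fmt.Errorf("AF_XDP setup failed: %v", err)
	}
	return cb, nil
}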