gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/link/xdp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 // Package xdp provides link layer endpoints backed by AF_XDP sockets. 19 package xdp 20 21 import ( 22 "fmt" 23 24 "golang.org/x/sys/unix" 25 "gvisor.dev/gvisor/pkg/buffer" 26 "gvisor.dev/gvisor/pkg/sync" 27 "gvisor.dev/gvisor/pkg/tcpip" 28 "gvisor.dev/gvisor/pkg/tcpip/header" 29 "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" 30 "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" 31 "gvisor.dev/gvisor/pkg/tcpip/link/stopfd" 32 "gvisor.dev/gvisor/pkg/tcpip/stack" 33 "gvisor.dev/gvisor/pkg/xdp" 34 ) 35 36 // TODO(b/240191988): Turn off GSO, GRO, and LRO. Limit veth MTU to 1500. 37 38 // MTU is sized to ensure packets fit inside a 2048 byte XDP frame. 39 const MTU = 1500 40 41 var _ stack.LinkEndpoint = (*endpoint)(nil) 42 43 type endpoint struct { 44 // fd is the underlying AF_XDP socket. 45 fd int 46 47 // addr is the address of the endpoint. 48 addr tcpip.LinkAddress 49 50 // caps holds the endpoint capabilities. 51 caps stack.LinkEndpointCapabilities 52 53 // closed is a function to be called when the FD's peer (if any) closes 54 // its end of the communication pipe. 55 closed func(tcpip.Error) 56 57 mu sync.RWMutex 58 // +checkloks:mu 59 networkDispatcher stack.NetworkDispatcher 60 61 // wg keeps track of running goroutines. 62 wg sync.WaitGroup 63 64 // control is used to control the AF_XDP socket. 65 control *xdp.ControlBlock 66 67 // stopFD is used to stop the dispatch loop. 68 stopFD stopfd.StopFD 69 } 70 71 // Options specify the details about the fd-based endpoint to be created. 72 type Options struct { 73 // FD is used to read/write packets. 74 FD int 75 76 // ClosedFunc is a function to be called when an endpoint's peer (if 77 // any) closes its end of the communication pipe. 78 ClosedFunc func(tcpip.Error) 79 80 // Address is the link address for this endpoint. 81 Address tcpip.LinkAddress 82 83 // SaveRestore if true, indicates that this NIC capability set should 84 // include CapabilitySaveRestore 85 SaveRestore bool 86 87 // DisconnectOk if true, indicates that this NIC capability set should 88 // include CapabilityDisconnectOk. 89 DisconnectOk bool 90 91 // TXChecksumOffload if true, indicates that this endpoints capability 92 // set should include CapabilityTXChecksumOffload. 93 TXChecksumOffload bool 94 95 // RXChecksumOffload if true, indicates that this endpoints capability 96 // set should include CapabilityRXChecksumOffload. 97 RXChecksumOffload bool 98 99 // InterfaceIndex is the interface index of the underlying device. 100 InterfaceIndex int 101 102 // Bind is true when we're responsible for binding the AF_XDP socket to 103 // a device. When false, another process is expected to bind for us. 104 Bind bool 105 106 // GRO enables generic receive offload. 107 GRO bool 108 } 109 110 // New creates a new endpoint from an AF_XDP socket. 111 func New(opts *Options) (stack.LinkEndpoint, error) { 112 caps := stack.CapabilityResolutionRequired 113 if opts.RXChecksumOffload { 114 caps |= stack.CapabilityRXChecksumOffload 115 } 116 117 if opts.TXChecksumOffload { 118 caps |= stack.CapabilityTXChecksumOffload 119 } 120 121 if opts.SaveRestore { 122 caps |= stack.CapabilitySaveRestore 123 } 124 125 if opts.DisconnectOk { 126 caps |= stack.CapabilityDisconnectOk 127 } 128 129 if err := unix.SetNonblock(opts.FD, true); err != nil { 130 return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", opts.FD, err) 131 } 132 133 ep := &endpoint{ 134 fd: opts.FD, 135 caps: caps, 136 closed: opts.ClosedFunc, 137 addr: opts.Address, 138 } 139 140 stopFD, err := stopfd.New() 141 if err != nil { 142 return nil, err 143 } 144 ep.stopFD = stopFD 145 146 // Use a 2MB UMEM to match the PACKET_MMAP dispatcher. There will be 147 // 1024 UMEM frames, and each queue will have 512 descriptors. Having 148 // fewer descriptors than frames prevents RX and TX from starving each 149 // other. 150 // TODO(b/240191988): Consider different numbers of descriptors for 151 // different queues. 152 const ( 153 frameSize = 2048 154 umemSize = 1 << 21 155 nFrames = umemSize / frameSize 156 ) 157 xdpOpts := xdp.Opts{ 158 NFrames: nFrames, 159 FrameSize: frameSize, 160 NDescriptors: nFrames / 2, 161 Bind: opts.Bind, 162 } 163 ep.control, err = xdp.NewFromSocket(opts.FD, uint32(opts.InterfaceIndex), 0 /* queueID */, xdpOpts) 164 if err != nil { 165 return nil, fmt.Errorf("failed to create AF_XDP dispatcher: %v", err) 166 } 167 168 ep.control.UMEM.Lock() 169 defer ep.control.UMEM.Unlock() 170 171 ep.control.Fill.FillAll(&ep.control.UMEM) 172 173 return ep, nil 174 } 175 176 // Attach launches the goroutine that reads packets from the file descriptor and 177 // dispatches them via the provided dispatcher. If one is already attached, 178 // then nothing happens. 179 // 180 // Attach implements stack.LinkEndpoint.Attach. 181 func (ep *endpoint) Attach(networkDispatcher stack.NetworkDispatcher) { 182 ep.mu.Lock() 183 defer ep.mu.Unlock() 184 // nil means the NIC is being removed. 185 if networkDispatcher == nil && ep.IsAttached() { 186 ep.stopFD.Stop() 187 ep.Wait() 188 ep.networkDispatcher = nil 189 return 190 } 191 if networkDispatcher != nil && ep.networkDispatcher == nil { 192 ep.networkDispatcher = networkDispatcher 193 // Link endpoints are not savable. When transportation endpoints are 194 // saved, they stop sending outgoing packets and all incoming packets 195 // are rejected. 196 ep.wg.Add(1) 197 go func() { // S/R-SAFE: See above. 198 defer ep.wg.Done() 199 for { 200 cont, err := ep.dispatch() 201 if err != nil || !cont { 202 if ep.closed != nil { 203 ep.closed(err) 204 } 205 return 206 } 207 } 208 }() 209 } 210 } 211 212 // IsAttached implements stack.LinkEndpoint.IsAttached. 213 func (ep *endpoint) IsAttached() bool { 214 ep.mu.RLock() 215 defer ep.mu.RUnlock() 216 return ep.networkDispatcher != nil 217 } 218 219 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 220 // during construction. 221 func (ep *endpoint) MTU() uint32 { 222 return MTU 223 } 224 225 // Capabilities implements stack.LinkEndpoint.Capabilities. 226 func (ep *endpoint) Capabilities() stack.LinkEndpointCapabilities { 227 return ep.caps 228 } 229 230 // MaxHeaderLength returns the maximum size of the link-layer header. 231 func (ep *endpoint) MaxHeaderLength() uint16 { 232 return uint16(header.EthernetMinimumSize) 233 } 234 235 // LinkAddress returns the link address of this endpoint. 236 func (ep *endpoint) LinkAddress() tcpip.LinkAddress { 237 return ep.addr 238 } 239 240 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 241 // reading from its FD. 242 func (ep *endpoint) Wait() { 243 ep.wg.Wait() 244 } 245 246 // AddHeader implements stack.LinkEndpoint.AddHeader. 247 func (ep *endpoint) AddHeader(pkt *stack.PacketBuffer) { 248 // Add ethernet header if needed. 249 eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) 250 eth.Encode(&header.EthernetFields{ 251 SrcAddr: pkt.EgressRoute.LocalLinkAddress, 252 DstAddr: pkt.EgressRoute.RemoteLinkAddress, 253 Type: pkt.NetworkProtocolNumber, 254 }) 255 } 256 257 // ParseHeader implements stack.LinkEndpoint.ParseHeader. 258 func (ep *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { 259 _, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) 260 return ok 261 } 262 263 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. 264 func (ep *endpoint) ARPHardwareType() header.ARPHardwareType { 265 return header.ARPHardwareEther 266 } 267 268 // WritePackets writes outbound packets to the underlying file descriptors. If 269 // one is not currently writable, the packet is dropped. 270 // 271 // Each packet in pkts should have the following fields populated: 272 // - pkt.EgressRoute 273 // - pkt.NetworkProtocolNumber 274 // 275 // The following should not be populated, as GSO is not supported with XDP. 276 // - pkt.GSOOptions 277 func (ep *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { 278 // We expect to be called via fifo, which imposes a limit of 279 // fifo.BatchSize. 280 var preallocatedBatch [fifo.BatchSize]unix.XDPDesc 281 batch := preallocatedBatch[:0] 282 283 ep.control.UMEM.Lock() 284 285 ep.control.Completion.FreeAll(&ep.control.UMEM) 286 287 // Reserve TX queue descriptors and umem buffers 288 nReserved, index := ep.control.TX.Reserve(&ep.control.UMEM, uint32(pkts.Len())) 289 if nReserved == 0 { 290 ep.control.UMEM.Unlock() 291 return 0, &tcpip.ErrNoBufferSpace{} 292 } 293 294 // Allocate UMEM space. In order to release the UMEM lock as soon as 295 // possible we allocate up-front. 296 for _, pkt := range pkts.AsSlice() { 297 batch = append(batch, unix.XDPDesc{ 298 Addr: ep.control.UMEM.AllocFrame(), 299 Len: uint32(pkt.Size()), 300 }) 301 } 302 303 for i, pkt := range pkts.AsSlice() { 304 // Copy packets into UMEM frame. 305 frame := ep.control.UMEM.Get(batch[i]) 306 offset := 0 307 var view *buffer.View 308 views, pktOffset := pkt.AsViewList() 309 for view = views.Front(); view != nil && pktOffset >= view.Size(); view = view.Next() { 310 pktOffset -= view.Size() 311 } 312 offset += copy(frame[offset:], view.AsSlice()[pktOffset:]) 313 for view = view.Next(); view != nil; view = view.Next() { 314 offset += copy(frame[offset:], view.AsSlice()) 315 } 316 ep.control.TX.Set(index+uint32(i), batch[i]) 317 } 318 319 // Notify the kernel that there're packets to write. 320 ep.control.TX.Notify() 321 322 // TODO(b/240191988): Explore more fine-grained locking. We shouldn't 323 // need to hold the UMEM lock for the whole duration of packet copying. 324 ep.control.UMEM.Unlock() 325 326 return pkts.Len(), nil 327 } 328 329 func (ep *endpoint) dispatch() (bool, tcpip.Error) { 330 var views []*buffer.View 331 332 for { 333 stopped, errno := rawfile.BlockingPollUntilStopped(ep.stopFD.EFD, ep.fd, unix.POLLIN|unix.POLLERR) 334 if errno != 0 { 335 if errno == unix.EINTR { 336 continue 337 } 338 return !stopped, rawfile.TranslateErrno(errno) 339 } 340 if stopped { 341 return true, nil 342 } 343 344 // Avoid the cost of the poll syscall if possible by peeking 345 // until there are no packets left. 346 for { 347 // We can receive multiple packets at once. 348 nReceived, rxIndex := ep.control.RX.Peek() 349 350 if nReceived == 0 { 351 break 352 } 353 354 // Reuse views to avoid allocating. 355 views = views[:0] 356 357 // Populate views quickly so that we can release frames 358 // back to the kernel. 359 ep.control.UMEM.Lock() 360 for i := uint32(0); i < nReceived; i++ { 361 // Copy packet bytes into a view and free up the 362 // buffer. 363 descriptor := ep.control.RX.Get(rxIndex + i) 364 data := ep.control.UMEM.Get(descriptor) 365 view := buffer.NewView(len(data)) 366 view.Write(data) 367 views = append(views, view) 368 ep.control.UMEM.FreeFrame(descriptor.Addr) 369 } 370 ep.control.Fill.FillAll(&ep.control.UMEM) 371 ep.control.UMEM.Unlock() 372 373 // Process each packet. 374 ep.mu.RLock() 375 d := ep.networkDispatcher 376 ep.mu.RUnlock() 377 for i := uint32(0); i < nReceived; i++ { 378 view := views[i] 379 data := view.AsSlice() 380 381 netProto := header.Ethernet(data).Type() 382 383 // Wrap the packet in a PacketBuffer and send it up the stack. 384 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 385 Payload: buffer.MakeWithView(view), 386 }) 387 // AF_XDP packets always have a link header. 388 if !ep.ParseHeader(pkt) { 389 panic("ParseHeader(_) must succeed") 390 } 391 d.DeliverNetworkPacket(netProto, pkt) 392 pkt.DecRef() 393 } 394 // Tell the kernel that we're done with these 395 // descriptors in the RX queue. 396 ep.control.RX.Release(nReceived) 397 } 398 } 399 }