gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/link/fdbased/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 // Package fdbased provides the implementation of data-link layer endpoints 19 // backed by boundary-preserving file descriptors (e.g., TUN devices, 20 // seqpacket/datagram sockets). 21 // 22 // FD based endpoints can be used in the networking stack by calling New() to 23 // create a new endpoint, and then passing it as an argument to 24 // Stack.CreateNIC(). 25 // 26 // FD based endpoints can use more than one file descriptor to read incoming 27 // packets. If there are more than one FDs specified and the underlying FD is an 28 // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the 29 // host kernel will consistently hash the packets to the sockets. This ensures 30 // that packets for the same TCP streams are not reordered. 31 // 32 // Similarly if more than one FD's are specified where the underlying FD is not 33 // AF_PACKET then it's the caller's responsibility to ensure that all inbound 34 // packets on the descriptors are consistently 5 tuple hashed to one of the 35 // descriptors to prevent TCP reordering. 36 // 37 // Since netstack today does not compute 5 tuple hashes for outgoing packets we 38 // only use the first FD to write outbound packets. Once 5 tuple hashes for 39 // all outbound packets are available we will make use of all underlying FD's to 40 // write outbound packets. 41 package fdbased 42 43 import ( 44 "fmt" 45 "runtime" 46 47 "golang.org/x/sys/unix" 48 "gvisor.dev/gvisor/pkg/atomicbitops" 49 "gvisor.dev/gvisor/pkg/buffer" 50 "gvisor.dev/gvisor/pkg/sync" 51 "gvisor.dev/gvisor/pkg/tcpip" 52 "gvisor.dev/gvisor/pkg/tcpip/header" 53 "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" 54 "gvisor.dev/gvisor/pkg/tcpip/stack" 55 ) 56 57 // linkDispatcher reads packets from the link FD and dispatches them to the 58 // NetworkDispatcher. 59 type linkDispatcher interface { 60 Stop() 61 dispatch() (bool, tcpip.Error) 62 release() 63 } 64 65 // PacketDispatchMode are the various supported methods of receiving and 66 // dispatching packets from the underlying FD. 67 type PacketDispatchMode int 68 69 // BatchSize is the number of packets to write in each syscall. It is 47 70 // because when GVisorGSO is in use then a single 65KB TCP segment can get 71 // split into 46 segments of 1420 bytes and a single 216 byte segment. 72 const BatchSize = 47 73 74 const ( 75 // Readv is the default dispatch mode and is the least performant of the 76 // dispatch options but the one that is supported by all underlying FD 77 // types. 78 Readv PacketDispatchMode = iota 79 // RecvMMsg enables use of recvmmsg() syscall instead of readv() to 80 // read inbound packets. This reduces # of syscalls needed to process 81 // packets. 82 // 83 // NOTE: recvmmsg() is only supported for sockets, so if the underlying 84 // FD is not a socket then the code will still fall back to the readv() 85 // path. 86 RecvMMsg 87 // PacketMMap enables use of PACKET_RX_RING to receive packets from the 88 // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The 89 // primary use-case for this is runsc which uses an AF_PACKET FD to 90 // receive packets from the veth device. 91 PacketMMap 92 ) 93 94 func (p PacketDispatchMode) String() string { 95 switch p { 96 case Readv: 97 return "Readv" 98 case RecvMMsg: 99 return "RecvMMsg" 100 case PacketMMap: 101 return "PacketMMap" 102 default: 103 return fmt.Sprintf("unknown packet dispatch mode '%d'", p) 104 } 105 } 106 107 var _ stack.LinkEndpoint = (*endpoint)(nil) 108 var _ stack.GSOEndpoint = (*endpoint)(nil) 109 110 type fdInfo struct { 111 fd int 112 isSocket bool 113 } 114 115 type endpoint struct { 116 // fds is the set of file descriptors each identifying one inbound/outbound 117 // channel. The endpoint will dispatch from all inbound channels as well as 118 // hash outbound packets to specific channels based on the packet hash. 119 fds []fdInfo 120 121 // mtu (maximum transmission unit) is the maximum size of a packet. 122 mtu uint32 123 124 // hdrSize specifies the link-layer header size. If set to 0, no header 125 // is added/removed; otherwise an ethernet header is used. 126 hdrSize int 127 128 // addr is the address of the endpoint. 129 addr tcpip.LinkAddress 130 131 // caps holds the endpoint capabilities. 132 caps stack.LinkEndpointCapabilities 133 134 // closed is a function to be called when the FD's peer (if any) closes 135 // its end of the communication pipe. 136 closed func(tcpip.Error) 137 138 inboundDispatchers []linkDispatcher 139 140 mu sync.RWMutex 141 // +checklocks:mu 142 dispatcher stack.NetworkDispatcher 143 144 // packetDispatchMode controls the packet dispatcher used by this 145 // endpoint. 146 packetDispatchMode PacketDispatchMode 147 148 // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is 149 // disabled. 150 gsoMaxSize uint32 151 152 // wg keeps track of running goroutines. 153 wg sync.WaitGroup 154 155 // gsoKind is the supported kind of GSO. 156 gsoKind stack.SupportedGSO 157 158 // maxSyscallHeaderBytes has the same meaning as 159 // Options.MaxSyscallHeaderBytes. 160 maxSyscallHeaderBytes uintptr 161 162 // writevMaxIovs is the maximum number of iovecs that may be passed to 163 // rawfile.NonBlockingWriteIovec, as possibly limited by 164 // maxSyscallHeaderBytes. (No analogous limit is defined for 165 // rawfile.NonBlockingSendMMsg, since in that case the maximum number of 166 // iovecs also depends on the number of mmsghdrs. Instead, if sendBatch 167 // encounters a packet whose iovec count is limited by 168 // maxSyscallHeaderBytes, it falls back to writing the packet using writev 169 // via WritePacket.) 170 writevMaxIovs int 171 } 172 173 // Options specify the details about the fd-based endpoint to be created. 174 type Options struct { 175 // FDs is a set of FDs used to read/write packets. 176 FDs []int 177 178 // MTU is the mtu to use for this endpoint. 179 MTU uint32 180 181 // EthernetHeader if true, indicates that the endpoint should read/write 182 // ethernet frames instead of IP packets. 183 EthernetHeader bool 184 185 // ClosedFunc is a function to be called when an endpoint's peer (if 186 // any) closes its end of the communication pipe. 187 ClosedFunc func(tcpip.Error) 188 189 // Address is the link address for this endpoint. Only used if 190 // EthernetHeader is true. 191 Address tcpip.LinkAddress 192 193 // SaveRestore if true, indicates that this NIC capability set should 194 // include CapabilitySaveRestore 195 SaveRestore bool 196 197 // DisconnectOk if true, indicates that this NIC capability set should 198 // include CapabilityDisconnectOk. 199 DisconnectOk bool 200 201 // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is 202 // disabled. 203 GSOMaxSize uint32 204 205 // GVisorGSOEnabled indicates whether Gvisor GSO is enabled or not. 206 GVisorGSOEnabled bool 207 208 // PacketDispatchMode specifies the type of inbound dispatcher to be 209 // used for this endpoint. 210 PacketDispatchMode PacketDispatchMode 211 212 // TXChecksumOffload if true, indicates that this endpoints capability 213 // set should include CapabilityTXChecksumOffload. 214 TXChecksumOffload bool 215 216 // RXChecksumOffload if true, indicates that this endpoints capability 217 // set should include CapabilityRXChecksumOffload. 218 RXChecksumOffload bool 219 220 // If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes 221 // of struct iovec, msghdr, and mmsghdr that may be passed by each host 222 // system call. 223 MaxSyscallHeaderBytes int 224 225 // InterfaceIndex is the interface index of the underlying device. 226 InterfaceIndex int 227 228 // GRO enables generic receive offload. 229 GRO bool 230 231 // ProcessorsPerChannel is the number of goroutines used to handle packets 232 // from each FD. 233 ProcessorsPerChannel int 234 } 235 236 // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT 237 // support in the host kernel. This allows us to use multiple FD's to receive 238 // from the same underlying NIC. The fanoutID needs to be the same for a given 239 // set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT 240 // option for an FD with a fanoutID already in use by another FD for a different 241 // NIC will return an EINVAL. 242 // 243 // Since fanoutID must be unique within the network namespace, we start with 244 // the PID to avoid collisions. The only way to be sure of avoiding collisions 245 // is to run in a new network namespace. 246 var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid())) 247 248 // New creates a new fd-based endpoint. 249 // 250 // Makes fd non-blocking, but does not take ownership of fd, which must remain 251 // open for the lifetime of the returned endpoint (until after the endpoint has 252 // stopped being using and Wait returns). 253 func New(opts *Options) (stack.LinkEndpoint, error) { 254 caps := stack.LinkEndpointCapabilities(0) 255 if opts.RXChecksumOffload { 256 caps |= stack.CapabilityRXChecksumOffload 257 } 258 259 if opts.TXChecksumOffload { 260 caps |= stack.CapabilityTXChecksumOffload 261 } 262 263 hdrSize := 0 264 if opts.EthernetHeader { 265 hdrSize = header.EthernetMinimumSize 266 caps |= stack.CapabilityResolutionRequired 267 } 268 269 if opts.SaveRestore { 270 caps |= stack.CapabilitySaveRestore 271 } 272 273 if opts.DisconnectOk { 274 caps |= stack.CapabilityDisconnectOk 275 } 276 277 if len(opts.FDs) == 0 { 278 return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") 279 } 280 281 if opts.MaxSyscallHeaderBytes < 0 { 282 return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative") 283 } 284 285 e := &endpoint{ 286 mtu: opts.MTU, 287 caps: caps, 288 closed: opts.ClosedFunc, 289 addr: opts.Address, 290 hdrSize: hdrSize, 291 packetDispatchMode: opts.PacketDispatchMode, 292 maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes), 293 writevMaxIovs: rawfile.MaxIovs, 294 } 295 if e.maxSyscallHeaderBytes != 0 { 296 if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs { 297 e.writevMaxIovs = max 298 } 299 } 300 301 // Increment fanoutID to ensure that we don't re-use the same fanoutID 302 // for the next endpoint. 303 fid := fanoutID.Add(1) 304 305 // Create per channel dispatchers. 306 for _, fd := range opts.FDs { 307 if err := unix.SetNonblock(fd, true); err != nil { 308 return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err) 309 } 310 311 isSocket, err := isSocketFD(fd) 312 if err != nil { 313 return nil, err 314 } 315 e.fds = append(e.fds, fdInfo{fd: fd, isSocket: isSocket}) 316 if isSocket { 317 if opts.GSOMaxSize != 0 { 318 if opts.GVisorGSOEnabled { 319 e.gsoKind = stack.GVisorGSOSupported 320 } else { 321 e.gsoKind = stack.HostGSOSupported 322 } 323 e.gsoMaxSize = opts.GSOMaxSize 324 } 325 } 326 if opts.ProcessorsPerChannel == 0 { 327 opts.ProcessorsPerChannel = max(1, runtime.GOMAXPROCS(0)/len(opts.FDs)) 328 } 329 330 inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid, opts) 331 if err != nil { 332 return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) 333 } 334 e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) 335 } 336 337 return e, nil 338 } 339 340 func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32, opts *Options) (linkDispatcher, error) { 341 // By default use the readv() dispatcher as it works with all kinds of 342 // FDs (tap/tun/unix domain sockets and af_packet). 343 inboundDispatcher, err := newReadVDispatcher(fd, e, opts) 344 if err != nil { 345 return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) 346 } 347 348 if isSocket { 349 sa, err := unix.Getsockname(fd) 350 if err != nil { 351 return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) 352 } 353 switch sa.(type) { 354 case *unix.SockaddrLinklayer: 355 // Enable PACKET_FANOUT mode if the underlying socket is of type 356 // AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will 357 // prevent gvisor from receiving fragmented packets and the host does the 358 // reassembly on our behalf before delivering the fragments. This makes it 359 // hard to test fragmentation reassembly code in Netstack. 360 // 361 // See: include/uapi/linux/if_packet.h (struct fanout_args). 362 // 363 // NOTE: We are using SetSockOptInt here even though the underlying 364 // option is actually a struct. The code follows the example in the 365 // kernel documentation as described at the link below: 366 // 367 // See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt 368 // 369 // This works out because the actual implementation for the option zero 370 // initializes the structure and will initialize the max_members field 371 // to a proper value if zero. 372 // 373 // See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881 374 const fanoutType = unix.PACKET_FANOUT_HASH 375 fanoutArg := (int(fID) & 0xffff) | fanoutType<<16 376 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { 377 return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) 378 } 379 } 380 381 switch e.packetDispatchMode { 382 case PacketMMap: 383 inboundDispatcher, err = newPacketMMapDispatcher(fd, e, opts) 384 if err != nil { 385 return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) 386 } 387 case RecvMMsg: 388 // If the provided FD is a socket then we optimize 389 // packet reads by using recvmmsg() instead of read() to 390 // read packets in a batch. 391 inboundDispatcher, err = newRecvMMsgDispatcher(fd, e, opts) 392 if err != nil { 393 return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) 394 } 395 case Readv: 396 default: 397 return nil, fmt.Errorf("unknown dispatch mode %d", e.packetDispatchMode) 398 } 399 } 400 return inboundDispatcher, nil 401 } 402 403 func isSocketFD(fd int) (bool, error) { 404 var stat unix.Stat_t 405 if err := unix.Fstat(fd, &stat); err != nil { 406 return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err) 407 } 408 return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil 409 } 410 411 // Attach launches the goroutine that reads packets from the file descriptor and 412 // dispatches them via the provided dispatcher. If one is already attached, 413 // then nothing happens. 414 // 415 // Attach implements stack.LinkEndpoint.Attach. 416 func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { 417 e.mu.Lock() 418 defer e.mu.Unlock() 419 420 // nil means the NIC is being removed. 421 if dispatcher == nil && e.dispatcher != nil { 422 for _, dispatcher := range e.inboundDispatchers { 423 dispatcher.Stop() 424 } 425 e.Wait() 426 e.dispatcher = nil 427 return 428 } 429 if dispatcher != nil && e.dispatcher == nil { 430 e.dispatcher = dispatcher 431 // Link endpoints are not savable. When transportation endpoints are 432 // saved, they stop sending outgoing packets and all incoming packets 433 // are rejected. 434 for i := range e.inboundDispatchers { 435 e.wg.Add(1) 436 go func(i int) { // S/R-SAFE: See above. 437 e.dispatchLoop(e.inboundDispatchers[i]) 438 e.wg.Done() 439 }(i) 440 } 441 } 442 } 443 444 // IsAttached implements stack.LinkEndpoint.IsAttached. 445 func (e *endpoint) IsAttached() bool { 446 e.mu.RLock() 447 defer e.mu.RUnlock() 448 return e.dispatcher != nil 449 } 450 451 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 452 // during construction. 453 func (e *endpoint) MTU() uint32 { 454 return e.mtu 455 } 456 457 // Capabilities implements stack.LinkEndpoint.Capabilities. 458 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { 459 return e.caps 460 } 461 462 // MaxHeaderLength returns the maximum size of the link-layer header. 463 func (e *endpoint) MaxHeaderLength() uint16 { 464 return uint16(e.hdrSize) 465 } 466 467 // LinkAddress returns the link address of this endpoint. 468 func (e *endpoint) LinkAddress() tcpip.LinkAddress { 469 return e.addr 470 } 471 472 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 473 // reading from its FD. 474 func (e *endpoint) Wait() { 475 e.wg.Wait() 476 } 477 478 // virtioNetHdr is declared in linux/virtio_net.h. 479 type virtioNetHdr struct { 480 flags uint8 481 gsoType uint8 482 hdrLen uint16 483 gsoSize uint16 484 csumStart uint16 485 csumOffset uint16 486 } 487 488 // marshal serializes h to a newly-allocated byte slice, in little-endian byte 489 // order. 490 // 491 // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used 492 // for general serialization. This makes it difficult to use go-marshal for 493 // virtio types, as go-marshal implicitly uses the native byte ordering. 494 func (h *virtioNetHdr) marshal() []byte { 495 buf := [virtioNetHdrSize]byte{ 496 0: byte(h.flags), 497 1: byte(h.gsoType), 498 499 // Manually lay out the fields in little-endian byte order. Little endian => 500 // least significant bit goes to the lower address. 501 502 2: byte(h.hdrLen), 503 3: byte(h.hdrLen >> 8), 504 505 4: byte(h.gsoSize), 506 5: byte(h.gsoSize >> 8), 507 508 6: byte(h.csumStart), 509 7: byte(h.csumStart >> 8), 510 511 8: byte(h.csumOffset), 512 9: byte(h.csumOffset >> 8), 513 } 514 return buf[:] 515 } 516 517 // These constants are declared in linux/virtio_net.h. 518 const ( 519 _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 520 521 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 522 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 523 ) 524 525 // AddHeader implements stack.LinkEndpoint.AddHeader. 526 func (e *endpoint) AddHeader(pkt *stack.PacketBuffer) { 527 if e.hdrSize > 0 { 528 // Add ethernet header if needed. 529 eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) 530 eth.Encode(&header.EthernetFields{ 531 SrcAddr: pkt.EgressRoute.LocalLinkAddress, 532 DstAddr: pkt.EgressRoute.RemoteLinkAddress, 533 Type: pkt.NetworkProtocolNumber, 534 }) 535 } 536 } 537 538 func (e *endpoint) parseHeader(pkt *stack.PacketBuffer) bool { 539 _, ok := pkt.LinkHeader().Consume(e.hdrSize) 540 return ok 541 542 } 543 544 // ParseHeader implements stack.LinkEndpoint.ParseHeader. 545 func (e *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { 546 if e.hdrSize > 0 { 547 return e.parseHeader(pkt) 548 } 549 return true 550 } 551 552 // writePacket writes outbound packets to the file descriptor. If it is not 553 // currently writable, the packet is dropped. 554 func (e *endpoint) writePacket(pkt *stack.PacketBuffer) tcpip.Error { 555 fdInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 556 fd := fdInfo.fd 557 var vnetHdrBuf []byte 558 if e.gsoKind == stack.HostGSOSupported { 559 vnetHdr := virtioNetHdr{} 560 if pkt.GSOOptions.Type != stack.GSONone { 561 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 562 if pkt.GSOOptions.NeedsCsum { 563 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 564 vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen 565 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 566 } 567 if uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 568 switch pkt.GSOOptions.Type { 569 case stack.GSOTCPv4: 570 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 571 case stack.GSOTCPv6: 572 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 573 default: 574 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 575 } 576 vnetHdr.gsoSize = pkt.GSOOptions.MSS 577 } 578 } 579 vnetHdrBuf = vnetHdr.marshal() 580 } 581 582 views := pkt.AsSlices() 583 numIovecs := len(views) 584 if len(vnetHdrBuf) != 0 { 585 numIovecs++ 586 } 587 if numIovecs > e.writevMaxIovs { 588 numIovecs = e.writevMaxIovs 589 } 590 591 // Allocate small iovec arrays on the stack. 592 var iovecsArr [8]unix.Iovec 593 iovecs := iovecsArr[:0] 594 if numIovecs > len(iovecsArr) { 595 iovecs = make([]unix.Iovec, 0, numIovecs) 596 } 597 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 598 for _, v := range views { 599 iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) 600 } 601 return rawfile.NonBlockingWriteIovec(fd, iovecs) 602 } 603 604 func (e *endpoint) sendBatch(batchFDInfo fdInfo, pkts []*stack.PacketBuffer) (int, tcpip.Error) { 605 // Degrade to writePacket if underlying fd is not a socket. 606 if !batchFDInfo.isSocket { 607 var written int 608 var err tcpip.Error 609 for written < len(pkts) { 610 if err = e.writePacket(pkts[written]); err != nil { 611 break 612 } 613 written++ 614 } 615 return written, err 616 } 617 618 // Send a batch of packets through batchFD. 619 batchFD := batchFDInfo.fd 620 mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts)) 621 packets := 0 622 for packets < len(pkts) { 623 mmsgHdrs := mmsgHdrsStorage 624 batch := pkts[packets:] 625 syscallHeaderBytes := uintptr(0) 626 for _, pkt := range batch { 627 var vnetHdrBuf []byte 628 if e.gsoKind == stack.HostGSOSupported { 629 vnetHdr := virtioNetHdr{} 630 if pkt.GSOOptions.Type != stack.GSONone { 631 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 632 if pkt.GSOOptions.NeedsCsum { 633 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 634 vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen 635 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 636 } 637 if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 638 switch pkt.GSOOptions.Type { 639 case stack.GSOTCPv4: 640 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 641 case stack.GSOTCPv6: 642 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 643 default: 644 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 645 } 646 vnetHdr.gsoSize = pkt.GSOOptions.MSS 647 } 648 } 649 vnetHdrBuf = vnetHdr.marshal() 650 } 651 652 views, offset := pkt.AsViewList() 653 var skipped int 654 var view *buffer.View 655 for view = views.Front(); view != nil && offset >= view.Size(); view = view.Next() { 656 offset -= view.Size() 657 skipped++ 658 } 659 660 // We've made it to the usable views. 661 numIovecs := views.Len() - skipped 662 if len(vnetHdrBuf) != 0 { 663 numIovecs++ 664 } 665 if numIovecs > rawfile.MaxIovs { 666 numIovecs = rawfile.MaxIovs 667 } 668 if e.maxSyscallHeaderBytes != 0 { 669 syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec 670 if syscallHeaderBytes > e.maxSyscallHeaderBytes { 671 // We can't fit this packet into this call to sendmmsg(). 672 // We could potentially do so if we reduced numIovecs 673 // further, but this might incur considerable extra 674 // copying. Leave it to the next batch instead. 675 break 676 } 677 } 678 679 // We can't easily allocate iovec arrays on the stack here since 680 // they will escape this loop iteration via mmsgHdrs. 681 iovecs := make([]unix.Iovec, 0, numIovecs) 682 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 683 // At most one slice has a non-zero offset. 684 iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice()[offset:], numIovecs) 685 for view = view.Next(); view != nil; view = view.Next() { 686 iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice(), numIovecs) 687 } 688 689 var mmsgHdr rawfile.MMsgHdr 690 mmsgHdr.Msg.Iov = &iovecs[0] 691 mmsgHdr.Msg.SetIovlen(len(iovecs)) 692 mmsgHdrs = append(mmsgHdrs, mmsgHdr) 693 } 694 695 if len(mmsgHdrs) == 0 { 696 // We can't fit batch[0] into a mmsghdr while staying under 697 // e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the 698 // mmsghdr (by using writev) and re-buffer iovecs more aggressively 699 // if necessary (by using e.writevMaxIovs instead of 700 // rawfile.MaxIovs). 701 pkt := batch[0] 702 if err := e.writePacket(pkt); err != nil { 703 return packets, err 704 } 705 packets++ 706 } else { 707 for len(mmsgHdrs) > 0 { 708 sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) 709 if err != nil { 710 return packets, err 711 } 712 packets += sent 713 mmsgHdrs = mmsgHdrs[sent:] 714 } 715 } 716 } 717 718 return packets, nil 719 } 720 721 // WritePackets writes outbound packets to the underlying file descriptors. If 722 // one is not currently writable, the packet is dropped. 723 // 724 // Being a batch API, each packet in pkts should have the following 725 // fields populated: 726 // - pkt.EgressRoute 727 // - pkt.GSOOptions 728 // - pkt.NetworkProtocolNumber 729 func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { 730 // Preallocate to avoid repeated reallocation as we append to batch. 731 batch := make([]*stack.PacketBuffer, 0, BatchSize) 732 batchFDInfo := fdInfo{fd: -1, isSocket: false} 733 sentPackets := 0 734 for _, pkt := range pkts.AsSlice() { 735 if len(batch) == 0 { 736 batchFDInfo = e.fds[pkt.Hash%uint32(len(e.fds))] 737 } 738 pktFDInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 739 if sendNow := pktFDInfo != batchFDInfo; !sendNow { 740 batch = append(batch, pkt) 741 continue 742 } 743 n, err := e.sendBatch(batchFDInfo, batch) 744 sentPackets += n 745 if err != nil { 746 return sentPackets, err 747 } 748 batch = batch[:0] 749 batch = append(batch, pkt) 750 batchFDInfo = pktFDInfo 751 } 752 753 if len(batch) != 0 { 754 n, err := e.sendBatch(batchFDInfo, batch) 755 sentPackets += n 756 if err != nil { 757 return sentPackets, err 758 } 759 } 760 return sentPackets, nil 761 } 762 763 // InjectOutbound implements stack.InjectableEndpoint.InjectOutbound. 764 func (e *endpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error { 765 return rawfile.NonBlockingWrite(e.fds[0].fd, packet.AsSlice()) 766 } 767 768 // dispatchLoop reads packets from the file descriptor in a loop and dispatches 769 // them to the network stack. 770 func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error { 771 for { 772 cont, err := inboundDispatcher.dispatch() 773 if err != nil || !cont { 774 if e.closed != nil { 775 e.closed(err) 776 } 777 inboundDispatcher.release() 778 return err 779 } 780 } 781 } 782 783 // GSOMaxSize implements stack.GSOEndpoint. 784 func (e *endpoint) GSOMaxSize() uint32 { 785 return e.gsoMaxSize 786 } 787 788 // SupportedGSO implements stack.GSOEndpoint. 789 func (e *endpoint) SupportedGSO() stack.SupportedGSO { 790 return e.gsoKind 791 } 792 793 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. 794 func (e *endpoint) ARPHardwareType() header.ARPHardwareType { 795 if e.hdrSize > 0 { 796 return header.ARPHardwareEther 797 } 798 return header.ARPHardwareNone 799 } 800 801 // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes 802 // to the FD, but does not read from it. All reads come from injected packets. 803 type InjectableEndpoint struct { 804 endpoint 805 806 mu sync.RWMutex 807 // +checklocks:mu 808 dispatcher stack.NetworkDispatcher 809 } 810 811 // Attach saves the stack network-layer dispatcher for use later when packets 812 // are injected. 813 func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { 814 e.mu.Lock() 815 defer e.mu.Unlock() 816 e.dispatcher = dispatcher 817 } 818 819 // InjectInbound injects an inbound packet. If the endpoint is not attached, the 820 // packet is not delivered. 821 func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { 822 e.mu.RLock() 823 d := e.dispatcher 824 e.mu.RUnlock() 825 if d != nil { 826 d.DeliverNetworkPacket(protocol, pkt) 827 } 828 } 829 830 // NewInjectable creates a new fd-based InjectableEndpoint. 831 func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (*InjectableEndpoint, error) { 832 unix.SetNonblock(fd, true) 833 isSocket, err := isSocketFD(fd) 834 if err != nil { 835 return nil, err 836 } 837 838 return &InjectableEndpoint{endpoint: endpoint{ 839 fds: []fdInfo{{fd: fd, isSocket: isSocket}}, 840 mtu: mtu, 841 caps: capabilities, 842 writevMaxIovs: rawfile.MaxIovs, 843 }}, nil 844 }