github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/link/fdbased/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 // Package fdbased provides the implemention of data-link layer endpoints 19 // backed by boundary-preserving file descriptors (e.g., TUN devices, 20 // seqpacket/datagram sockets). 21 // 22 // FD based endpoints can be used in the networking stack by calling New() to 23 // create a new endpoint, and then passing it as an argument to 24 // Stack.CreateNIC(). 25 // 26 // FD based endpoints can use more than one file descriptor to read incoming 27 // packets. If there are more than one FDs specified and the underlying FD is an 28 // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the 29 // host kernel will consistently hash the packets to the sockets. This ensures 30 // that packets for the same TCP streams are not reordered. 31 // 32 // Similarly if more than one FD's are specified where the underlying FD is not 33 // AF_PACKET then it's the caller's responsibility to ensure that all inbound 34 // packets on the descriptors are consistently 5 tuple hashed to one of the 35 // descriptors to prevent TCP reordering. 36 // 37 // Since netstack today does not compute 5 tuple hashes for outgoing packets we 38 // only use the first FD to write outbound packets. Once 5 tuple hashes for 39 // all outbound packets are available we will make use of all underlying FD's to 40 // write outbound packets. 41 package fdbased 42 43 import ( 44 "fmt" 45 46 "golang.org/x/sys/unix" 47 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 48 "github.com/nicocha30/gvisor-ligolo/pkg/buffer" 49 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 50 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip" 51 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header" 52 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/rawfile" 53 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack" 54 ) 55 56 // linkDispatcher reads packets from the link FD and dispatches them to the 57 // NetworkDispatcher. 58 type linkDispatcher interface { 59 Stop() 60 dispatch() (bool, tcpip.Error) 61 release() 62 } 63 64 // PacketDispatchMode are the various supported methods of receiving and 65 // dispatching packets from the underlying FD. 66 type PacketDispatchMode int 67 68 // BatchSize is the number of packets to write in each syscall. It is 47 69 // because when GvisorGSO is in use then a single 65KB TCP segment can get 70 // split into 46 segments of 1420 bytes and a single 216 byte segment. 71 const BatchSize = 47 72 73 const ( 74 // Readv is the default dispatch mode and is the least performant of the 75 // dispatch options but the one that is supported by all underlying FD 76 // types. 77 Readv PacketDispatchMode = iota 78 // RecvMMsg enables use of recvmmsg() syscall instead of readv() to 79 // read inbound packets. This reduces # of syscalls needed to process 80 // packets. 81 // 82 // NOTE: recvmmsg() is only supported for sockets, so if the underlying 83 // FD is not a socket then the code will still fall back to the readv() 84 // path. 85 RecvMMsg 86 // PacketMMap enables use of PACKET_RX_RING to receive packets from the 87 // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The 88 // primary use-case for this is runsc which uses an AF_PACKET FD to 89 // receive packets from the veth device. 90 PacketMMap 91 ) 92 93 func (p PacketDispatchMode) String() string { 94 switch p { 95 case Readv: 96 return "Readv" 97 case RecvMMsg: 98 return "RecvMMsg" 99 case PacketMMap: 100 return "PacketMMap" 101 default: 102 return fmt.Sprintf("unknown packet dispatch mode '%d'", p) 103 } 104 } 105 106 var _ stack.LinkEndpoint = (*endpoint)(nil) 107 var _ stack.GSOEndpoint = (*endpoint)(nil) 108 109 type fdInfo struct { 110 fd int 111 isSocket bool 112 } 113 114 type endpoint struct { 115 // fds is the set of file descriptors each identifying one inbound/outbound 116 // channel. The endpoint will dispatch from all inbound channels as well as 117 // hash outbound packets to specific channels based on the packet hash. 118 fds []fdInfo 119 120 // mtu (maximum transmission unit) is the maximum size of a packet. 121 mtu uint32 122 123 // hdrSize specifies the link-layer header size. If set to 0, no header 124 // is added/removed; otherwise an ethernet header is used. 125 hdrSize int 126 127 // addr is the address of the endpoint. 128 addr tcpip.LinkAddress 129 130 // caps holds the endpoint capabilities. 131 caps stack.LinkEndpointCapabilities 132 133 // closed is a function to be called when the FD's peer (if any) closes 134 // its end of the communication pipe. 135 closed func(tcpip.Error) 136 137 inboundDispatchers []linkDispatcher 138 139 mu sync.RWMutex 140 // +checklocks:mu 141 dispatcher stack.NetworkDispatcher 142 143 // packetDispatchMode controls the packet dispatcher used by this 144 // endpoint. 145 packetDispatchMode PacketDispatchMode 146 147 // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is 148 // disabled. 149 gsoMaxSize uint32 150 151 // wg keeps track of running goroutines. 152 wg sync.WaitGroup 153 154 // gsoKind is the supported kind of GSO. 155 gsoKind stack.SupportedGSO 156 157 // maxSyscallHeaderBytes has the same meaning as 158 // Options.MaxSyscallHeaderBytes. 159 maxSyscallHeaderBytes uintptr 160 161 // writevMaxIovs is the maximum number of iovecs that may be passed to 162 // rawfile.NonBlockingWriteIovec, as possibly limited by 163 // maxSyscallHeaderBytes. (No analogous limit is defined for 164 // rawfile.NonBlockingSendMMsg, since in that case the maximum number of 165 // iovecs also depends on the number of mmsghdrs. Instead, if sendBatch 166 // encounters a packet whose iovec count is limited by 167 // maxSyscallHeaderBytes, it falls back to writing the packet using writev 168 // via WritePacket.) 169 writevMaxIovs int 170 } 171 172 // Options specify the details about the fd-based endpoint to be created. 173 type Options struct { 174 // FDs is a set of FDs used to read/write packets. 175 FDs []int 176 177 // MTU is the mtu to use for this endpoint. 178 MTU uint32 179 180 // EthernetHeader if true, indicates that the endpoint should read/write 181 // ethernet frames instead of IP packets. 182 EthernetHeader bool 183 184 // ClosedFunc is a function to be called when an endpoint's peer (if 185 // any) closes its end of the communication pipe. 186 ClosedFunc func(tcpip.Error) 187 188 // Address is the link address for this endpoint. Only used if 189 // EthernetHeader is true. 190 Address tcpip.LinkAddress 191 192 // SaveRestore if true, indicates that this NIC capability set should 193 // include CapabilitySaveRestore 194 SaveRestore bool 195 196 // DisconnectOk if true, indicates that this NIC capability set should 197 // include CapabilityDisconnectOk. 198 DisconnectOk bool 199 200 // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is 201 // disabled. 202 GSOMaxSize uint32 203 204 // GvisorGSOEnabled indicates whether Gvisor GSO is enabled or not. 205 GvisorGSOEnabled bool 206 207 // PacketDispatchMode specifies the type of inbound dispatcher to be 208 // used for this endpoint. 209 PacketDispatchMode PacketDispatchMode 210 211 // TXChecksumOffload if true, indicates that this endpoints capability 212 // set should include CapabilityTXChecksumOffload. 213 TXChecksumOffload bool 214 215 // RXChecksumOffload if true, indicates that this endpoints capability 216 // set should include CapabilityRXChecksumOffload. 217 RXChecksumOffload bool 218 219 // If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes 220 // of struct iovec, msghdr, and mmsghdr that may be passed by each host 221 // system call. 222 MaxSyscallHeaderBytes int 223 224 // AFXDPFD is used with the experimental AF_XDP mode. 225 // TODO(b/240191988): Use multiple sockets. 226 // TODO(b/240191988): How do we handle the MTU issue? 227 AFXDPFD *int 228 229 // InterfaceIndex is the interface index of the underlying device. 230 InterfaceIndex int 231 } 232 233 // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT 234 // support in the host kernel. This allows us to use multiple FD's to receive 235 // from the same underlying NIC. The fanoutID needs to be the same for a given 236 // set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT 237 // option for an FD with a fanoutID already in use by another FD for a different 238 // NIC will return an EINVAL. 239 // 240 // Since fanoutID must be unique within the network namespace, we start with 241 // the PID to avoid collisions. The only way to be sure of avoiding collisions 242 // is to run in a new network namespace. 243 var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid())) 244 245 // New creates a new fd-based endpoint. 246 // 247 // Makes fd non-blocking, but does not take ownership of fd, which must remain 248 // open for the lifetime of the returned endpoint (until after the endpoint has 249 // stopped being using and Wait returns). 250 func New(opts *Options) (stack.LinkEndpoint, error) { 251 caps := stack.LinkEndpointCapabilities(0) 252 if opts.RXChecksumOffload { 253 caps |= stack.CapabilityRXChecksumOffload 254 } 255 256 if opts.TXChecksumOffload { 257 caps |= stack.CapabilityTXChecksumOffload 258 } 259 260 hdrSize := 0 261 if opts.EthernetHeader { 262 hdrSize = header.EthernetMinimumSize 263 caps |= stack.CapabilityResolutionRequired 264 } 265 266 if opts.SaveRestore { 267 caps |= stack.CapabilitySaveRestore 268 } 269 270 if opts.DisconnectOk { 271 caps |= stack.CapabilityDisconnectOk 272 } 273 274 if len(opts.FDs) == 0 { 275 return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") 276 } 277 278 if opts.MaxSyscallHeaderBytes < 0 { 279 return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative") 280 } 281 282 e := &endpoint{ 283 mtu: opts.MTU, 284 caps: caps, 285 closed: opts.ClosedFunc, 286 addr: opts.Address, 287 hdrSize: hdrSize, 288 packetDispatchMode: opts.PacketDispatchMode, 289 maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes), 290 writevMaxIovs: rawfile.MaxIovs, 291 } 292 if e.maxSyscallHeaderBytes != 0 { 293 if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs { 294 e.writevMaxIovs = max 295 } 296 } 297 298 // Increment fanoutID to ensure that we don't re-use the same fanoutID 299 // for the next endpoint. 300 fid := fanoutID.Add(1) 301 302 // Create per channel dispatchers. 303 for _, fd := range opts.FDs { 304 if err := unix.SetNonblock(fd, true); err != nil { 305 return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err) 306 } 307 308 isSocket, err := isSocketFD(fd) 309 if err != nil { 310 return nil, err 311 } 312 e.fds = append(e.fds, fdInfo{fd: fd, isSocket: isSocket}) 313 if isSocket { 314 if opts.GSOMaxSize != 0 { 315 if opts.GvisorGSOEnabled { 316 e.gsoKind = stack.GvisorGSOSupported 317 } else { 318 e.gsoKind = stack.HostGSOSupported 319 } 320 e.gsoMaxSize = opts.GSOMaxSize 321 } 322 } 323 324 inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid) 325 if err != nil { 326 return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) 327 } 328 e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) 329 } 330 331 return e, nil 332 } 333 334 func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32) (linkDispatcher, error) { 335 // By default use the readv() dispatcher as it works with all kinds of 336 // FDs (tap/tun/unix domain sockets and af_packet). 337 inboundDispatcher, err := newReadVDispatcher(fd, e) 338 if err != nil { 339 return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) 340 } 341 342 if isSocket { 343 sa, err := unix.Getsockname(fd) 344 if err != nil { 345 return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) 346 } 347 switch sa.(type) { 348 case *unix.SockaddrLinklayer: 349 // Enable PACKET_FANOUT mode if the underlying socket is of type 350 // AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will 351 // prevent gvisor from receiving fragmented packets and the host does the 352 // reassembly on our behalf before delivering the fragments. This makes it 353 // hard to test fragmentation reassembly code in Netstack. 354 // 355 // See: include/uapi/linux/if_packet.h (struct fanout_args). 356 // 357 // NOTE: We are using SetSockOptInt here even though the underlying 358 // option is actually a struct. The code follows the example in the 359 // kernel documentation as described at the link below: 360 // 361 // See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt 362 // 363 // This works out because the actual implementation for the option zero 364 // initializes the structure and will initialize the max_members field 365 // to a proper value if zero. 366 // 367 // See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881 368 const fanoutType = unix.PACKET_FANOUT_HASH 369 fanoutArg := (int(fID) & 0xffff) | fanoutType<<16 370 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { 371 return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) 372 } 373 } 374 375 switch e.packetDispatchMode { 376 case PacketMMap: 377 inboundDispatcher, err = newPacketMMapDispatcher(fd, e) 378 if err != nil { 379 return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) 380 } 381 case RecvMMsg: 382 // If the provided FD is a socket then we optimize 383 // packet reads by using recvmmsg() instead of read() to 384 // read packets in a batch. 385 inboundDispatcher, err = newRecvMMsgDispatcher(fd, e) 386 if err != nil { 387 return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) 388 } 389 case Readv: 390 default: 391 return nil, fmt.Errorf("unknown dispatch mode %d", e.packetDispatchMode) 392 } 393 } 394 return inboundDispatcher, nil 395 } 396 397 func isSocketFD(fd int) (bool, error) { 398 var stat unix.Stat_t 399 if err := unix.Fstat(fd, &stat); err != nil { 400 return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err) 401 } 402 return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil 403 } 404 405 // Attach launches the goroutine that reads packets from the file descriptor and 406 // dispatches them via the provided dispatcher. If one is already attached, 407 // then nothing happens. 408 // 409 // Attach implements stack.LinkEndpoint.Attach. 410 func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { 411 e.mu.Lock() 412 defer e.mu.Unlock() 413 // nil means the NIC is being removed. 414 if dispatcher == nil && e.dispatcher != nil { 415 for _, dispatcher := range e.inboundDispatchers { 416 dispatcher.Stop() 417 } 418 e.Wait() 419 e.dispatcher = nil 420 return 421 } 422 if dispatcher != nil && e.dispatcher == nil { 423 e.dispatcher = dispatcher 424 // Link endpoints are not savable. When transportation endpoints are 425 // saved, they stop sending outgoing packets and all incoming packets 426 // are rejected. 427 for i := range e.inboundDispatchers { 428 e.wg.Add(1) 429 go func(i int) { // S/R-SAFE: See above. 430 e.dispatchLoop(e.inboundDispatchers[i]) 431 e.wg.Done() 432 }(i) 433 } 434 } 435 } 436 437 // IsAttached implements stack.LinkEndpoint.IsAttached. 438 func (e *endpoint) IsAttached() bool { 439 e.mu.RLock() 440 defer e.mu.RUnlock() 441 return e.dispatcher != nil 442 } 443 444 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 445 // during construction. 446 func (e *endpoint) MTU() uint32 { 447 return e.mtu 448 } 449 450 // Capabilities implements stack.LinkEndpoint.Capabilities. 451 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { 452 return e.caps 453 } 454 455 // MaxHeaderLength returns the maximum size of the link-layer header. 456 func (e *endpoint) MaxHeaderLength() uint16 { 457 return uint16(e.hdrSize) 458 } 459 460 // LinkAddress returns the link address of this endpoint. 461 func (e *endpoint) LinkAddress() tcpip.LinkAddress { 462 return e.addr 463 } 464 465 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 466 // reading from its FD. 467 func (e *endpoint) Wait() { 468 e.wg.Wait() 469 } 470 471 // virtioNetHdr is declared in linux/virtio_net.h. 472 type virtioNetHdr struct { 473 flags uint8 474 gsoType uint8 475 hdrLen uint16 476 gsoSize uint16 477 csumStart uint16 478 csumOffset uint16 479 } 480 481 // marshal serializes h to a newly-allocated byte slice, in little-endian byte 482 // order. 483 // 484 // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used 485 // for general serialization. This makes it difficult to use go-marshal for 486 // virtio types, as go-marshal implicitly uses the native byte ordering. 487 func (h *virtioNetHdr) marshal() []byte { 488 buf := [virtioNetHdrSize]byte{ 489 0: byte(h.flags), 490 1: byte(h.gsoType), 491 492 // Manually lay out the fields in little-endian byte order. Little endian => 493 // least significant bit goes to the lower address. 494 495 2: byte(h.hdrLen), 496 3: byte(h.hdrLen >> 8), 497 498 4: byte(h.gsoSize), 499 5: byte(h.gsoSize >> 8), 500 501 6: byte(h.csumStart), 502 7: byte(h.csumStart >> 8), 503 504 8: byte(h.csumOffset), 505 9: byte(h.csumOffset >> 8), 506 } 507 return buf[:] 508 } 509 510 // These constants are declared in linux/virtio_net.h. 511 const ( 512 _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 513 514 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 515 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 516 ) 517 518 // AddHeader implements stack.LinkEndpoint.AddHeader. 519 func (e *endpoint) AddHeader(pkt stack.PacketBufferPtr) { 520 if e.hdrSize > 0 { 521 // Add ethernet header if needed. 522 eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) 523 eth.Encode(&header.EthernetFields{ 524 SrcAddr: pkt.EgressRoute.LocalLinkAddress, 525 DstAddr: pkt.EgressRoute.RemoteLinkAddress, 526 Type: pkt.NetworkProtocolNumber, 527 }) 528 } 529 } 530 531 func (e *endpoint) parseHeader(pkt stack.PacketBufferPtr) bool { 532 _, ok := pkt.LinkHeader().Consume(e.hdrSize) 533 return ok 534 535 } 536 537 // ParseHeader implements stack.LinkEndpoint.ParseHeader. 538 func (e *endpoint) ParseHeader(pkt stack.PacketBufferPtr) bool { 539 if e.hdrSize > 0 { 540 return e.parseHeader(pkt) 541 } 542 return true 543 } 544 545 // writePacket writes outbound packets to the file descriptor. If it is not 546 // currently writable, the packet is dropped. 547 func (e *endpoint) writePacket(pkt stack.PacketBufferPtr) tcpip.Error { 548 fdInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 549 fd := fdInfo.fd 550 var vnetHdrBuf []byte 551 if e.gsoKind == stack.HostGSOSupported { 552 vnetHdr := virtioNetHdr{} 553 if pkt.GSOOptions.Type != stack.GSONone { 554 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 555 if pkt.GSOOptions.NeedsCsum { 556 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 557 vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen 558 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 559 } 560 if uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 561 switch pkt.GSOOptions.Type { 562 case stack.GSOTCPv4: 563 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 564 case stack.GSOTCPv6: 565 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 566 default: 567 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 568 } 569 vnetHdr.gsoSize = pkt.GSOOptions.MSS 570 } 571 } 572 vnetHdrBuf = vnetHdr.marshal() 573 } 574 575 views := pkt.AsSlices() 576 numIovecs := len(views) 577 if len(vnetHdrBuf) != 0 { 578 numIovecs++ 579 } 580 if numIovecs > e.writevMaxIovs { 581 numIovecs = e.writevMaxIovs 582 } 583 584 // Allocate small iovec arrays on the stack. 585 var iovecsArr [8]unix.Iovec 586 iovecs := iovecsArr[:0] 587 if numIovecs > len(iovecsArr) { 588 iovecs = make([]unix.Iovec, 0, numIovecs) 589 } 590 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 591 for _, v := range views { 592 iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) 593 } 594 return rawfile.NonBlockingWriteIovec(fd, iovecs) 595 } 596 597 func (e *endpoint) sendBatch(batchFDInfo fdInfo, pkts []stack.PacketBufferPtr) (int, tcpip.Error) { 598 // Degrade to writePacket if underlying fd is not a socket. 599 if !batchFDInfo.isSocket { 600 var written int 601 var err tcpip.Error 602 for written < len(pkts) { 603 if err = e.writePacket(pkts[written]); err != nil { 604 break 605 } 606 written++ 607 } 608 return written, err 609 } 610 611 // Send a batch of packets through batchFD. 612 batchFD := batchFDInfo.fd 613 mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts)) 614 packets := 0 615 for packets < len(pkts) { 616 mmsgHdrs := mmsgHdrsStorage 617 batch := pkts[packets:] 618 syscallHeaderBytes := uintptr(0) 619 for _, pkt := range batch { 620 var vnetHdrBuf []byte 621 if e.gsoKind == stack.HostGSOSupported { 622 vnetHdr := virtioNetHdr{} 623 if pkt.GSOOptions.Type != stack.GSONone { 624 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 625 if pkt.GSOOptions.NeedsCsum { 626 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 627 vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen 628 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 629 } 630 if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 631 switch pkt.GSOOptions.Type { 632 case stack.GSOTCPv4: 633 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 634 case stack.GSOTCPv6: 635 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 636 default: 637 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 638 } 639 vnetHdr.gsoSize = pkt.GSOOptions.MSS 640 } 641 } 642 vnetHdrBuf = vnetHdr.marshal() 643 } 644 645 views := pkt.AsSlices() 646 numIovecs := len(views) 647 if len(vnetHdrBuf) != 0 { 648 numIovecs++ 649 } 650 if numIovecs > rawfile.MaxIovs { 651 numIovecs = rawfile.MaxIovs 652 } 653 if e.maxSyscallHeaderBytes != 0 { 654 syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec 655 if syscallHeaderBytes > e.maxSyscallHeaderBytes { 656 // We can't fit this packet into this call to sendmmsg(). 657 // We could potentially do so if we reduced numIovecs 658 // further, but this might incur considerable extra 659 // copying. Leave it to the next batch instead. 660 break 661 } 662 } 663 664 // We can't easily allocate iovec arrays on the stack here since 665 // they will escape this loop iteration via mmsgHdrs. 666 iovecs := make([]unix.Iovec, 0, numIovecs) 667 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 668 for _, v := range views { 669 iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) 670 } 671 672 var mmsgHdr rawfile.MMsgHdr 673 mmsgHdr.Msg.Iov = &iovecs[0] 674 mmsgHdr.Msg.SetIovlen(len(iovecs)) 675 mmsgHdrs = append(mmsgHdrs, mmsgHdr) 676 } 677 678 if len(mmsgHdrs) == 0 { 679 // We can't fit batch[0] into a mmsghdr while staying under 680 // e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the 681 // mmsghdr (by using writev) and re-buffer iovecs more aggressively 682 // if necessary (by using e.writevMaxIovs instead of 683 // rawfile.MaxIovs). 684 pkt := batch[0] 685 if err := e.writePacket(pkt); err != nil { 686 return packets, err 687 } 688 packets++ 689 } else { 690 for len(mmsgHdrs) > 0 { 691 sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) 692 if err != nil { 693 return packets, err 694 } 695 packets += sent 696 mmsgHdrs = mmsgHdrs[sent:] 697 } 698 } 699 } 700 701 return packets, nil 702 } 703 704 // WritePackets writes outbound packets to the underlying file descriptors. If 705 // one is not currently writable, the packet is dropped. 706 // 707 // Being a batch API, each packet in pkts should have the following 708 // fields populated: 709 // - pkt.EgressRoute 710 // - pkt.GSOOptions 711 // - pkt.NetworkProtocolNumber 712 func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { 713 // Preallocate to avoid repeated reallocation as we append to batch. 714 batch := make([]stack.PacketBufferPtr, 0, BatchSize) 715 batchFDInfo := fdInfo{fd: -1, isSocket: false} 716 sentPackets := 0 717 for _, pkt := range pkts.AsSlice() { 718 if len(batch) == 0 { 719 batchFDInfo = e.fds[pkt.Hash%uint32(len(e.fds))] 720 } 721 pktFDInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 722 if sendNow := pktFDInfo != batchFDInfo; !sendNow { 723 batch = append(batch, pkt) 724 continue 725 } 726 n, err := e.sendBatch(batchFDInfo, batch) 727 sentPackets += n 728 if err != nil { 729 return sentPackets, err 730 } 731 batch = batch[:0] 732 batch = append(batch, pkt) 733 batchFDInfo = pktFDInfo 734 } 735 736 if len(batch) != 0 { 737 n, err := e.sendBatch(batchFDInfo, batch) 738 sentPackets += n 739 if err != nil { 740 return sentPackets, err 741 } 742 } 743 return sentPackets, nil 744 } 745 746 // InjectOutbound implements stack.InjectableEndpoint.InjectOutbound. 747 func (e *endpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error { 748 return rawfile.NonBlockingWrite(e.fds[0].fd, packet.AsSlice()) 749 } 750 751 // dispatchLoop reads packets from the file descriptor in a loop and dispatches 752 // them to the network stack. 753 func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error { 754 for { 755 cont, err := inboundDispatcher.dispatch() 756 if err != nil || !cont { 757 if e.closed != nil { 758 e.closed(err) 759 } 760 inboundDispatcher.release() 761 return err 762 } 763 } 764 } 765 766 // GSOMaxSize implements stack.GSOEndpoint. 767 func (e *endpoint) GSOMaxSize() uint32 { 768 return e.gsoMaxSize 769 } 770 771 // SupportedGSO implements stack.GSOEndpoint. 772 func (e *endpoint) SupportedGSO() stack.SupportedGSO { 773 return e.gsoKind 774 } 775 776 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. 777 func (e *endpoint) ARPHardwareType() header.ARPHardwareType { 778 if e.hdrSize > 0 { 779 return header.ARPHardwareEther 780 } 781 return header.ARPHardwareNone 782 } 783 784 // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes 785 // to the FD, but does not read from it. All reads come from injected packets. 786 type InjectableEndpoint struct { 787 endpoint 788 789 mu sync.RWMutex 790 // +checklocks:mu 791 dispatcher stack.NetworkDispatcher 792 } 793 794 // Attach saves the stack network-layer dispatcher for use later when packets 795 // are injected. 796 func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { 797 e.mu.Lock() 798 defer e.mu.Unlock() 799 e.dispatcher = dispatcher 800 } 801 802 // InjectInbound injects an inbound packet. If the endpoint is not attached, the 803 // packet is not delivered. 804 func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBufferPtr) { 805 e.mu.RLock() 806 d := e.dispatcher 807 e.mu.RUnlock() 808 if d != nil { 809 d.DeliverNetworkPacket(protocol, pkt) 810 } 811 } 812 813 // NewInjectable creates a new fd-based InjectableEndpoint. 814 func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (*InjectableEndpoint, error) { 815 unix.SetNonblock(fd, true) 816 isSocket, err := isSocketFD(fd) 817 if err != nil { 818 return nil, err 819 } 820 821 return &InjectableEndpoint{endpoint: endpoint{ 822 fds: []fdInfo{{fd: fd, isSocket: isSocket}}, 823 mtu: mtu, 824 caps: capabilities, 825 writevMaxIovs: rawfile.MaxIovs, 826 }}, nil 827 }