// Source: github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/link/fdbased/endpoint.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build linux

// Package fdbased provides the implementation of data-link layer endpoints
// backed by boundary-preserving file descriptors (e.g., TUN devices,
// seqpacket/datagram sockets).
//
// FD based endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
//
// FD based endpoints can use more than one file descriptor to read incoming
// packets. If there are more than one FDs specified and the underlying FD is an
// AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the
// host kernel will consistently hash the packets to the sockets. This ensures
// that packets for the same TCP streams are not reordered.
//
// Similarly if more than one FD's are specified where the underlying FD is not
// AF_PACKET then it's the caller's responsibility to ensure that all inbound
// packets on the descriptors are consistently 5 tuple hashed to one of the
// descriptors to prevent TCP reordering.
//
// Since netstack today does not compute 5 tuple hashes for outgoing packets we
// only use the first FD to write outbound packets.
Once 5 tuple hashes for 38 // all outbound packets are available we will make use of all underlying FD's to 39 // write outbound packets. 40 package fdbased 41 42 import ( 43 "fmt" 44 "runtime" 45 "sync/atomic" 46 47 "golang.org/x/sys/unix" 48 "github.com/SagerNet/gvisor/pkg/sync" 49 "github.com/SagerNet/gvisor/pkg/tcpip" 50 "github.com/SagerNet/gvisor/pkg/tcpip/buffer" 51 "github.com/SagerNet/gvisor/pkg/tcpip/header" 52 "github.com/SagerNet/gvisor/pkg/tcpip/link/rawfile" 53 "github.com/SagerNet/gvisor/pkg/tcpip/stack" 54 ) 55 56 // linkDispatcher reads packets from the link FD and dispatches them to the 57 // NetworkDispatcher. 58 type linkDispatcher interface { 59 dispatch() (bool, tcpip.Error) 60 } 61 62 // PacketDispatchMode are the various supported methods of receiving and 63 // dispatching packets from the underlying FD. 64 type PacketDispatchMode int 65 66 const ( 67 // Readv is the default dispatch mode and is the least performant of the 68 // dispatch options but the one that is supported by all underlying FD 69 // types. 70 Readv PacketDispatchMode = iota 71 // RecvMMsg enables use of recvmmsg() syscall instead of readv() to 72 // read inbound packets. This reduces # of syscalls needed to process 73 // packets. 74 // 75 // NOTE: recvmmsg() is only supported for sockets, so if the underlying 76 // FD is not a socket then the code will still fall back to the readv() 77 // path. 78 RecvMMsg 79 // PacketMMap enables use of PACKET_RX_RING to receive packets from the 80 // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The 81 // primary use-case for this is runsc which uses an AF_PACKET FD to 82 // receive packets from the veth device. 
83 PacketMMap 84 ) 85 86 func (p PacketDispatchMode) String() string { 87 switch p { 88 case Readv: 89 return "Readv" 90 case RecvMMsg: 91 return "RecvMMsg" 92 case PacketMMap: 93 return "PacketMMap" 94 default: 95 return fmt.Sprintf("unknown packet dispatch mode '%d'", p) 96 } 97 } 98 99 var _ stack.LinkEndpoint = (*endpoint)(nil) 100 var _ stack.GSOEndpoint = (*endpoint)(nil) 101 102 type endpoint struct { 103 // fds is the set of file descriptors each identifying one inbound/outbound 104 // channel. The endpoint will dispatch from all inbound channels as well as 105 // hash outbound packets to specific channels based on the packet hash. 106 fds []int 107 108 // mtu (maximum transmission unit) is the maximum size of a packet. 109 mtu uint32 110 111 // hdrSize specifies the link-layer header size. If set to 0, no header 112 // is added/removed; otherwise an ethernet header is used. 113 hdrSize int 114 115 // addr is the address of the endpoint. 116 addr tcpip.LinkAddress 117 118 // caps holds the endpoint capabilities. 119 caps stack.LinkEndpointCapabilities 120 121 // closed is a function to be called when the FD's peer (if any) closes 122 // its end of the communication pipe. 123 closed func(tcpip.Error) 124 125 inboundDispatchers []linkDispatcher 126 dispatcher stack.NetworkDispatcher 127 128 // packetDispatchMode controls the packet dispatcher used by this 129 // endpoint. 130 packetDispatchMode PacketDispatchMode 131 132 // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is 133 // disabled. 134 gsoMaxSize uint32 135 136 // wg keeps track of running goroutines. 137 wg sync.WaitGroup 138 139 // gsoKind is the supported kind of GSO. 140 gsoKind stack.SupportedGSO 141 142 // maxSyscallHeaderBytes has the same meaning as 143 // Options.MaxSyscallHeaderBytes. 
144 maxSyscallHeaderBytes uintptr 145 146 // writevMaxIovs is the maximum number of iovecs that may be passed to 147 // rawfile.NonBlockingWriteIovec, as possibly limited by 148 // maxSyscallHeaderBytes. (No analogous limit is defined for 149 // rawfile.NonBlockingSendMMsg, since in that case the maximum number of 150 // iovecs also depends on the number of mmsghdrs. Instead, if sendBatch 151 // encounters a packet whose iovec count is limited by 152 // maxSyscallHeaderBytes, it falls back to writing the packet using writev 153 // via WritePacket.) 154 writevMaxIovs int 155 } 156 157 // Options specify the details about the fd-based endpoint to be created. 158 type Options struct { 159 // FDs is a set of FDs used to read/write packets. 160 FDs []int 161 162 // MTU is the mtu to use for this endpoint. 163 MTU uint32 164 165 // EthernetHeader if true, indicates that the endpoint should read/write 166 // ethernet frames instead of IP packets. 167 EthernetHeader bool 168 169 // ClosedFunc is a function to be called when an endpoint's peer (if 170 // any) closes its end of the communication pipe. 171 ClosedFunc func(tcpip.Error) 172 173 // Address is the link address for this endpoint. Only used if 174 // EthernetHeader is true. 175 Address tcpip.LinkAddress 176 177 // SaveRestore if true, indicates that this NIC capability set should 178 // include CapabilitySaveRestore 179 SaveRestore bool 180 181 // DisconnectOk if true, indicates that this NIC capability set should 182 // include CapabilityDisconnectOk. 183 DisconnectOk bool 184 185 // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is 186 // disabled. 187 GSOMaxSize uint32 188 189 // SoftwareGSOEnabled indicates whether software GSO is enabled or not. 190 SoftwareGSOEnabled bool 191 192 // PacketDispatchMode specifies the type of inbound dispatcher to be 193 // used for this endpoint. 
194 PacketDispatchMode PacketDispatchMode 195 196 // TXChecksumOffload if true, indicates that this endpoints capability 197 // set should include CapabilityTXChecksumOffload. 198 TXChecksumOffload bool 199 200 // RXChecksumOffload if true, indicates that this endpoints capability 201 // set should include CapabilityRXChecksumOffload. 202 RXChecksumOffload bool 203 204 // If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes 205 // of struct iovec, msghdr, and mmsghdr that may be passed by each host 206 // system call. 207 MaxSyscallHeaderBytes int 208 } 209 210 // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT 211 // support in the host kernel. This allows us to use multiple FD's to receive 212 // from the same underlying NIC. The fanoutID needs to be the same for a given 213 // set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT 214 // option for an FD with a fanoutID already in use by another FD for a different 215 // NIC will return an EINVAL. 216 // 217 // Since fanoutID must be unique within the network namespace, we start with 218 // the PID to avoid collisions. The only way to be sure of avoiding collisions 219 // is to run in a new network namespace. 220 // 221 // Must be accessed using atomic operations. 222 var fanoutID int32 = int32(unix.Getpid()) 223 224 // New creates a new fd-based endpoint. 225 // 226 // Makes fd non-blocking, but does not take ownership of fd, which must remain 227 // open for the lifetime of the returned endpoint (until after the endpoint has 228 // stopped being using and Wait returns). 
229 func New(opts *Options) (stack.LinkEndpoint, error) { 230 caps := stack.LinkEndpointCapabilities(0) 231 if opts.RXChecksumOffload { 232 caps |= stack.CapabilityRXChecksumOffload 233 } 234 235 if opts.TXChecksumOffload { 236 caps |= stack.CapabilityTXChecksumOffload 237 } 238 239 hdrSize := 0 240 if opts.EthernetHeader { 241 hdrSize = header.EthernetMinimumSize 242 caps |= stack.CapabilityResolutionRequired 243 } 244 245 if opts.SaveRestore { 246 caps |= stack.CapabilitySaveRestore 247 } 248 249 if opts.DisconnectOk { 250 caps |= stack.CapabilityDisconnectOk 251 } 252 253 if len(opts.FDs) == 0 { 254 return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") 255 } 256 257 if opts.MaxSyscallHeaderBytes < 0 { 258 return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative") 259 } 260 261 e := &endpoint{ 262 fds: opts.FDs, 263 mtu: opts.MTU, 264 caps: caps, 265 closed: opts.ClosedFunc, 266 addr: opts.Address, 267 hdrSize: hdrSize, 268 packetDispatchMode: opts.PacketDispatchMode, 269 maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes), 270 writevMaxIovs: rawfile.MaxIovs, 271 } 272 if e.maxSyscallHeaderBytes != 0 { 273 if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs { 274 e.writevMaxIovs = max 275 } 276 } 277 278 // Increment fanoutID to ensure that we don't re-use the same fanoutID for 279 // the next endpoint. 280 fid := atomic.AddInt32(&fanoutID, 1) 281 282 // Create per channel dispatchers. 
283 for i := 0; i < len(e.fds); i++ { 284 fd := e.fds[i] 285 286 if runtime.GOOS != "android" { 287 if err := unix.SetNonblock(fd, true); err != nil { 288 return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err) 289 } 290 } 291 292 isSocket, err := isSocketFD(fd) 293 if err != nil { 294 return nil, err 295 } 296 if isSocket { 297 if opts.GSOMaxSize != 0 { 298 if opts.SoftwareGSOEnabled { 299 e.gsoKind = stack.SWGSOSupported 300 } else { 301 e.gsoKind = stack.HWGSOSupported 302 } 303 e.gsoMaxSize = opts.GSOMaxSize 304 } 305 } 306 inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid) 307 if err != nil { 308 return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) 309 } 310 e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) 311 } 312 313 return e, nil 314 } 315 316 func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32) (linkDispatcher, error) { 317 // By default use the readv() dispatcher as it works with all kinds of 318 // FDs (tap/tun/unix domain sockets and af_packet). 319 inboundDispatcher, err := newReadVDispatcher(fd, e) 320 if err != nil { 321 return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) 322 } 323 324 if isSocket { 325 sa, err := unix.Getsockname(fd) 326 if err != nil { 327 return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) 328 } 329 switch sa.(type) { 330 case *unix.SockaddrLinklayer: 331 // Enable PACKET_FANOUT mode if the underlying socket is of type 332 // AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will 333 // prevent gvisor from receiving fragmented packets and the host does the 334 // reassembly on our behalf before delivering the fragments. This makes it 335 // hard to test fragmentation reassembly code in Netstack. 336 // 337 // See: include/uapi/linux/if_packet.h (struct fanout_args). 338 // 339 // NOTE: We are using SetSockOptInt here even though the underlying 340 // option is actually a struct. 
The code follows the example in the 341 // kernel documentation as described at the link below: 342 // 343 // See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt 344 // 345 // This works out because the actual implementation for the option zero 346 // initializes the structure and will initialize the max_members field 347 // to a proper value if zero. 348 // 349 // See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881 350 const fanoutType = unix.PACKET_FANOUT_HASH 351 fanoutArg := (int(fID) & 0xffff) | fanoutType<<16 352 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { 353 return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) 354 } 355 } 356 357 switch e.packetDispatchMode { 358 case PacketMMap: 359 inboundDispatcher, err = newPacketMMapDispatcher(fd, e) 360 if err != nil { 361 return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) 362 } 363 case RecvMMsg: 364 // If the provided FD is a socket then we optimize 365 // packet reads by using recvmmsg() instead of read() to 366 // read packets in a batch. 367 inboundDispatcher, err = newRecvMMsgDispatcher(fd, e) 368 if err != nil { 369 return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) 370 } 371 } 372 } 373 return inboundDispatcher, nil 374 } 375 376 func isSocketFD(fd int) (bool, error) { 377 var stat unix.Stat_t 378 if err := unix.Fstat(fd, &stat); err != nil { 379 return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err) 380 } 381 return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil 382 } 383 384 // Attach launches the goroutine that reads packets from the file descriptor and 385 // dispatches them via the provided dispatcher. 386 func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { 387 e.dispatcher = dispatcher 388 // Link endpoints are not savable. 
When transportation endpoints are 389 // saved, they stop sending outgoing packets and all incoming packets 390 // are rejected. 391 for i := range e.inboundDispatchers { 392 e.wg.Add(1) 393 go func(i int) { // S/R-SAFE: See above. 394 e.dispatchLoop(e.inboundDispatchers[i]) 395 e.wg.Done() 396 }(i) 397 } 398 } 399 400 // IsAttached implements stack.LinkEndpoint.IsAttached. 401 func (e *endpoint) IsAttached() bool { 402 return e.dispatcher != nil 403 } 404 405 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 406 // during construction. 407 func (e *endpoint) MTU() uint32 { 408 return e.mtu 409 } 410 411 // Capabilities implements stack.LinkEndpoint.Capabilities. 412 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { 413 return e.caps 414 } 415 416 // MaxHeaderLength returns the maximum size of the link-layer header. 417 func (e *endpoint) MaxHeaderLength() uint16 { 418 return uint16(e.hdrSize) 419 } 420 421 // LinkAddress returns the link address of this endpoint. 422 func (e *endpoint) LinkAddress() tcpip.LinkAddress { 423 return e.addr 424 } 425 426 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 427 // reading from its FD. 428 func (e *endpoint) Wait() { 429 e.wg.Wait() 430 } 431 432 // virtioNetHdr is declared in linux/virtio_net.h. 433 type virtioNetHdr struct { 434 flags uint8 435 gsoType uint8 436 hdrLen uint16 437 gsoSize uint16 438 csumStart uint16 439 csumOffset uint16 440 } 441 442 // marshal serializes h to a newly-allocated byte slice, in little-endian byte 443 // order. 444 // 445 // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used 446 // for general serialization. This makes it difficult to use go-marshal for 447 // virtio types, as go-marshal implicitly uses the native byte ordering. 
448 func (h *virtioNetHdr) marshal() []byte { 449 buf := [virtioNetHdrSize]byte{ 450 0: byte(h.flags), 451 1: byte(h.gsoType), 452 453 // Manually lay out the fields in little-endian byte order. Little endian => 454 // least significant bit goes to the lower address. 455 456 2: byte(h.hdrLen), 457 3: byte(h.hdrLen >> 8), 458 459 4: byte(h.gsoSize), 460 5: byte(h.gsoSize >> 8), 461 462 6: byte(h.csumStart), 463 7: byte(h.csumStart >> 8), 464 465 8: byte(h.csumOffset), 466 9: byte(h.csumOffset >> 8), 467 } 468 return buf[:] 469 } 470 471 // These constants are declared in linux/virtio_net.h. 472 const ( 473 _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 474 475 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 476 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 477 ) 478 479 // AddHeader implements stack.LinkEndpoint.AddHeader. 480 func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { 481 if e.hdrSize > 0 { 482 // Add ethernet header if needed. 483 eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) 484 ethHdr := &header.EthernetFields{ 485 DstAddr: remote, 486 Type: protocol, 487 } 488 489 // Preserve the src address if it's set in the route. 490 if local != "" { 491 ethHdr.SrcAddr = local 492 } else { 493 ethHdr.SrcAddr = e.addr 494 } 495 eth.Encode(ethHdr) 496 } 497 } 498 499 // WritePacket writes outbound packets to the file descriptor. If it is not 500 // currently writable, the packet is dropped. 
501 func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { 502 if e.hdrSize > 0 { 503 e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) 504 } 505 506 fd := e.fds[pkt.Hash%uint32(len(e.fds))] 507 var vnetHdrBuf []byte 508 if e.gsoKind == stack.HWGSOSupported { 509 vnetHdr := virtioNetHdr{} 510 if pkt.GSOOptions.Type != stack.GSONone { 511 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 512 if pkt.GSOOptions.NeedsCsum { 513 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 514 vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen 515 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 516 } 517 if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 518 switch pkt.GSOOptions.Type { 519 case stack.GSOTCPv4: 520 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 521 case stack.GSOTCPv6: 522 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 523 default: 524 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 525 } 526 vnetHdr.gsoSize = pkt.GSOOptions.MSS 527 } 528 } 529 vnetHdrBuf = vnetHdr.marshal() 530 } 531 532 views := pkt.Views() 533 numIovecs := len(views) 534 if len(vnetHdrBuf) != 0 { 535 numIovecs++ 536 } 537 if numIovecs > e.writevMaxIovs { 538 numIovecs = e.writevMaxIovs 539 } 540 541 // Allocate small iovec arrays on the stack. 542 var iovecsArr [8]unix.Iovec 543 iovecs := iovecsArr[:0] 544 if numIovecs > len(iovecsArr) { 545 iovecs = make([]unix.Iovec, 0, numIovecs) 546 } 547 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 548 for _, v := range views { 549 iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) 550 } 551 return rawfile.NonBlockingWriteIovec(fd, iovecs) 552 } 553 554 func (e *endpoint) sendBatch(batchFD int, pkts []*stack.PacketBuffer) (int, tcpip.Error) { 555 // Send a batch of packets through batchFD. 
556 mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts)) 557 packets := 0 558 for packets < len(pkts) { 559 mmsgHdrs := mmsgHdrsStorage 560 batch := pkts[packets:] 561 syscallHeaderBytes := uintptr(0) 562 for _, pkt := range batch { 563 if e.hdrSize > 0 { 564 e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress, pkt.NetworkProtocolNumber, pkt) 565 } 566 567 var vnetHdrBuf []byte 568 if e.gsoKind == stack.HWGSOSupported { 569 vnetHdr := virtioNetHdr{} 570 if pkt.GSOOptions.Type != stack.GSONone { 571 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 572 if pkt.GSOOptions.NeedsCsum { 573 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 574 vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen 575 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 576 } 577 if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 578 switch pkt.GSOOptions.Type { 579 case stack.GSOTCPv4: 580 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 581 case stack.GSOTCPv6: 582 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 583 default: 584 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 585 } 586 vnetHdr.gsoSize = pkt.GSOOptions.MSS 587 } 588 } 589 vnetHdrBuf = vnetHdr.marshal() 590 } 591 592 views := pkt.Views() 593 numIovecs := len(views) 594 if len(vnetHdrBuf) != 0 { 595 numIovecs++ 596 } 597 if numIovecs > rawfile.MaxIovs { 598 numIovecs = rawfile.MaxIovs 599 } 600 if e.maxSyscallHeaderBytes != 0 { 601 syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec 602 if syscallHeaderBytes > e.maxSyscallHeaderBytes { 603 // We can't fit this packet into this call to sendmmsg(). 604 // We could potentially do so if we reduced numIovecs 605 // further, but this might incur considerable extra 606 // copying. Leave it to the next batch instead. 607 break 608 } 609 } 610 611 // We can't easily allocate iovec arrays on the stack here since 612 // they will escape this loop iteration via mmsgHdrs. 
613 iovecs := make([]unix.Iovec, 0, numIovecs) 614 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 615 for _, v := range views { 616 iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) 617 } 618 619 var mmsgHdr rawfile.MMsgHdr 620 mmsgHdr.Msg.Iov = &iovecs[0] 621 mmsgHdr.Msg.SetIovlen(len(iovecs)) 622 mmsgHdrs = append(mmsgHdrs, mmsgHdr) 623 } 624 625 if len(mmsgHdrs) == 0 { 626 // We can't fit batch[0] into a mmsghdr while staying under 627 // e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the 628 // mmsghdr (by using writev) and re-buffer iovecs more aggressively 629 // if necessary (by using e.writevMaxIovs instead of 630 // rawfile.MaxIovs). 631 pkt := batch[0] 632 if err := e.WritePacket(pkt.EgressRoute, pkt.NetworkProtocolNumber, pkt); err != nil { 633 return packets, err 634 } 635 packets++ 636 } else { 637 for len(mmsgHdrs) > 0 { 638 sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) 639 if err != nil { 640 return packets, err 641 } 642 packets += sent 643 mmsgHdrs = mmsgHdrs[sent:] 644 } 645 } 646 } 647 648 return packets, nil 649 } 650 651 // WritePackets writes outbound packets to the underlying file descriptors. If 652 // one is not currently writable, the packet is dropped. 653 // 654 // Being a batch API, each packet in pkts should have the following 655 // fields populated: 656 // - pkt.EgressRoute 657 // - pkt.GSOOptions 658 // - pkt.NetworkProtocolNumber 659 func (e *endpoint) WritePackets(_ stack.RouteInfo, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, tcpip.Error) { 660 // Preallocate to avoid repeated reallocation as we append to batch. 661 // batchSz is 47 because when SWGSO is in use then a single 65KB TCP 662 // segment can get split into 46 segments of 1420 bytes and a single 216 663 // byte segment. 
664 const batchSz = 47 665 batch := make([]*stack.PacketBuffer, 0, batchSz) 666 batchFD := -1 667 sentPackets := 0 668 for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { 669 if len(batch) == 0 { 670 batchFD = e.fds[pkt.Hash%uint32(len(e.fds))] 671 } 672 pktFD := e.fds[pkt.Hash%uint32(len(e.fds))] 673 if sendNow := pktFD != batchFD; !sendNow { 674 batch = append(batch, pkt) 675 continue 676 } 677 n, err := e.sendBatch(batchFD, batch) 678 sentPackets += n 679 if err != nil { 680 return sentPackets, err 681 } 682 batch = batch[:0] 683 batch = append(batch, pkt) 684 batchFD = pktFD 685 } 686 687 if len(batch) != 0 { 688 n, err := e.sendBatch(batchFD, batch) 689 sentPackets += n 690 if err != nil { 691 return sentPackets, err 692 } 693 } 694 return sentPackets, nil 695 } 696 697 // viewsEqual tests whether v1 and v2 refer to the same backing bytes. 698 func viewsEqual(vs1, vs2 []buffer.View) bool { 699 return len(vs1) == len(vs2) && (len(vs1) == 0 || &vs1[0] == &vs2[0]) 700 } 701 702 // InjectOutobund implements stack.InjectableEndpoint.InjectOutbound. 703 func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) tcpip.Error { 704 return rawfile.NonBlockingWrite(e.fds[0], packet) 705 } 706 707 // dispatchLoop reads packets from the file descriptor in a loop and dispatches 708 // them to the network stack. 709 func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error { 710 for { 711 cont, err := inboundDispatcher.dispatch() 712 if err != nil || !cont { 713 if e.closed != nil { 714 e.closed(err) 715 } 716 return err 717 } 718 } 719 } 720 721 // GSOMaxSize implements stack.GSOEndpoint. 722 func (e *endpoint) GSOMaxSize() uint32 { 723 return e.gsoMaxSize 724 } 725 726 // SupportsHWGSO implements stack.GSOEndpoint. 727 func (e *endpoint) SupportedGSO() stack.SupportedGSO { 728 return e.gsoKind 729 } 730 731 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. 
732 func (e *endpoint) ARPHardwareType() header.ARPHardwareType { 733 if e.hdrSize > 0 { 734 return header.ARPHardwareEther 735 } 736 return header.ARPHardwareNone 737 } 738 739 // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes 740 // to the FD, but does not read from it. All reads come from injected packets. 741 type InjectableEndpoint struct { 742 endpoint 743 744 dispatcher stack.NetworkDispatcher 745 } 746 747 // Attach saves the stack network-layer dispatcher for use later when packets 748 // are injected. 749 func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { 750 e.dispatcher = dispatcher 751 } 752 753 // InjectInbound injects an inbound packet. 754 func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { 755 e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt) 756 } 757 758 // NewInjectable creates a new fd-based InjectableEndpoint. 759 func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) *InjectableEndpoint { 760 unix.SetNonblock(fd, true) 761 762 return &InjectableEndpoint{endpoint: endpoint{ 763 fds: []int{fd}, 764 mtu: mtu, 765 caps: capabilities, 766 writevMaxIovs: rawfile.MaxIovs, 767 }} 768 }