github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/link/fdbased/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 // Package fdbased provides the implementation of data-link layer endpoints 19 // backed by boundary-preserving file descriptors (e.g., TUN devices, 20 // seqpacket/datagram sockets). 21 // 22 // FD based endpoints can be used in the networking stack by calling New() to 23 // create a new endpoint, and then passing it as an argument to 24 // Stack.CreateNIC(). 25 // 26 // FD based endpoints can use more than one file descriptor to read incoming 27 // packets. If there are more than one FDs specified and the underlying FD is an 28 // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the 29 // host kernel will consistently hash the packets to the sockets. This ensures 30 // that packets for the same TCP streams are not reordered. 31 // 32 // Similarly if more than one FD's are specified where the underlying FD is not 33 // AF_PACKET then it's the caller's responsibility to ensure that all inbound 34 // packets on the descriptors are consistently 5 tuple hashed to one of the 35 // descriptors to prevent TCP reordering. 36 // 37 // Since netstack today does not compute 5 tuple hashes for outgoing packets we 38 // only use the first FD to write outbound packets. Once 5 tuple hashes for 39 // all outbound packets are available we will make use of all underlying FD's to 40 // write outbound packets. 41 package fdbased 42 43 import ( 44 "fmt" 45 46 "golang.org/x/sys/unix" 47 "github.com/metacubex/gvisor/pkg/atomicbitops" 48 "github.com/metacubex/gvisor/pkg/buffer" 49 "github.com/metacubex/gvisor/pkg/sync" 50 "github.com/metacubex/gvisor/pkg/tcpip" 51 "github.com/metacubex/gvisor/pkg/tcpip/header" 52 "github.com/metacubex/gvisor/pkg/tcpip/link/rawfile" 53 "github.com/metacubex/gvisor/pkg/tcpip/stack" 54 ) 55 56 // linkDispatcher reads packets from the link FD and dispatches them to the 57 // NetworkDispatcher. 58 type linkDispatcher interface { 59 Stop() 60 dispatch() (bool, tcpip.Error) 61 release() 62 } 63 64 // PacketDispatchMode are the various supported methods of receiving and 65 // dispatching packets from the underlying FD. 66 type PacketDispatchMode int 67 68 // BatchSize is the number of packets to write in each syscall. It is 47 69 // because when GvisorGSO is in use then a single 65KB TCP segment can get 70 // split into 46 segments of 1420 bytes and a single 216 byte segment. 71 const BatchSize = 47 72 73 const ( 74 // Readv is the default dispatch mode and is the least performant of the 75 // dispatch options but the one that is supported by all underlying FD 76 // types. 77 Readv PacketDispatchMode = iota 78 // RecvMMsg enables use of recvmmsg() syscall instead of readv() to 79 // read inbound packets. This reduces # of syscalls needed to process 80 // packets. 81 // 82 // NOTE: recvmmsg() is only supported for sockets, so if the underlying 83 // FD is not a socket then the code will still fall back to the readv() 84 // path. 85 RecvMMsg 86 // PacketMMap enables use of PACKET_RX_RING to receive packets from the 87 // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The 88 // primary use-case for this is runsc which uses an AF_PACKET FD to 89 // receive packets from the veth device. 90 PacketMMap 91 ) 92 93 func (p PacketDispatchMode) String() string { 94 switch p { 95 case Readv: 96 return "Readv" 97 case RecvMMsg: 98 return "RecvMMsg" 99 case PacketMMap: 100 return "PacketMMap" 101 default: 102 return fmt.Sprintf("unknown packet dispatch mode '%d'", p) 103 } 104 } 105 106 var _ stack.LinkEndpoint = (*endpoint)(nil) 107 var _ stack.GSOEndpoint = (*endpoint)(nil) 108 109 type fdInfo struct { 110 fd int 111 isSocket bool 112 } 113 114 type endpoint struct { 115 // fds is the set of file descriptors each identifying one inbound/outbound 116 // channel. The endpoint will dispatch from all inbound channels as well as 117 // hash outbound packets to specific channels based on the packet hash. 118 fds []fdInfo 119 120 // mtu (maximum transmission unit) is the maximum size of a packet. 121 mtu uint32 122 123 // hdrSize specifies the link-layer header size. If set to 0, no header 124 // is added/removed; otherwise an ethernet header is used. 125 hdrSize int 126 127 // addr is the address of the endpoint. 128 addr tcpip.LinkAddress 129 130 // caps holds the endpoint capabilities. 131 caps stack.LinkEndpointCapabilities 132 133 // closed is a function to be called when the FD's peer (if any) closes 134 // its end of the communication pipe. 135 closed func(tcpip.Error) 136 137 inboundDispatchers []linkDispatcher 138 139 mu sync.RWMutex 140 // +checklocks:mu 141 dispatcher stack.NetworkDispatcher 142 143 // packetDispatchMode controls the packet dispatcher used by this 144 // endpoint. 145 packetDispatchMode PacketDispatchMode 146 147 // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is 148 // disabled. 149 gsoMaxSize uint32 150 151 // wg keeps track of running goroutines. 152 wg sync.WaitGroup 153 154 // gsoKind is the supported kind of GSO. 155 gsoKind stack.SupportedGSO 156 157 // maxSyscallHeaderBytes has the same meaning as 158 // Options.MaxSyscallHeaderBytes. 159 maxSyscallHeaderBytes uintptr 160 161 // writevMaxIovs is the maximum number of iovecs that may be passed to 162 // rawfile.NonBlockingWriteIovec, as possibly limited by 163 // maxSyscallHeaderBytes. (No analogous limit is defined for 164 // rawfile.NonBlockingSendMMsg, since in that case the maximum number of 165 // iovecs also depends on the number of mmsghdrs. Instead, if sendBatch 166 // encounters a packet whose iovec count is limited by 167 // maxSyscallHeaderBytes, it falls back to writing the packet using writev 168 // via WritePacket.) 169 writevMaxIovs int 170 } 171 172 // Options specify the details about the fd-based endpoint to be created. 173 type Options struct { 174 // FDs is a set of FDs used to read/write packets. 175 FDs []int 176 177 // MTU is the mtu to use for this endpoint. 178 MTU uint32 179 180 // EthernetHeader if true, indicates that the endpoint should read/write 181 // ethernet frames instead of IP packets. 182 EthernetHeader bool 183 184 // ClosedFunc is a function to be called when an endpoint's peer (if 185 // any) closes its end of the communication pipe. 186 ClosedFunc func(tcpip.Error) 187 188 // Address is the link address for this endpoint. Only used if 189 // EthernetHeader is true. 190 Address tcpip.LinkAddress 191 192 // SaveRestore if true, indicates that this NIC capability set should 193 // include CapabilitySaveRestore 194 SaveRestore bool 195 196 // DisconnectOk if true, indicates that this NIC capability set should 197 // include CapabilityDisconnectOk. 198 DisconnectOk bool 199 200 // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is 201 // disabled. 202 GSOMaxSize uint32 203 204 // GvisorGSOEnabled indicates whether Gvisor GSO is enabled or not. 205 GvisorGSOEnabled bool 206 207 // PacketDispatchMode specifies the type of inbound dispatcher to be 208 // used for this endpoint. 209 PacketDispatchMode PacketDispatchMode 210 211 // TXChecksumOffload if true, indicates that this endpoints capability 212 // set should include CapabilityTXChecksumOffload. 213 TXChecksumOffload bool 214 215 // RXChecksumOffload if true, indicates that this endpoints capability 216 // set should include CapabilityRXChecksumOffload. 217 RXChecksumOffload bool 218 219 // If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes 220 // of struct iovec, msghdr, and mmsghdr that may be passed by each host 221 // system call. 222 MaxSyscallHeaderBytes int 223 224 // AFXDPFD is used with the experimental AF_XDP mode. 225 // TODO(b/240191988): Use multiple sockets. 226 // TODO(b/240191988): How do we handle the MTU issue? 227 AFXDPFD *int 228 229 // InterfaceIndex is the interface index of the underlying device. 230 InterfaceIndex int 231 } 232 233 // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT 234 // support in the host kernel. This allows us to use multiple FD's to receive 235 // from the same underlying NIC. The fanoutID needs to be the same for a given 236 // set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT 237 // option for an FD with a fanoutID already in use by another FD for a different 238 // NIC will return an EINVAL. 239 // 240 // Since fanoutID must be unique within the network namespace, we start with 241 // the PID to avoid collisions. The only way to be sure of avoiding collisions 242 // is to run in a new network namespace. 243 var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid())) 244 245 // New creates a new fd-based endpoint. 246 // 247 // Makes fd non-blocking, but does not take ownership of fd, which must remain 248 // open for the lifetime of the returned endpoint (until after the endpoint has 249 // stopped being using and Wait returns). 250 func New(opts *Options) (stack.LinkEndpoint, error) { 251 caps := stack.LinkEndpointCapabilities(0) 252 if opts.RXChecksumOffload { 253 caps |= stack.CapabilityRXChecksumOffload 254 } 255 256 if opts.TXChecksumOffload { 257 caps |= stack.CapabilityTXChecksumOffload 258 } 259 260 hdrSize := 0 261 if opts.EthernetHeader { 262 hdrSize = header.EthernetMinimumSize 263 caps |= stack.CapabilityResolutionRequired 264 } 265 266 if opts.SaveRestore { 267 caps |= stack.CapabilitySaveRestore 268 } 269 270 if opts.DisconnectOk { 271 caps |= stack.CapabilityDisconnectOk 272 } 273 274 if len(opts.FDs) == 0 { 275 return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") 276 } 277 278 if opts.MaxSyscallHeaderBytes < 0 { 279 return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative") 280 } 281 282 e := &endpoint{ 283 mtu: opts.MTU, 284 caps: caps, 285 closed: opts.ClosedFunc, 286 addr: opts.Address, 287 hdrSize: hdrSize, 288 packetDispatchMode: opts.PacketDispatchMode, 289 maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes), 290 writevMaxIovs: rawfile.MaxIovs, 291 } 292 if e.maxSyscallHeaderBytes != 0 { 293 if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs { 294 e.writevMaxIovs = max 295 } 296 } 297 298 // Increment fanoutID to ensure that we don't re-use the same fanoutID 299 // for the next endpoint. 300 fid := fanoutID.Add(1) 301 302 // Create per channel dispatchers. 303 for _, fd := range opts.FDs { 304 if err := unix.SetNonblock(fd, true); err != nil { 305 return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err) 306 } 307 308 isSocket, err := isSocketFD(fd) 309 if err != nil { 310 return nil, err 311 } 312 e.fds = append(e.fds, fdInfo{fd: fd, isSocket: isSocket}) 313 if opts.GSOMaxSize != 0 { 314 if opts.GvisorGSOEnabled { 315 e.gsoKind = stack.GvisorGSOSupported 316 } else { 317 e.gsoKind = stack.HostGSOSupported 318 } 319 e.gsoMaxSize = opts.GSOMaxSize 320 } 321 322 inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid) 323 if err != nil { 324 return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) 325 } 326 e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) 327 } 328 329 return e, nil 330 } 331 332 func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32) (linkDispatcher, error) { 333 // By default use the readv() dispatcher as it works with all kinds of 334 // FDs (tap/tun/unix domain sockets and af_packet). 335 inboundDispatcher, err := newReadVDispatcher(fd, e) 336 if err != nil { 337 return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) 338 } 339 340 if isSocket { 341 sa, err := unix.Getsockname(fd) 342 if err != nil { 343 return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) 344 } 345 switch sa.(type) { 346 case *unix.SockaddrLinklayer: 347 // Enable PACKET_FANOUT mode if the underlying socket is of type 348 // AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will 349 // prevent gvisor from receiving fragmented packets and the host does the 350 // reassembly on our behalf before delivering the fragments. This makes it 351 // hard to test fragmentation reassembly code in Netstack. 352 // 353 // See: include/uapi/linux/if_packet.h (struct fanout_args). 354 // 355 // NOTE: We are using SetSockOptInt here even though the underlying 356 // option is actually a struct. The code follows the example in the 357 // kernel documentation as described at the link below: 358 // 359 // See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt 360 // 361 // This works out because the actual implementation for the option zero 362 // initializes the structure and will initialize the max_members field 363 // to a proper value if zero. 364 // 365 // See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881 366 const fanoutType = unix.PACKET_FANOUT_HASH 367 fanoutArg := (int(fID) & 0xffff) | fanoutType<<16 368 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { 369 return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) 370 } 371 } 372 373 switch e.packetDispatchMode { 374 case PacketMMap: 375 inboundDispatcher, err = newPacketMMapDispatcher(fd, e) 376 if err != nil { 377 return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) 378 } 379 case RecvMMsg: 380 // If the provided FD is a socket then we optimize 381 // packet reads by using recvmmsg() instead of read() to 382 // read packets in a batch. 383 inboundDispatcher, err = newRecvMMsgDispatcher(fd, e) 384 if err != nil { 385 return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) 386 } 387 case Readv: 388 default: 389 return nil, fmt.Errorf("unknown dispatch mode %d", e.packetDispatchMode) 390 } 391 } 392 return inboundDispatcher, nil 393 } 394 395 func isSocketFD(fd int) (bool, error) { 396 var stat unix.Stat_t 397 if err := unix.Fstat(fd, &stat); err != nil { 398 return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err) 399 } 400 return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil 401 } 402 403 // Attach launches the goroutine that reads packets from the file descriptor and 404 // dispatches them via the provided dispatcher. If one is already attached, 405 // then nothing happens. 406 // 407 // Attach implements stack.LinkEndpoint.Attach. 408 func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { 409 e.mu.Lock() 410 defer e.mu.Unlock() 411 // nil means the NIC is being removed. 412 if dispatcher == nil && e.dispatcher != nil { 413 for _, dispatcher := range e.inboundDispatchers { 414 dispatcher.Stop() 415 } 416 e.Wait() 417 e.dispatcher = nil 418 return 419 } 420 if dispatcher != nil && e.dispatcher == nil { 421 e.dispatcher = dispatcher 422 // Link endpoints are not savable. When transportation endpoints are 423 // saved, they stop sending outgoing packets and all incoming packets 424 // are rejected. 425 for i := range e.inboundDispatchers { 426 e.wg.Add(1) 427 go func(i int) { // S/R-SAFE: See above. 428 e.dispatchLoop(e.inboundDispatchers[i]) 429 e.wg.Done() 430 }(i) 431 } 432 } 433 } 434 435 // IsAttached implements stack.LinkEndpoint.IsAttached. 436 func (e *endpoint) IsAttached() bool { 437 e.mu.RLock() 438 defer e.mu.RUnlock() 439 return e.dispatcher != nil 440 } 441 442 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 443 // during construction. 444 func (e *endpoint) MTU() uint32 { 445 return e.mtu 446 } 447 448 // Capabilities implements stack.LinkEndpoint.Capabilities. 449 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { 450 return e.caps 451 } 452 453 // MaxHeaderLength returns the maximum size of the link-layer header. 454 func (e *endpoint) MaxHeaderLength() uint16 { 455 return uint16(e.hdrSize) 456 } 457 458 // LinkAddress returns the link address of this endpoint. 459 func (e *endpoint) LinkAddress() tcpip.LinkAddress { 460 return e.addr 461 } 462 463 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 464 // reading from its FD. 465 func (e *endpoint) Wait() { 466 e.wg.Wait() 467 } 468 469 // virtioNetHdr is declared in linux/virtio_net.h. 470 type virtioNetHdr struct { 471 flags uint8 472 gsoType uint8 473 hdrLen uint16 474 gsoSize uint16 475 csumStart uint16 476 csumOffset uint16 477 } 478 479 // marshal serializes h to a newly-allocated byte slice, in little-endian byte 480 // order. 481 // 482 // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used 483 // for general serialization. This makes it difficult to use go-marshal for 484 // virtio types, as go-marshal implicitly uses the native byte ordering. 485 func (h *virtioNetHdr) marshal() []byte { 486 buf := [virtioNetHdrSize]byte{ 487 0: byte(h.flags), 488 1: byte(h.gsoType), 489 490 // Manually lay out the fields in little-endian byte order. Little endian => 491 // least significant bit goes to the lower address. 492 493 2: byte(h.hdrLen), 494 3: byte(h.hdrLen >> 8), 495 496 4: byte(h.gsoSize), 497 5: byte(h.gsoSize >> 8), 498 499 6: byte(h.csumStart), 500 7: byte(h.csumStart >> 8), 501 502 8: byte(h.csumOffset), 503 9: byte(h.csumOffset >> 8), 504 } 505 return buf[:] 506 } 507 508 // These constants are declared in linux/virtio_net.h. 509 const ( 510 _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 511 512 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 513 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 514 ) 515 516 // AddHeader implements stack.LinkEndpoint.AddHeader. 517 func (e *endpoint) AddHeader(pkt *stack.PacketBuffer) { 518 if e.hdrSize > 0 { 519 // Add ethernet header if needed. 520 eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) 521 eth.Encode(&header.EthernetFields{ 522 SrcAddr: pkt.EgressRoute.LocalLinkAddress, 523 DstAddr: pkt.EgressRoute.RemoteLinkAddress, 524 Type: pkt.NetworkProtocolNumber, 525 }) 526 } 527 } 528 529 func (e *endpoint) parseHeader(pkt *stack.PacketBuffer) bool { 530 _, ok := pkt.LinkHeader().Consume(e.hdrSize) 531 return ok 532 533 } 534 535 // ParseHeader implements stack.LinkEndpoint.ParseHeader. 536 func (e *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { 537 if e.hdrSize > 0 { 538 return e.parseHeader(pkt) 539 } 540 return true 541 } 542 543 // writePacket writes outbound packets to the file descriptor. If it is not 544 // currently writable, the packet is dropped. 545 func (e *endpoint) writePacket(pkt *stack.PacketBuffer) tcpip.Error { 546 fdInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 547 fd := fdInfo.fd 548 var vnetHdrBuf []byte 549 if e.gsoKind == stack.HostGSOSupported { 550 vnetHdr := virtioNetHdr{} 551 if pkt.GSOOptions.Type != stack.GSONone { 552 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 553 if pkt.GSOOptions.NeedsCsum { 554 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 555 vnetHdr.csumStart = pkt.GSOOptions.L3HdrLen 556 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 557 } 558 if uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 559 switch pkt.GSOOptions.Type { 560 case stack.GSOTCPv4: 561 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 562 case stack.GSOTCPv6: 563 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 564 default: 565 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 566 } 567 vnetHdr.gsoSize = pkt.GSOOptions.MSS 568 } 569 } 570 vnetHdrBuf = vnetHdr.marshal() 571 } 572 573 views := pkt.AsSlices() 574 numIovecs := len(views) 575 if len(vnetHdrBuf) != 0 { 576 numIovecs++ 577 } 578 if numIovecs > e.writevMaxIovs { 579 numIovecs = e.writevMaxIovs 580 } 581 582 // Allocate small iovec arrays on the stack. 583 var iovecsArr [8]unix.Iovec 584 iovecs := iovecsArr[:0] 585 if numIovecs > len(iovecsArr) { 586 iovecs = make([]unix.Iovec, 0, numIovecs) 587 } 588 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 589 for _, v := range views { 590 iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) 591 } 592 return rawfile.NonBlockingWriteIovec(fd, iovecs) 593 } 594 595 func (e *endpoint) sendBatch(batchFDInfo fdInfo, pkts []*stack.PacketBuffer) (int, tcpip.Error) { 596 // Degrade to writePacket if underlying fd is not a socket. 597 if !batchFDInfo.isSocket { 598 var written int 599 var err tcpip.Error 600 for written < len(pkts) { 601 if err = e.writePacket(pkts[written]); err != nil { 602 break 603 } 604 written++ 605 } 606 return written, err 607 } 608 609 // Send a batch of packets through batchFD. 610 batchFD := batchFDInfo.fd 611 mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts)) 612 packets := 0 613 for packets < len(pkts) { 614 mmsgHdrs := mmsgHdrsStorage 615 batch := pkts[packets:] 616 syscallHeaderBytes := uintptr(0) 617 for _, pkt := range batch { 618 var vnetHdrBuf []byte 619 if e.gsoKind == stack.HostGSOSupported { 620 vnetHdr := virtioNetHdr{} 621 if pkt.GSOOptions.Type != stack.GSONone { 622 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 623 if pkt.GSOOptions.NeedsCsum { 624 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 625 vnetHdr.csumStart = pkt.GSOOptions.L3HdrLen 626 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 627 } 628 if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 629 switch pkt.GSOOptions.Type { 630 case stack.GSOTCPv4: 631 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 632 case stack.GSOTCPv6: 633 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 634 default: 635 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 636 } 637 vnetHdr.gsoSize = pkt.GSOOptions.MSS 638 } 639 } 640 vnetHdrBuf = vnetHdr.marshal() 641 } 642 643 views, offset := pkt.AsViewList() 644 var skipped int 645 var view *buffer.View 646 for view = views.Front(); view != nil && offset >= view.Size(); view = view.Next() { 647 offset -= view.Size() 648 skipped++ 649 } 650 651 // We've made it to the usable views. 652 numIovecs := views.Len() - skipped 653 if len(vnetHdrBuf) != 0 { 654 numIovecs++ 655 } 656 if numIovecs > rawfile.MaxIovs { 657 numIovecs = rawfile.MaxIovs 658 } 659 if e.maxSyscallHeaderBytes != 0 { 660 syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec 661 if syscallHeaderBytes > e.maxSyscallHeaderBytes { 662 // We can't fit this packet into this call to sendmmsg(). 663 // We could potentially do so if we reduced numIovecs 664 // further, but this might incur considerable extra 665 // copying. Leave it to the next batch instead. 666 break 667 } 668 } 669 670 // We can't easily allocate iovec arrays on the stack here since 671 // they will escape this loop iteration via mmsgHdrs. 672 iovecs := make([]unix.Iovec, 0, numIovecs) 673 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 674 // At most one slice has a non-zero offset. 675 iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice()[offset:], numIovecs) 676 for view = view.Next(); view != nil; view = view.Next() { 677 iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice(), numIovecs) 678 } 679 680 var mmsgHdr rawfile.MMsgHdr 681 mmsgHdr.Msg.Iov = &iovecs[0] 682 mmsgHdr.Msg.SetIovlen(len(iovecs)) 683 mmsgHdrs = append(mmsgHdrs, mmsgHdr) 684 } 685 686 if len(mmsgHdrs) == 0 { 687 // We can't fit batch[0] into a mmsghdr while staying under 688 // e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the 689 // mmsghdr (by using writev) and re-buffer iovecs more aggressively 690 // if necessary (by using e.writevMaxIovs instead of 691 // rawfile.MaxIovs). 692 pkt := batch[0] 693 if err := e.writePacket(pkt); err != nil { 694 return packets, err 695 } 696 packets++ 697 } else { 698 for len(mmsgHdrs) > 0 { 699 sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) 700 if err != nil { 701 return packets, err 702 } 703 packets += sent 704 mmsgHdrs = mmsgHdrs[sent:] 705 } 706 } 707 } 708 709 return packets, nil 710 } 711 712 // WritePackets writes outbound packets to the underlying file descriptors. If 713 // one is not currently writable, the packet is dropped. 714 // 715 // Being a batch API, each packet in pkts should have the following 716 // fields populated: 717 // - pkt.EgressRoute 718 // - pkt.GSOOptions 719 // - pkt.NetworkProtocolNumber 720 func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { 721 // Preallocate to avoid repeated reallocation as we append to batch. 722 batch := make([]*stack.PacketBuffer, 0, BatchSize) 723 batchFDInfo := fdInfo{fd: -1, isSocket: false} 724 sentPackets := 0 725 for _, pkt := range pkts.AsSlice() { 726 if len(batch) == 0 { 727 batchFDInfo = e.fds[pkt.Hash%uint32(len(e.fds))] 728 } 729 pktFDInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 730 if sendNow := pktFDInfo != batchFDInfo; !sendNow { 731 batch = append(batch, pkt) 732 continue 733 } 734 n, err := e.sendBatch(batchFDInfo, batch) 735 sentPackets += n 736 if err != nil { 737 return sentPackets, err 738 } 739 batch = batch[:0] 740 batch = append(batch, pkt) 741 batchFDInfo = pktFDInfo 742 } 743 744 if len(batch) != 0 { 745 n, err := e.sendBatch(batchFDInfo, batch) 746 sentPackets += n 747 if err != nil { 748 return sentPackets, err 749 } 750 } 751 return sentPackets, nil 752 } 753 754 // InjectOutbound implements stack.InjectableEndpoint.InjectOutbound. 755 func (e *endpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error { 756 return rawfile.NonBlockingWrite(e.fds[0].fd, packet.AsSlice()) 757 } 758 759 // dispatchLoop reads packets from the file descriptor in a loop and dispatches 760 // them to the network stack. 761 func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error { 762 for { 763 cont, err := inboundDispatcher.dispatch() 764 if err != nil || !cont { 765 if e.closed != nil { 766 e.closed(err) 767 } 768 inboundDispatcher.release() 769 return err 770 } 771 } 772 } 773 774 // GSOMaxSize implements stack.GSOEndpoint. 775 func (e *endpoint) GSOMaxSize() uint32 { 776 return e.gsoMaxSize 777 } 778 779 // SupportedGSO implements stack.GSOEndpoint. 780 func (e *endpoint) SupportedGSO() stack.SupportedGSO { 781 return e.gsoKind 782 } 783 784 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. 785 func (e *endpoint) ARPHardwareType() header.ARPHardwareType { 786 if e.hdrSize > 0 { 787 return header.ARPHardwareEther 788 } 789 return header.ARPHardwareNone 790 } 791 792 // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes 793 // to the FD, but does not read from it. All reads come from injected packets. 794 type InjectableEndpoint struct { 795 endpoint 796 797 mu sync.RWMutex 798 // +checklocks:mu 799 dispatcher stack.NetworkDispatcher 800 } 801 802 // Attach saves the stack network-layer dispatcher for use later when packets 803 // are injected. 804 func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { 805 e.mu.Lock() 806 defer e.mu.Unlock() 807 e.dispatcher = dispatcher 808 } 809 810 // InjectInbound injects an inbound packet. If the endpoint is not attached, the 811 // packet is not delivered. 812 func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { 813 e.mu.RLock() 814 d := e.dispatcher 815 e.mu.RUnlock() 816 if d != nil { 817 d.DeliverNetworkPacket(protocol, pkt) 818 } 819 } 820 821 // NewInjectable creates a new fd-based InjectableEndpoint. 822 func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (*InjectableEndpoint, error) { 823 unix.SetNonblock(fd, true) 824 isSocket, err := isSocketFD(fd) 825 if err != nil { 826 return nil, err 827 } 828 829 return &InjectableEndpoint{endpoint: endpoint{ 830 fds: []fdInfo{{fd: fd, isSocket: isSocket}}, 831 mtu: mtu, 832 caps: capabilities, 833 writevMaxIovs: rawfile.MaxIovs, 834 }}, nil 835 }