// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build linux
// +build linux

// Package fdbased provides the implementation of data-link layer endpoints
// backed by boundary-preserving file descriptors (e.g., TUN devices,
// seqpacket/datagram sockets).
//
// FD based endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
//
// FD based endpoints can use more than one file descriptor to read incoming
// packets. If more than one FD is specified and the underlying FD is an
// AF_PACKET socket then the endpoint will enable FANOUT mode on the socket so
// that the host kernel will consistently hash the packets to the sockets. This
// ensures that packets for the same TCP streams are not reordered.
//
// Similarly, if more than one FD is specified where the underlying FD is not
// AF_PACKET then it is the caller's responsibility to ensure that all inbound
// packets on the descriptors are consistently 5 tuple hashed to one of the
// descriptors to prevent TCP reordering.
//
// Outbound packets are written to the FD selected by the packet's Hash field
// modulo the number of FDs (see writePacket and WritePackets); packets that
// carry the same Hash therefore always leave through the same FD.
package fdbased

import (
	"fmt"

	"golang.org/x/sys/unix"
	"github.com/sagernet/gvisor/pkg/atomicbitops"
	"github.com/sagernet/gvisor/pkg/buffer"
	"github.com/sagernet/gvisor/pkg/sync"
	"github.com/sagernet/gvisor/pkg/tcpip"
	"github.com/sagernet/gvisor/pkg/tcpip/header"
	"github.com/sagernet/gvisor/pkg/tcpip/link/rawfile"
	"github.com/sagernet/gvisor/pkg/tcpip/stack"
)

// linkDispatcher reads packets from the link FD and dispatches them to the
// NetworkDispatcher.
//
// As used in this file: Stop is called on every dispatcher when the endpoint
// is detached (Attach(nil)); dispatch is called repeatedly by dispatchLoop
// until it returns false or a non-nil error; release is called once by
// dispatchLoop right before it returns.
type linkDispatcher interface {
	Stop()
	dispatch() (bool, tcpip.Error)
	release()
}

// PacketDispatchMode are the various supported methods of receiving and
// dispatching packets from the underlying FD.
type PacketDispatchMode int

// BatchSize is the number of packets to write in each syscall. It is 47
// because when GVisorGSO is in use then a single 65KB TCP segment can get
// split into 46 segments of 1420 bytes and a single 216 byte segment.
//
// In this file it is used as the initial capacity of the batch slice built by
// WritePackets.
const BatchSize = 47

const (
	// Readv is the default dispatch mode and is the least performant of the
	// dispatch options but the one that is supported by all underlying FD
	// types.
	Readv PacketDispatchMode = iota
	// RecvMMsg enables use of recvmmsg() syscall instead of readv() to
	// read inbound packets. This reduces # of syscalls needed to process
	// packets.
	//
	// NOTE: recvmmsg() is only supported for sockets, so if the underlying
	// FD is not a socket then the code will still fall back to the readv()
	// path.
	RecvMMsg
	// PacketMMap enables use of PACKET_RX_RING to receive packets from the
	// NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
	// primary use-case for this is runsc which uses an AF_PACKET FD to
	// receive packets from the veth device.
	PacketMMap
)

// String returns the name of the dispatch mode, or a descriptive message
// containing the numeric value if the mode is not one of the known constants.
func (p PacketDispatchMode) String() string {
	switch p {
	case Readv:
		return "Readv"
	case RecvMMsg:
		return "RecvMMsg"
	case PacketMMap:
		return "PacketMMap"
	default:
		return fmt.Sprintf("unknown packet dispatch mode '%d'", p)
	}
}

// Compile-time checks that *endpoint satisfies the stack interfaces it is
// handed out as.
var _ stack.LinkEndpoint = (*endpoint)(nil)
var _ stack.GSOEndpoint = (*endpoint)(nil)

// fdInfo pairs a file descriptor with whether it refers to a socket. The
// isSocket flag decides whether sendBatch may use sendmmsg() on the FD and
// whether createInboundDispatcher may pick a socket-only dispatcher.
type fdInfo struct {
	fd       int
	isSocket bool
}

// endpoint is the FD-backed stack.LinkEndpoint / stack.GSOEndpoint
// implementation returned by New.
type endpoint struct {
	// fds is the set of file descriptors each identifying one inbound/outbound
	// channel. The endpoint will dispatch from all inbound channels as well as
	// hash outbound packets to specific channels based on the packet hash.
	fds []fdInfo

	// mtu (maximum transmission unit) is the maximum size of a packet.
	mtu uint32

	// hdrSize specifies the link-layer header size. If set to 0, no header
	// is added/removed; otherwise an ethernet header is used.
	hdrSize int

	// addr is the address of the endpoint.
	addr tcpip.LinkAddress

	// caps holds the endpoint capabilities.
	caps stack.LinkEndpointCapabilities

	// closed is a function to be called when the FD's peer (if any) closes
	// its end of the communication pipe. It may be nil; dispatchLoop checks
	// for nil before calling it.
	closed func(tcpip.Error)

	// inboundDispatchers holds one dispatcher per entry in fds, in the same
	// order; it is populated by New.
	inboundDispatchers []linkDispatcher

	// mu protects dispatcher.
	mu sync.RWMutex
	// +checklocks:mu
	dispatcher stack.NetworkDispatcher

	// packetDispatchMode controls the packet dispatcher used by this
	// endpoint.
	packetDispatchMode PacketDispatchMode

	// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled.
	gsoMaxSize uint32

	// wg keeps track of running goroutines.
	wg sync.WaitGroup

	// gsoKind is the supported kind of GSO.
	gsoKind stack.SupportedGSO

	// maxSyscallHeaderBytes has the same meaning as
	// Options.MaxSyscallHeaderBytes.
	maxSyscallHeaderBytes uintptr

	// writevMaxIovs is the maximum number of iovecs that may be passed to
	// rawfile.NonBlockingWriteIovec, as possibly limited by
	// maxSyscallHeaderBytes. (No analogous limit is defined for
	// rawfile.NonBlockingSendMMsg, since in that case the maximum number of
	// iovecs also depends on the number of mmsghdrs. Instead, if sendBatch
	// encounters a packet whose iovec count is limited by
	// maxSyscallHeaderBytes, it falls back to writing the packet using writev
	// via WritePacket.)
	writevMaxIovs int
}

// Options specify the details about the fd-based endpoint to be created.
type Options struct {
	// FDs is a set of FDs used to read/write packets. It must contain at
	// least one FD; New rejects an empty set.
	FDs []int

	// MTU is the mtu to use for this endpoint.
	MTU uint32

	// EthernetHeader if true, indicates that the endpoint should read/write
	// ethernet frames instead of IP packets.
	EthernetHeader bool

	// ClosedFunc is a function to be called when an endpoint's peer (if
	// any) closes its end of the communication pipe.
	ClosedFunc func(tcpip.Error)

	// Address is the link address for this endpoint. Only used if
	// EthernetHeader is true.
	Address tcpip.LinkAddress

	// SaveRestore if true, indicates that this NIC capability set should
	// include CapabilitySaveRestore.
	SaveRestore bool

	// DisconnectOk if true, indicates that this NIC capability set should
	// include CapabilityDisconnectOk.
	DisconnectOk bool

	// GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled.
	GSOMaxSize uint32

	// GVisorGSOEnabled indicates whether Gvisor GSO is enabled or not. It is
	// only consulted when GSOMaxSize is non-zero: New then selects gVisor GSO
	// if it is true and host GSO otherwise.
	GVisorGSOEnabled bool

	// PacketDispatchMode specifies the type of inbound dispatcher to be
	// used for this endpoint.
	PacketDispatchMode PacketDispatchMode

	// TXChecksumOffload if true, indicates that this endpoints capability
	// set should include CapabilityTXChecksumOffload.
	TXChecksumOffload bool

	// RXChecksumOffload if true, indicates that this endpoints capability
	// set should include CapabilityRXChecksumOffload.
	RXChecksumOffload bool

	// If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes
	// of struct iovec, msghdr, and mmsghdr that may be passed by each host
	// system call. New rejects negative values.
	MaxSyscallHeaderBytes int

	// InterfaceIndex is the interface index of the underlying device.
	InterfaceIndex int

	// GRO enables generic receive offload.
	GRO bool
}

// fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT
// support in the host kernel. This allows us to use multiple FD's to receive
// from the same underlying NIC. The fanoutID needs to be the same for a given
// set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT
// option for an FD with a fanoutID already in use by another FD for a different
// NIC will return an EINVAL.
//
// Since fanoutID must be unique within the network namespace, we start with
// the PID to avoid collisions. The only way to be sure of avoiding collisions
// is to run in a new network namespace.
var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid()))

// New creates a new fd-based endpoint.
//
// Makes fd non-blocking, but does not take ownership of fd, which must remain
// open for the lifetime of the returned endpoint (until after the endpoint has
// stopped being used and Wait returns).
248 func New(opts *Options) (stack.LinkEndpoint, error) { 249 caps := stack.LinkEndpointCapabilities(0) 250 if opts.RXChecksumOffload { 251 caps |= stack.CapabilityRXChecksumOffload 252 } 253 254 if opts.TXChecksumOffload { 255 caps |= stack.CapabilityTXChecksumOffload 256 } 257 258 hdrSize := 0 259 if opts.EthernetHeader { 260 hdrSize = header.EthernetMinimumSize 261 caps |= stack.CapabilityResolutionRequired 262 } 263 264 if opts.SaveRestore { 265 caps |= stack.CapabilitySaveRestore 266 } 267 268 if opts.DisconnectOk { 269 caps |= stack.CapabilityDisconnectOk 270 } 271 272 if len(opts.FDs) == 0 { 273 return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") 274 } 275 276 if opts.MaxSyscallHeaderBytes < 0 { 277 return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative") 278 } 279 280 e := &endpoint{ 281 mtu: opts.MTU, 282 caps: caps, 283 closed: opts.ClosedFunc, 284 addr: opts.Address, 285 hdrSize: hdrSize, 286 packetDispatchMode: opts.PacketDispatchMode, 287 maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes), 288 writevMaxIovs: rawfile.MaxIovs, 289 } 290 if e.maxSyscallHeaderBytes != 0 { 291 if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs { 292 e.writevMaxIovs = max 293 } 294 } 295 296 // Increment fanoutID to ensure that we don't re-use the same fanoutID 297 // for the next endpoint. 298 fid := fanoutID.Add(1) 299 300 // Create per channel dispatchers. 
301 for _, fd := range opts.FDs { 302 if err := unix.SetNonblock(fd, true); err != nil { 303 return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err) 304 } 305 306 isSocket, err := isSocketFD(fd) 307 if err != nil { 308 return nil, err 309 } 310 e.fds = append(e.fds, fdInfo{fd: fd, isSocket: isSocket}) 311 if opts.GSOMaxSize != 0 { 312 if opts.GVisorGSOEnabled { 313 e.gsoKind = stack.GVisorGSOSupported 314 } else { 315 e.gsoKind = stack.HostGSOSupported 316 } 317 e.gsoMaxSize = opts.GSOMaxSize 318 } 319 320 inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid, opts) 321 if err != nil { 322 return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) 323 } 324 e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) 325 } 326 327 return e, nil 328 } 329 330 func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32, opts *Options) (linkDispatcher, error) { 331 // By default use the readv() dispatcher as it works with all kinds of 332 // FDs (tap/tun/unix domain sockets and af_packet). 333 inboundDispatcher, err := newReadVDispatcher(fd, e) 334 if err != nil { 335 return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) 336 } 337 338 if isSocket { 339 sa, err := unix.Getsockname(fd) 340 if err != nil { 341 return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) 342 } 343 switch sa.(type) { 344 case *unix.SockaddrLinklayer: 345 // Enable PACKET_FANOUT mode if the underlying socket is of type 346 // AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will 347 // prevent gvisor from receiving fragmented packets and the host does the 348 // reassembly on our behalf before delivering the fragments. This makes it 349 // hard to test fragmentation reassembly code in Netstack. 350 // 351 // See: include/uapi/linux/if_packet.h (struct fanout_args). 352 // 353 // NOTE: We are using SetSockOptInt here even though the underlying 354 // option is actually a struct. 
The code follows the example in the 355 // kernel documentation as described at the link below: 356 // 357 // See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt 358 // 359 // This works out because the actual implementation for the option zero 360 // initializes the structure and will initialize the max_members field 361 // to a proper value if zero. 362 // 363 // See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881 364 const fanoutType = unix.PACKET_FANOUT_HASH 365 fanoutArg := (int(fID) & 0xffff) | fanoutType<<16 366 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { 367 return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) 368 } 369 } 370 371 switch e.packetDispatchMode { 372 case PacketMMap: 373 inboundDispatcher, err = newPacketMMapDispatcher(fd, e) 374 if err != nil { 375 return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) 376 } 377 case RecvMMsg: 378 // If the provided FD is a socket then we optimize 379 // packet reads by using recvmmsg() instead of read() to 380 // read packets in a batch. 381 inboundDispatcher, err = newRecvMMsgDispatcher(fd, e, opts) 382 if err != nil { 383 return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) 384 } 385 case Readv: 386 default: 387 return nil, fmt.Errorf("unknown dispatch mode %d", e.packetDispatchMode) 388 } 389 } 390 return inboundDispatcher, nil 391 } 392 393 func isSocketFD(fd int) (bool, error) { 394 var stat unix.Stat_t 395 if err := unix.Fstat(fd, &stat); err != nil { 396 return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err) 397 } 398 return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil 399 } 400 401 // Attach launches the goroutine that reads packets from the file descriptor and 402 // dispatches them via the provided dispatcher. If one is already attached, 403 // then nothing happens. 
404 // 405 // Attach implements stack.LinkEndpoint.Attach. 406 func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { 407 e.mu.Lock() 408 defer e.mu.Unlock() 409 410 // nil means the NIC is being removed. 411 if dispatcher == nil && e.dispatcher != nil { 412 for _, dispatcher := range e.inboundDispatchers { 413 dispatcher.Stop() 414 } 415 e.Wait() 416 e.dispatcher = nil 417 return 418 } 419 if dispatcher != nil && e.dispatcher == nil { 420 e.dispatcher = dispatcher 421 // Link endpoints are not savable. When transportation endpoints are 422 // saved, they stop sending outgoing packets and all incoming packets 423 // are rejected. 424 for i := range e.inboundDispatchers { 425 e.wg.Add(1) 426 go func(i int) { // S/R-SAFE: See above. 427 e.dispatchLoop(e.inboundDispatchers[i]) 428 e.wg.Done() 429 }(i) 430 } 431 } 432 } 433 434 // IsAttached implements stack.LinkEndpoint.IsAttached. 435 func (e *endpoint) IsAttached() bool { 436 e.mu.RLock() 437 defer e.mu.RUnlock() 438 return e.dispatcher != nil 439 } 440 441 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 442 // during construction. 443 func (e *endpoint) MTU() uint32 { 444 return e.mtu 445 } 446 447 // Capabilities implements stack.LinkEndpoint.Capabilities. 448 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { 449 return e.caps 450 } 451 452 // MaxHeaderLength returns the maximum size of the link-layer header. 453 func (e *endpoint) MaxHeaderLength() uint16 { 454 return uint16(e.hdrSize) 455 } 456 457 // LinkAddress returns the link address of this endpoint. 458 func (e *endpoint) LinkAddress() tcpip.LinkAddress { 459 return e.addr 460 } 461 462 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 463 // reading from its FD. 464 func (e *endpoint) Wait() { 465 e.wg.Wait() 466 } 467 468 // virtioNetHdr is declared in linux/virtio_net.h. 
469 type virtioNetHdr struct { 470 flags uint8 471 gsoType uint8 472 hdrLen uint16 473 gsoSize uint16 474 csumStart uint16 475 csumOffset uint16 476 } 477 478 // marshal serializes h to a newly-allocated byte slice, in little-endian byte 479 // order. 480 // 481 // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used 482 // for general serialization. This makes it difficult to use go-marshal for 483 // virtio types, as go-marshal implicitly uses the native byte ordering. 484 func (h *virtioNetHdr) marshal() []byte { 485 buf := [virtioNetHdrSize]byte{ 486 0: byte(h.flags), 487 1: byte(h.gsoType), 488 489 // Manually lay out the fields in little-endian byte order. Little endian => 490 // least significant bit goes to the lower address. 491 492 2: byte(h.hdrLen), 493 3: byte(h.hdrLen >> 8), 494 495 4: byte(h.gsoSize), 496 5: byte(h.gsoSize >> 8), 497 498 6: byte(h.csumStart), 499 7: byte(h.csumStart >> 8), 500 501 8: byte(h.csumOffset), 502 9: byte(h.csumOffset >> 8), 503 } 504 return buf[:] 505 } 506 507 // These constants are declared in linux/virtio_net.h. 508 const ( 509 _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 510 511 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 512 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 513 ) 514 515 // AddHeader implements stack.LinkEndpoint.AddHeader. 516 func (e *endpoint) AddHeader(pkt *stack.PacketBuffer) { 517 if e.hdrSize > 0 { 518 // Add ethernet header if needed. 519 eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) 520 eth.Encode(&header.EthernetFields{ 521 SrcAddr: pkt.EgressRoute.LocalLinkAddress, 522 DstAddr: pkt.EgressRoute.RemoteLinkAddress, 523 Type: pkt.NetworkProtocolNumber, 524 }) 525 } 526 } 527 528 func (e *endpoint) parseHeader(pkt *stack.PacketBuffer) bool { 529 _, ok := pkt.LinkHeader().Consume(e.hdrSize) 530 return ok 531 532 } 533 534 // ParseHeader implements stack.LinkEndpoint.ParseHeader. 
535 func (e *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { 536 if e.hdrSize > 0 { 537 return e.parseHeader(pkt) 538 } 539 return true 540 } 541 542 // writePacket writes outbound packets to the file descriptor. If it is not 543 // currently writable, the packet is dropped. 544 func (e *endpoint) writePacket(pkt *stack.PacketBuffer) tcpip.Error { 545 fdInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 546 fd := fdInfo.fd 547 var vnetHdrBuf []byte 548 if e.gsoKind == stack.HostGSOSupported { 549 vnetHdr := virtioNetHdr{} 550 if pkt.GSOOptions.Type != stack.GSONone { 551 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 552 if pkt.GSOOptions.NeedsCsum { 553 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 554 vnetHdr.csumStart = pkt.GSOOptions.L3HdrLen 555 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 556 } 557 if uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 558 switch pkt.GSOOptions.Type { 559 case stack.GSOTCPv4: 560 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 561 case stack.GSOTCPv6: 562 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 563 default: 564 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 565 } 566 vnetHdr.gsoSize = pkt.GSOOptions.MSS 567 } 568 } 569 vnetHdrBuf = vnetHdr.marshal() 570 } 571 572 views := pkt.AsSlices() 573 numIovecs := len(views) 574 if len(vnetHdrBuf) != 0 { 575 numIovecs++ 576 } 577 if numIovecs > e.writevMaxIovs { 578 numIovecs = e.writevMaxIovs 579 } 580 581 // Allocate small iovec arrays on the stack. 
582 var iovecsArr [8]unix.Iovec 583 iovecs := iovecsArr[:0] 584 if numIovecs > len(iovecsArr) { 585 iovecs = make([]unix.Iovec, 0, numIovecs) 586 } 587 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 588 for _, v := range views { 589 iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) 590 } 591 return rawfile.NonBlockingWriteIovec(fd, iovecs) 592 } 593 594 func (e *endpoint) sendBatch(batchFDInfo fdInfo, pkts []*stack.PacketBuffer) (int, tcpip.Error) { 595 // Degrade to writePacket if underlying fd is not a socket. 596 if !batchFDInfo.isSocket { 597 var written int 598 var err tcpip.Error 599 for written < len(pkts) { 600 if err = e.writePacket(pkts[written]); err != nil { 601 break 602 } 603 written++ 604 } 605 return written, err 606 } 607 608 // Send a batch of packets through batchFD. 609 batchFD := batchFDInfo.fd 610 mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts)) 611 packets := 0 612 for packets < len(pkts) { 613 mmsgHdrs := mmsgHdrsStorage 614 batch := pkts[packets:] 615 syscallHeaderBytes := uintptr(0) 616 for _, pkt := range batch { 617 var vnetHdrBuf []byte 618 if e.gsoKind == stack.HostGSOSupported { 619 vnetHdr := virtioNetHdr{} 620 if pkt.GSOOptions.Type != stack.GSONone { 621 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 622 if pkt.GSOOptions.NeedsCsum { 623 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 624 vnetHdr.csumStart = pkt.GSOOptions.L3HdrLen 625 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 626 } 627 if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 628 switch pkt.GSOOptions.Type { 629 case stack.GSOTCPv4: 630 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 631 case stack.GSOTCPv6: 632 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 633 default: 634 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 635 } 636 vnetHdr.gsoSize = pkt.GSOOptions.MSS 637 } 638 } 639 vnetHdrBuf = vnetHdr.marshal() 640 } 641 642 views, offset := pkt.AsViewList() 643 var skipped int 644 
var view *buffer.View 645 for view = views.Front(); view != nil && offset >= view.Size(); view = view.Next() { 646 offset -= view.Size() 647 skipped++ 648 } 649 650 // We've made it to the usable views. 651 numIovecs := views.Len() - skipped 652 if len(vnetHdrBuf) != 0 { 653 numIovecs++ 654 } 655 if numIovecs > rawfile.MaxIovs { 656 numIovecs = rawfile.MaxIovs 657 } 658 if e.maxSyscallHeaderBytes != 0 { 659 syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec 660 if syscallHeaderBytes > e.maxSyscallHeaderBytes { 661 // We can't fit this packet into this call to sendmmsg(). 662 // We could potentially do so if we reduced numIovecs 663 // further, but this might incur considerable extra 664 // copying. Leave it to the next batch instead. 665 break 666 } 667 } 668 669 // We can't easily allocate iovec arrays on the stack here since 670 // they will escape this loop iteration via mmsgHdrs. 671 iovecs := make([]unix.Iovec, 0, numIovecs) 672 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 673 // At most one slice has a non-zero offset. 674 iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice()[offset:], numIovecs) 675 for view = view.Next(); view != nil; view = view.Next() { 676 iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice(), numIovecs) 677 } 678 679 var mmsgHdr rawfile.MMsgHdr 680 mmsgHdr.Msg.Iov = &iovecs[0] 681 mmsgHdr.Msg.SetIovlen(len(iovecs)) 682 mmsgHdrs = append(mmsgHdrs, mmsgHdr) 683 } 684 685 if len(mmsgHdrs) == 0 { 686 // We can't fit batch[0] into a mmsghdr while staying under 687 // e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the 688 // mmsghdr (by using writev) and re-buffer iovecs more aggressively 689 // if necessary (by using e.writevMaxIovs instead of 690 // rawfile.MaxIovs). 
691 pkt := batch[0] 692 if err := e.writePacket(pkt); err != nil { 693 return packets, err 694 } 695 packets++ 696 } else { 697 for len(mmsgHdrs) > 0 { 698 sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) 699 if err != nil { 700 return packets, err 701 } 702 packets += sent 703 mmsgHdrs = mmsgHdrs[sent:] 704 } 705 } 706 } 707 708 return packets, nil 709 } 710 711 // WritePackets writes outbound packets to the underlying file descriptors. If 712 // one is not currently writable, the packet is dropped. 713 // 714 // Being a batch API, each packet in pkts should have the following 715 // fields populated: 716 // - pkt.EgressRoute 717 // - pkt.GSOOptions 718 // - pkt.NetworkProtocolNumber 719 func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { 720 // Preallocate to avoid repeated reallocation as we append to batch. 721 batch := make([]*stack.PacketBuffer, 0, BatchSize) 722 batchFDInfo := fdInfo{fd: -1, isSocket: false} 723 sentPackets := 0 724 for _, pkt := range pkts.AsSlice() { 725 if len(batch) == 0 { 726 batchFDInfo = e.fds[pkt.Hash%uint32(len(e.fds))] 727 } 728 pktFDInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 729 if sendNow := pktFDInfo != batchFDInfo; !sendNow { 730 batch = append(batch, pkt) 731 continue 732 } 733 n, err := e.sendBatch(batchFDInfo, batch) 734 sentPackets += n 735 if err != nil { 736 return sentPackets, err 737 } 738 batch = batch[:0] 739 batch = append(batch, pkt) 740 batchFDInfo = pktFDInfo 741 } 742 743 if len(batch) != 0 { 744 n, err := e.sendBatch(batchFDInfo, batch) 745 sentPackets += n 746 if err != nil { 747 return sentPackets, err 748 } 749 } 750 return sentPackets, nil 751 } 752 753 // InjectOutbound implements stack.InjectableEndpoint.InjectOutbound. 
754 func (e *endpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error { 755 return rawfile.NonBlockingWrite(e.fds[0].fd, packet.AsSlice()) 756 } 757 758 // dispatchLoop reads packets from the file descriptor in a loop and dispatches 759 // them to the network stack. 760 func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error { 761 for { 762 cont, err := inboundDispatcher.dispatch() 763 if err != nil || !cont { 764 if e.closed != nil { 765 e.closed(err) 766 } 767 inboundDispatcher.release() 768 return err 769 } 770 } 771 } 772 773 // GSOMaxSize implements stack.GSOEndpoint. 774 func (e *endpoint) GSOMaxSize() uint32 { 775 return e.gsoMaxSize 776 } 777 778 // SupportedGSO implements stack.GSOEndpoint. 779 func (e *endpoint) SupportedGSO() stack.SupportedGSO { 780 return e.gsoKind 781 } 782 783 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. 784 func (e *endpoint) ARPHardwareType() header.ARPHardwareType { 785 if e.hdrSize > 0 { 786 return header.ARPHardwareEther 787 } 788 return header.ARPHardwareNone 789 } 790 791 // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes 792 // to the FD, but does not read from it. All reads come from injected packets. 793 type InjectableEndpoint struct { 794 endpoint 795 796 mu sync.RWMutex 797 // +checklocks:mu 798 dispatcher stack.NetworkDispatcher 799 } 800 801 // Attach saves the stack network-layer dispatcher for use later when packets 802 // are injected. 803 func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { 804 e.mu.Lock() 805 defer e.mu.Unlock() 806 e.dispatcher = dispatcher 807 } 808 809 // InjectInbound injects an inbound packet. If the endpoint is not attached, the 810 // packet is not delivered. 
811 func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { 812 e.mu.RLock() 813 d := e.dispatcher 814 e.mu.RUnlock() 815 if d != nil { 816 d.DeliverNetworkPacket(protocol, pkt) 817 } 818 } 819 820 // NewInjectable creates a new fd-based InjectableEndpoint. 821 func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (*InjectableEndpoint, error) { 822 unix.SetNonblock(fd, true) 823 isSocket, err := isSocketFD(fd) 824 if err != nil { 825 return nil, err 826 } 827 828 return &InjectableEndpoint{endpoint: endpoint{ 829 fds: []fdInfo{{fd: fd, isSocket: isSocket}}, 830 mtu: mtu, 831 caps: capabilities, 832 writevMaxIovs: rawfile.MaxIovs, 833 }}, nil 834 }