// github.com/noisysockets/netstack@v0.6.0/pkg/tcpip/link/fdbased/endpoint.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build linux
// +build linux

// Package fdbased provides the implementation of data-link layer endpoints
// backed by boundary-preserving file descriptors (e.g., TUN devices,
// seqpacket/datagram sockets).
//
// FD based endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
//
// FD based endpoints can use more than one file descriptor to read incoming
// packets. If more than one FD is specified and the underlying FD is an
// AF_PACKET socket then the endpoint will enable FANOUT mode on the socket so
// that the host kernel will consistently hash the packets to the sockets. This
// ensures that packets for the same TCP streams are not reordered.
//
// Similarly, if more than one FD is specified where the underlying FD is not
// AF_PACKET then it's the caller's responsibility to ensure that all inbound
// packets on the descriptors are consistently 5 tuple hashed to one of the
// descriptors to prevent TCP reordering.
//
// Since netstack today does not compute 5 tuple hashes for outgoing packets we
// only use the first FD to write outbound packets.
Once 5 tuple hashes for 39 // all outbound packets are available we will make use of all underlying FD's to 40 // write outbound packets. 41 package fdbased 42 43 import ( 44 "fmt" 45 46 "golang.org/x/sys/unix" 47 "github.com/noisysockets/netstack/pkg/atomicbitops" 48 "github.com/noisysockets/netstack/pkg/buffer" 49 "github.com/noisysockets/netstack/pkg/sync" 50 "github.com/noisysockets/netstack/pkg/tcpip" 51 "github.com/noisysockets/netstack/pkg/tcpip/header" 52 "github.com/noisysockets/netstack/pkg/tcpip/link/rawfile" 53 "github.com/noisysockets/netstack/pkg/tcpip/stack" 54 ) 55 56 // linkDispatcher reads packets from the link FD and dispatches them to the 57 // NetworkDispatcher. 58 type linkDispatcher interface { 59 Stop() 60 dispatch() (bool, tcpip.Error) 61 release() 62 } 63 64 // PacketDispatchMode are the various supported methods of receiving and 65 // dispatching packets from the underlying FD. 66 type PacketDispatchMode int 67 68 // BatchSize is the number of packets to write in each syscall. It is 47 69 // because when GVisorGSO is in use then a single 65KB TCP segment can get 70 // split into 46 segments of 1420 bytes and a single 216 byte segment. 71 const BatchSize = 47 72 73 const ( 74 // Readv is the default dispatch mode and is the least performant of the 75 // dispatch options but the one that is supported by all underlying FD 76 // types. 77 Readv PacketDispatchMode = iota 78 // RecvMMsg enables use of recvmmsg() syscall instead of readv() to 79 // read inbound packets. This reduces # of syscalls needed to process 80 // packets. 81 // 82 // NOTE: recvmmsg() is only supported for sockets, so if the underlying 83 // FD is not a socket then the code will still fall back to the readv() 84 // path. 85 RecvMMsg 86 // PacketMMap enables use of PACKET_RX_RING to receive packets from the 87 // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. 
The 88 // primary use-case for this is runsc which uses an AF_PACKET FD to 89 // receive packets from the veth device. 90 PacketMMap 91 ) 92 93 func (p PacketDispatchMode) String() string { 94 switch p { 95 case Readv: 96 return "Readv" 97 case RecvMMsg: 98 return "RecvMMsg" 99 case PacketMMap: 100 return "PacketMMap" 101 default: 102 return fmt.Sprintf("unknown packet dispatch mode '%d'", p) 103 } 104 } 105 106 var _ stack.LinkEndpoint = (*endpoint)(nil) 107 var _ stack.GSOEndpoint = (*endpoint)(nil) 108 109 type fdInfo struct { 110 fd int 111 isSocket bool 112 } 113 114 type endpoint struct { 115 // fds is the set of file descriptors each identifying one inbound/outbound 116 // channel. The endpoint will dispatch from all inbound channels as well as 117 // hash outbound packets to specific channels based on the packet hash. 118 fds []fdInfo 119 120 // mtu (maximum transmission unit) is the maximum size of a packet. 121 mtu uint32 122 123 // hdrSize specifies the link-layer header size. If set to 0, no header 124 // is added/removed; otherwise an ethernet header is used. 125 hdrSize int 126 127 // addr is the address of the endpoint. 128 addr tcpip.LinkAddress 129 130 // caps holds the endpoint capabilities. 131 caps stack.LinkEndpointCapabilities 132 133 // closed is a function to be called when the FD's peer (if any) closes 134 // its end of the communication pipe. 135 closed func(tcpip.Error) 136 137 inboundDispatchers []linkDispatcher 138 139 mu sync.RWMutex 140 // +checklocks:mu 141 dispatcher stack.NetworkDispatcher 142 143 // packetDispatchMode controls the packet dispatcher used by this 144 // endpoint. 145 packetDispatchMode PacketDispatchMode 146 147 // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is 148 // disabled. 149 gsoMaxSize uint32 150 151 // wg keeps track of running goroutines. 152 wg sync.WaitGroup 153 154 // gsoKind is the supported kind of GSO. 
155 gsoKind stack.SupportedGSO 156 157 // maxSyscallHeaderBytes has the same meaning as 158 // Options.MaxSyscallHeaderBytes. 159 maxSyscallHeaderBytes uintptr 160 161 // writevMaxIovs is the maximum number of iovecs that may be passed to 162 // rawfile.NonBlockingWriteIovec, as possibly limited by 163 // maxSyscallHeaderBytes. (No analogous limit is defined for 164 // rawfile.NonBlockingSendMMsg, since in that case the maximum number of 165 // iovecs also depends on the number of mmsghdrs. Instead, if sendBatch 166 // encounters a packet whose iovec count is limited by 167 // maxSyscallHeaderBytes, it falls back to writing the packet using writev 168 // via WritePacket.) 169 writevMaxIovs int 170 } 171 172 // Options specify the details about the fd-based endpoint to be created. 173 type Options struct { 174 // FDs is a set of FDs used to read/write packets. 175 FDs []int 176 177 // MTU is the mtu to use for this endpoint. 178 MTU uint32 179 180 // EthernetHeader if true, indicates that the endpoint should read/write 181 // ethernet frames instead of IP packets. 182 EthernetHeader bool 183 184 // ClosedFunc is a function to be called when an endpoint's peer (if 185 // any) closes its end of the communication pipe. 186 ClosedFunc func(tcpip.Error) 187 188 // Address is the link address for this endpoint. Only used if 189 // EthernetHeader is true. 190 Address tcpip.LinkAddress 191 192 // SaveRestore if true, indicates that this NIC capability set should 193 // include CapabilitySaveRestore 194 SaveRestore bool 195 196 // DisconnectOk if true, indicates that this NIC capability set should 197 // include CapabilityDisconnectOk. 198 DisconnectOk bool 199 200 // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is 201 // disabled. 202 GSOMaxSize uint32 203 204 // GVisorGSOEnabled indicates whether Gvisor GSO is enabled or not. 205 GVisorGSOEnabled bool 206 207 // PacketDispatchMode specifies the type of inbound dispatcher to be 208 // used for this endpoint. 
209 PacketDispatchMode PacketDispatchMode 210 211 // TXChecksumOffload if true, indicates that this endpoints capability 212 // set should include CapabilityTXChecksumOffload. 213 TXChecksumOffload bool 214 215 // RXChecksumOffload if true, indicates that this endpoints capability 216 // set should include CapabilityRXChecksumOffload. 217 RXChecksumOffload bool 218 219 // If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes 220 // of struct iovec, msghdr, and mmsghdr that may be passed by each host 221 // system call. 222 MaxSyscallHeaderBytes int 223 224 // InterfaceIndex is the interface index of the underlying device. 225 InterfaceIndex int 226 227 // GRO enables generic receive offload. 228 GRO bool 229 } 230 231 // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT 232 // support in the host kernel. This allows us to use multiple FD's to receive 233 // from the same underlying NIC. The fanoutID needs to be the same for a given 234 // set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT 235 // option for an FD with a fanoutID already in use by another FD for a different 236 // NIC will return an EINVAL. 237 // 238 // Since fanoutID must be unique within the network namespace, we start with 239 // the PID to avoid collisions. The only way to be sure of avoiding collisions 240 // is to run in a new network namespace. 241 var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid())) 242 243 // New creates a new fd-based endpoint. 244 // 245 // Makes fd non-blocking, but does not take ownership of fd, which must remain 246 // open for the lifetime of the returned endpoint (until after the endpoint has 247 // stopped being using and Wait returns). 
248 func New(opts *Options) (stack.LinkEndpoint, error) { 249 caps := stack.LinkEndpointCapabilities(0) 250 if opts.RXChecksumOffload { 251 caps |= stack.CapabilityRXChecksumOffload 252 } 253 254 if opts.TXChecksumOffload { 255 caps |= stack.CapabilityTXChecksumOffload 256 } 257 258 hdrSize := 0 259 if opts.EthernetHeader { 260 hdrSize = header.EthernetMinimumSize 261 caps |= stack.CapabilityResolutionRequired 262 } 263 264 if opts.SaveRestore { 265 caps |= stack.CapabilitySaveRestore 266 } 267 268 if opts.DisconnectOk { 269 caps |= stack.CapabilityDisconnectOk 270 } 271 272 if len(opts.FDs) == 0 { 273 return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") 274 } 275 276 if opts.MaxSyscallHeaderBytes < 0 { 277 return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative") 278 } 279 280 e := &endpoint{ 281 mtu: opts.MTU, 282 caps: caps, 283 closed: opts.ClosedFunc, 284 addr: opts.Address, 285 hdrSize: hdrSize, 286 packetDispatchMode: opts.PacketDispatchMode, 287 maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes), 288 writevMaxIovs: rawfile.MaxIovs, 289 } 290 if e.maxSyscallHeaderBytes != 0 { 291 if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs { 292 e.writevMaxIovs = max 293 } 294 } 295 296 // Increment fanoutID to ensure that we don't re-use the same fanoutID 297 // for the next endpoint. 298 fid := fanoutID.Add(1) 299 300 // Create per channel dispatchers. 
301 for _, fd := range opts.FDs { 302 if err := unix.SetNonblock(fd, true); err != nil { 303 return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err) 304 } 305 306 isSocket, err := isSocketFD(fd) 307 if err != nil { 308 return nil, err 309 } 310 e.fds = append(e.fds, fdInfo{fd: fd, isSocket: isSocket}) 311 if isSocket { 312 if opts.GSOMaxSize != 0 { 313 if opts.GVisorGSOEnabled { 314 e.gsoKind = stack.GVisorGSOSupported 315 } else { 316 e.gsoKind = stack.HostGSOSupported 317 } 318 e.gsoMaxSize = opts.GSOMaxSize 319 } 320 } 321 322 inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid, opts) 323 if err != nil { 324 return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) 325 } 326 e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) 327 } 328 329 return e, nil 330 } 331 332 func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32, opts *Options) (linkDispatcher, error) { 333 // By default use the readv() dispatcher as it works with all kinds of 334 // FDs (tap/tun/unix domain sockets and af_packet). 335 inboundDispatcher, err := newReadVDispatcher(fd, e) 336 if err != nil { 337 return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) 338 } 339 340 if isSocket { 341 sa, err := unix.Getsockname(fd) 342 if err != nil { 343 return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) 344 } 345 switch sa.(type) { 346 case *unix.SockaddrLinklayer: 347 // Enable PACKET_FANOUT mode if the underlying socket is of type 348 // AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will 349 // prevent gvisor from receiving fragmented packets and the host does the 350 // reassembly on our behalf before delivering the fragments. This makes it 351 // hard to test fragmentation reassembly code in Netstack. 352 // 353 // See: include/uapi/linux/if_packet.h (struct fanout_args). 354 // 355 // NOTE: We are using SetSockOptInt here even though the underlying 356 // option is actually a struct. 
The code follows the example in the 357 // kernel documentation as described at the link below: 358 // 359 // See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt 360 // 361 // This works out because the actual implementation for the option zero 362 // initializes the structure and will initialize the max_members field 363 // to a proper value if zero. 364 // 365 // See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881 366 const fanoutType = unix.PACKET_FANOUT_HASH 367 fanoutArg := (int(fID) & 0xffff) | fanoutType<<16 368 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { 369 return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) 370 } 371 } 372 373 switch e.packetDispatchMode { 374 case PacketMMap: 375 inboundDispatcher, err = newPacketMMapDispatcher(fd, e) 376 if err != nil { 377 return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) 378 } 379 case RecvMMsg: 380 // If the provided FD is a socket then we optimize 381 // packet reads by using recvmmsg() instead of read() to 382 // read packets in a batch. 383 inboundDispatcher, err = newRecvMMsgDispatcher(fd, e, opts) 384 if err != nil { 385 return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) 386 } 387 case Readv: 388 default: 389 return nil, fmt.Errorf("unknown dispatch mode %d", e.packetDispatchMode) 390 } 391 } 392 return inboundDispatcher, nil 393 } 394 395 func isSocketFD(fd int) (bool, error) { 396 var stat unix.Stat_t 397 if err := unix.Fstat(fd, &stat); err != nil { 398 return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err) 399 } 400 return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil 401 } 402 403 // Attach launches the goroutine that reads packets from the file descriptor and 404 // dispatches them via the provided dispatcher. If one is already attached, 405 // then nothing happens. 
406 // 407 // Attach implements stack.LinkEndpoint.Attach. 408 func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { 409 e.mu.Lock() 410 defer e.mu.Unlock() 411 412 // nil means the NIC is being removed. 413 if dispatcher == nil && e.dispatcher != nil { 414 for _, dispatcher := range e.inboundDispatchers { 415 dispatcher.Stop() 416 } 417 e.Wait() 418 e.dispatcher = nil 419 return 420 } 421 if dispatcher != nil && e.dispatcher == nil { 422 e.dispatcher = dispatcher 423 // Link endpoints are not savable. When transportation endpoints are 424 // saved, they stop sending outgoing packets and all incoming packets 425 // are rejected. 426 for i := range e.inboundDispatchers { 427 e.wg.Add(1) 428 go func(i int) { // S/R-SAFE: See above. 429 e.dispatchLoop(e.inboundDispatchers[i]) 430 e.wg.Done() 431 }(i) 432 } 433 } 434 } 435 436 // IsAttached implements stack.LinkEndpoint.IsAttached. 437 func (e *endpoint) IsAttached() bool { 438 e.mu.RLock() 439 defer e.mu.RUnlock() 440 return e.dispatcher != nil 441 } 442 443 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 444 // during construction. 445 func (e *endpoint) MTU() uint32 { 446 return e.mtu 447 } 448 449 // Capabilities implements stack.LinkEndpoint.Capabilities. 450 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { 451 return e.caps 452 } 453 454 // MaxHeaderLength returns the maximum size of the link-layer header. 455 func (e *endpoint) MaxHeaderLength() uint16 { 456 return uint16(e.hdrSize) 457 } 458 459 // LinkAddress returns the link address of this endpoint. 460 func (e *endpoint) LinkAddress() tcpip.LinkAddress { 461 return e.addr 462 } 463 464 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 465 // reading from its FD. 466 func (e *endpoint) Wait() { 467 e.wg.Wait() 468 } 469 470 // virtioNetHdr is declared in linux/virtio_net.h. 
471 type virtioNetHdr struct { 472 flags uint8 473 gsoType uint8 474 hdrLen uint16 475 gsoSize uint16 476 csumStart uint16 477 csumOffset uint16 478 } 479 480 // marshal serializes h to a newly-allocated byte slice, in little-endian byte 481 // order. 482 // 483 // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used 484 // for general serialization. This makes it difficult to use go-marshal for 485 // virtio types, as go-marshal implicitly uses the native byte ordering. 486 func (h *virtioNetHdr) marshal() []byte { 487 buf := [virtioNetHdrSize]byte{ 488 0: byte(h.flags), 489 1: byte(h.gsoType), 490 491 // Manually lay out the fields in little-endian byte order. Little endian => 492 // least significant bit goes to the lower address. 493 494 2: byte(h.hdrLen), 495 3: byte(h.hdrLen >> 8), 496 497 4: byte(h.gsoSize), 498 5: byte(h.gsoSize >> 8), 499 500 6: byte(h.csumStart), 501 7: byte(h.csumStart >> 8), 502 503 8: byte(h.csumOffset), 504 9: byte(h.csumOffset >> 8), 505 } 506 return buf[:] 507 } 508 509 // These constants are declared in linux/virtio_net.h. 510 const ( 511 _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 512 513 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 514 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 515 ) 516 517 // AddHeader implements stack.LinkEndpoint.AddHeader. 518 func (e *endpoint) AddHeader(pkt *stack.PacketBuffer) { 519 if e.hdrSize > 0 { 520 // Add ethernet header if needed. 521 eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) 522 eth.Encode(&header.EthernetFields{ 523 SrcAddr: pkt.EgressRoute.LocalLinkAddress, 524 DstAddr: pkt.EgressRoute.RemoteLinkAddress, 525 Type: pkt.NetworkProtocolNumber, 526 }) 527 } 528 } 529 530 func (e *endpoint) parseHeader(pkt *stack.PacketBuffer) bool { 531 _, ok := pkt.LinkHeader().Consume(e.hdrSize) 532 return ok 533 534 } 535 536 // ParseHeader implements stack.LinkEndpoint.ParseHeader. 
537 func (e *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { 538 if e.hdrSize > 0 { 539 return e.parseHeader(pkt) 540 } 541 return true 542 } 543 544 // writePacket writes outbound packets to the file descriptor. If it is not 545 // currently writable, the packet is dropped. 546 func (e *endpoint) writePacket(pkt *stack.PacketBuffer) tcpip.Error { 547 fdInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 548 fd := fdInfo.fd 549 var vnetHdrBuf []byte 550 if e.gsoKind == stack.HostGSOSupported { 551 vnetHdr := virtioNetHdr{} 552 if pkt.GSOOptions.Type != stack.GSONone { 553 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 554 if pkt.GSOOptions.NeedsCsum { 555 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 556 vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen 557 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 558 } 559 if uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 560 switch pkt.GSOOptions.Type { 561 case stack.GSOTCPv4: 562 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 563 case stack.GSOTCPv6: 564 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 565 default: 566 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 567 } 568 vnetHdr.gsoSize = pkt.GSOOptions.MSS 569 } 570 } 571 vnetHdrBuf = vnetHdr.marshal() 572 } 573 574 views := pkt.AsSlices() 575 numIovecs := len(views) 576 if len(vnetHdrBuf) != 0 { 577 numIovecs++ 578 } 579 if numIovecs > e.writevMaxIovs { 580 numIovecs = e.writevMaxIovs 581 } 582 583 // Allocate small iovec arrays on the stack. 
584 var iovecsArr [8]unix.Iovec 585 iovecs := iovecsArr[:0] 586 if numIovecs > len(iovecsArr) { 587 iovecs = make([]unix.Iovec, 0, numIovecs) 588 } 589 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 590 for _, v := range views { 591 iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) 592 } 593 return rawfile.NonBlockingWriteIovec(fd, iovecs) 594 } 595 596 func (e *endpoint) sendBatch(batchFDInfo fdInfo, pkts []*stack.PacketBuffer) (int, tcpip.Error) { 597 // Degrade to writePacket if underlying fd is not a socket. 598 if !batchFDInfo.isSocket { 599 var written int 600 var err tcpip.Error 601 for written < len(pkts) { 602 if err = e.writePacket(pkts[written]); err != nil { 603 break 604 } 605 written++ 606 } 607 return written, err 608 } 609 610 // Send a batch of packets through batchFD. 611 batchFD := batchFDInfo.fd 612 mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts)) 613 packets := 0 614 for packets < len(pkts) { 615 mmsgHdrs := mmsgHdrsStorage 616 batch := pkts[packets:] 617 syscallHeaderBytes := uintptr(0) 618 for _, pkt := range batch { 619 var vnetHdrBuf []byte 620 if e.gsoKind == stack.HostGSOSupported { 621 vnetHdr := virtioNetHdr{} 622 if pkt.GSOOptions.Type != stack.GSONone { 623 vnetHdr.hdrLen = uint16(pkt.HeaderSize()) 624 if pkt.GSOOptions.NeedsCsum { 625 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 626 vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen 627 vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset 628 } 629 if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { 630 switch pkt.GSOOptions.Type { 631 case stack.GSOTCPv4: 632 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 633 case stack.GSOTCPv6: 634 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 635 default: 636 panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) 637 } 638 vnetHdr.gsoSize = pkt.GSOOptions.MSS 639 } 640 } 641 vnetHdrBuf = vnetHdr.marshal() 642 } 643 644 views, offset := 
pkt.AsViewList() 645 var skipped int 646 var view *buffer.View 647 for view = views.Front(); view != nil && offset >= view.Size(); view = view.Next() { 648 offset -= view.Size() 649 skipped++ 650 } 651 652 // We've made it to the usable views. 653 numIovecs := views.Len() - skipped 654 if len(vnetHdrBuf) != 0 { 655 numIovecs++ 656 } 657 if numIovecs > rawfile.MaxIovs { 658 numIovecs = rawfile.MaxIovs 659 } 660 if e.maxSyscallHeaderBytes != 0 { 661 syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec 662 if syscallHeaderBytes > e.maxSyscallHeaderBytes { 663 // We can't fit this packet into this call to sendmmsg(). 664 // We could potentially do so if we reduced numIovecs 665 // further, but this might incur considerable extra 666 // copying. Leave it to the next batch instead. 667 break 668 } 669 } 670 671 // We can't easily allocate iovec arrays on the stack here since 672 // they will escape this loop iteration via mmsgHdrs. 673 iovecs := make([]unix.Iovec, 0, numIovecs) 674 iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) 675 // At most one slice has a non-zero offset. 676 iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice()[offset:], numIovecs) 677 for view = view.Next(); view != nil; view = view.Next() { 678 iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice(), numIovecs) 679 } 680 681 var mmsgHdr rawfile.MMsgHdr 682 mmsgHdr.Msg.Iov = &iovecs[0] 683 mmsgHdr.Msg.SetIovlen(len(iovecs)) 684 mmsgHdrs = append(mmsgHdrs, mmsgHdr) 685 } 686 687 if len(mmsgHdrs) == 0 { 688 // We can't fit batch[0] into a mmsghdr while staying under 689 // e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the 690 // mmsghdr (by using writev) and re-buffer iovecs more aggressively 691 // if necessary (by using e.writevMaxIovs instead of 692 // rawfile.MaxIovs). 
693 pkt := batch[0] 694 if err := e.writePacket(pkt); err != nil { 695 return packets, err 696 } 697 packets++ 698 } else { 699 for len(mmsgHdrs) > 0 { 700 sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) 701 if err != nil { 702 return packets, err 703 } 704 packets += sent 705 mmsgHdrs = mmsgHdrs[sent:] 706 } 707 } 708 } 709 710 return packets, nil 711 } 712 713 // WritePackets writes outbound packets to the underlying file descriptors. If 714 // one is not currently writable, the packet is dropped. 715 // 716 // Being a batch API, each packet in pkts should have the following 717 // fields populated: 718 // - pkt.EgressRoute 719 // - pkt.GSOOptions 720 // - pkt.NetworkProtocolNumber 721 func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { 722 // Preallocate to avoid repeated reallocation as we append to batch. 723 batch := make([]*stack.PacketBuffer, 0, BatchSize) 724 batchFDInfo := fdInfo{fd: -1, isSocket: false} 725 sentPackets := 0 726 for _, pkt := range pkts.AsSlice() { 727 if len(batch) == 0 { 728 batchFDInfo = e.fds[pkt.Hash%uint32(len(e.fds))] 729 } 730 pktFDInfo := e.fds[pkt.Hash%uint32(len(e.fds))] 731 if sendNow := pktFDInfo != batchFDInfo; !sendNow { 732 batch = append(batch, pkt) 733 continue 734 } 735 n, err := e.sendBatch(batchFDInfo, batch) 736 sentPackets += n 737 if err != nil { 738 return sentPackets, err 739 } 740 batch = batch[:0] 741 batch = append(batch, pkt) 742 batchFDInfo = pktFDInfo 743 } 744 745 if len(batch) != 0 { 746 n, err := e.sendBatch(batchFDInfo, batch) 747 sentPackets += n 748 if err != nil { 749 return sentPackets, err 750 } 751 } 752 return sentPackets, nil 753 } 754 755 // InjectOutbound implements stack.InjectableEndpoint.InjectOutbound. 
756 func (e *endpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error { 757 return rawfile.NonBlockingWrite(e.fds[0].fd, packet.AsSlice()) 758 } 759 760 // dispatchLoop reads packets from the file descriptor in a loop and dispatches 761 // them to the network stack. 762 func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error { 763 for { 764 cont, err := inboundDispatcher.dispatch() 765 if err != nil || !cont { 766 if e.closed != nil { 767 e.closed(err) 768 } 769 inboundDispatcher.release() 770 return err 771 } 772 } 773 } 774 775 // GSOMaxSize implements stack.GSOEndpoint. 776 func (e *endpoint) GSOMaxSize() uint32 { 777 return e.gsoMaxSize 778 } 779 780 // SupportedGSO implements stack.GSOEndpoint. 781 func (e *endpoint) SupportedGSO() stack.SupportedGSO { 782 return e.gsoKind 783 } 784 785 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. 786 func (e *endpoint) ARPHardwareType() header.ARPHardwareType { 787 if e.hdrSize > 0 { 788 return header.ARPHardwareEther 789 } 790 return header.ARPHardwareNone 791 } 792 793 // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes 794 // to the FD, but does not read from it. All reads come from injected packets. 795 type InjectableEndpoint struct { 796 endpoint 797 798 mu sync.RWMutex 799 // +checklocks:mu 800 dispatcher stack.NetworkDispatcher 801 } 802 803 // Attach saves the stack network-layer dispatcher for use later when packets 804 // are injected. 805 func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { 806 e.mu.Lock() 807 defer e.mu.Unlock() 808 e.dispatcher = dispatcher 809 } 810 811 // InjectInbound injects an inbound packet. If the endpoint is not attached, the 812 // packet is not delivered. 
813 func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { 814 e.mu.RLock() 815 d := e.dispatcher 816 e.mu.RUnlock() 817 if d != nil { 818 d.DeliverNetworkPacket(protocol, pkt) 819 } 820 } 821 822 // NewInjectable creates a new fd-based InjectableEndpoint. 823 func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (*InjectableEndpoint, error) { 824 unix.SetNonblock(fd, true) 825 isSocket, err := isSocketFD(fd) 826 if err != nil { 827 return nil, err 828 } 829 830 return &InjectableEndpoint{endpoint: endpoint{ 831 fds: []fdInfo{{fd: fd, isSocket: isSocket}}, 832 mtu: mtu, 833 caps: capabilities, 834 writevMaxIovs: rawfile.MaxIovs, 835 }}, nil 836 }