github.com/google/netstack@v0.0.0-20191123085552-55fcc16cd0eb/tcpip/link/fdbased/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // +build linux 16 17 // Package fdbased provides the implemention of data-link layer endpoints 18 // backed by boundary-preserving file descriptors (e.g., TUN devices, 19 // seqpacket/datagram sockets). 20 // 21 // FD based endpoints can be used in the networking stack by calling New() to 22 // create a new endpoint, and then passing it as an argument to 23 // Stack.CreateNIC(). 24 // 25 // FD based endpoints can use more than one file descriptor to read incoming 26 // packets. If there are more than one FDs specified and the underlying FD is an 27 // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the 28 // host kernel will consistently hash the packets to the sockets. This ensures 29 // that packets for the same TCP streams are not reordered. 30 // 31 // Similarly if more than one FD's are specified where the underlying FD is not 32 // AF_PACKET then it's the caller's responsibility to ensure that all inbound 33 // packets on the descriptors are consistently 5 tuple hashed to one of the 34 // descriptors to prevent TCP reordering. 35 // 36 // Since netstack today does not compute 5 tuple hashes for outgoing packets we 37 // only use the first FD to write outbound packets. Once 5 tuple hashes for 38 // all outbound packets are available we will make use of all underlying FD's to 39 // write outbound packets. 40 package fdbased 41 42 import ( 43 "fmt" 44 "sync" 45 "syscall" 46 47 "github.com/google/netstack/tcpip" 48 "github.com/google/netstack/tcpip/buffer" 49 "github.com/google/netstack/tcpip/header" 50 "github.com/google/netstack/tcpip/link/rawfile" 51 "github.com/google/netstack/tcpip/stack" 52 "golang.org/x/sys/unix" 53 ) 54 55 // linkDispatcher reads packets from the link FD and dispatches them to the 56 // NetworkDispatcher. 57 type linkDispatcher interface { 58 dispatch() (bool, *tcpip.Error) 59 } 60 61 // PacketDispatchMode are the various supported methods of receiving and 62 // dispatching packets from the underlying FD. 63 type PacketDispatchMode int 64 65 const ( 66 // Readv is the default dispatch mode and is the least performant of the 67 // dispatch options but the one that is supported by all underlying FD 68 // types. 69 Readv PacketDispatchMode = iota 70 // RecvMMsg enables use of recvmmsg() syscall instead of readv() to 71 // read inbound packets. This reduces # of syscalls needed to process 72 // packets. 73 // 74 // NOTE: recvmmsg() is only supported for sockets, so if the underlying 75 // FD is not a socket then the code will still fall back to the readv() 76 // path. 77 RecvMMsg 78 // PacketMMap enables use of PACKET_RX_RING to receive packets from the 79 // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The 80 // primary use-case for this is runsc which uses an AF_PACKET FD to 81 // receive packets from the veth device. 82 PacketMMap 83 ) 84 85 func (p PacketDispatchMode) String() string { 86 switch p { 87 case Readv: 88 return "Readv" 89 case RecvMMsg: 90 return "RecvMMsg" 91 case PacketMMap: 92 return "PacketMMap" 93 default: 94 return fmt.Sprintf("unknown packet dispatch mode %v", p) 95 } 96 } 97 98 type endpoint struct { 99 // fds is the set of file descriptors each identifying one inbound/outbound 100 // channel. The endpoint will dispatch from all inbound channels as well as 101 // hash outbound packets to specific channels based on the packet hash. 102 fds []int 103 104 // mtu (maximum transmission unit) is the maximum size of a packet. 105 mtu uint32 106 107 // hdrSize specifies the link-layer header size. If set to 0, no header 108 // is added/removed; otherwise an ethernet header is used. 109 hdrSize int 110 111 // addr is the address of the endpoint. 112 addr tcpip.LinkAddress 113 114 // caps holds the endpoint capabilities. 115 caps stack.LinkEndpointCapabilities 116 117 // closed is a function to be called when the FD's peer (if any) closes 118 // its end of the communication pipe. 119 closed func(*tcpip.Error) 120 121 inboundDispatchers []linkDispatcher 122 dispatcher stack.NetworkDispatcher 123 124 // packetDispatchMode controls the packet dispatcher used by this 125 // endpoint. 126 packetDispatchMode PacketDispatchMode 127 128 // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is 129 // disabled. 130 gsoMaxSize uint32 131 132 // wg keeps track of running goroutines. 133 wg sync.WaitGroup 134 } 135 136 // Options specify the details about the fd-based endpoint to be created. 137 type Options struct { 138 // FDs is a set of FDs used to read/write packets. 139 FDs []int 140 141 // MTU is the mtu to use for this endpoint. 142 MTU uint32 143 144 // EthernetHeader if true, indicates that the endpoint should read/write 145 // ethernet frames instead of IP packets. 146 EthernetHeader bool 147 148 // ClosedFunc is a function to be called when an endpoint's peer (if 149 // any) closes its end of the communication pipe. 150 ClosedFunc func(*tcpip.Error) 151 152 // Address is the link address for this endpoint. Only used if 153 // EthernetHeader is true. 154 Address tcpip.LinkAddress 155 156 // SaveRestore if true, indicates that this NIC capability set should 157 // include CapabilitySaveRestore 158 SaveRestore bool 159 160 // DisconnectOk if true, indicates that this NIC capability set should 161 // include CapabilityDisconnectOk. 162 DisconnectOk bool 163 164 // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is 165 // disabled. 166 GSOMaxSize uint32 167 168 // SoftwareGSOEnabled indicates whether software GSO is enabled or not. 169 SoftwareGSOEnabled bool 170 171 // PacketDispatchMode specifies the type of inbound dispatcher to be 172 // used for this endpoint. 173 PacketDispatchMode PacketDispatchMode 174 175 // TXChecksumOffload if true, indicates that this endpoints capability 176 // set should include CapabilityTXChecksumOffload. 177 TXChecksumOffload bool 178 179 // RXChecksumOffload if true, indicates that this endpoints capability 180 // set should include CapabilityRXChecksumOffload. 181 RXChecksumOffload bool 182 } 183 184 // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT 185 // support in the host kernel. This allows us to use multiple FD's to receive 186 // from the same underlying NIC. The fanoutID needs to be the same for a given 187 // set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT 188 // option for an FD with a fanoutID already in use by another FD for a different 189 // NIC will return an EINVAL. 190 var fanoutID = 1 191 192 // New creates a new fd-based endpoint. 193 // 194 // Makes fd non-blocking, but does not take ownership of fd, which must remain 195 // open for the lifetime of the returned endpoint (until after the endpoint has 196 // stopped being using and Wait returns). 197 func New(opts *Options) (stack.LinkEndpoint, error) { 198 caps := stack.LinkEndpointCapabilities(0) 199 if opts.RXChecksumOffload { 200 caps |= stack.CapabilityRXChecksumOffload 201 } 202 203 if opts.TXChecksumOffload { 204 caps |= stack.CapabilityTXChecksumOffload 205 } 206 207 hdrSize := 0 208 if opts.EthernetHeader { 209 hdrSize = header.EthernetMinimumSize 210 caps |= stack.CapabilityResolutionRequired 211 } 212 213 if opts.SaveRestore { 214 caps |= stack.CapabilitySaveRestore 215 } 216 217 if opts.DisconnectOk { 218 caps |= stack.CapabilityDisconnectOk 219 } 220 221 if len(opts.FDs) == 0 { 222 return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") 223 } 224 225 e := &endpoint{ 226 fds: opts.FDs, 227 mtu: opts.MTU, 228 caps: caps, 229 closed: opts.ClosedFunc, 230 addr: opts.Address, 231 hdrSize: hdrSize, 232 packetDispatchMode: opts.PacketDispatchMode, 233 } 234 235 // Create per channel dispatchers. 236 for i := 0; i < len(e.fds); i++ { 237 fd := e.fds[i] 238 if err := syscall.SetNonblock(fd, true); err != nil { 239 return nil, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err) 240 } 241 242 isSocket, err := isSocketFD(fd) 243 if err != nil { 244 return nil, err 245 } 246 if isSocket { 247 if opts.GSOMaxSize != 0 { 248 if opts.SoftwareGSOEnabled { 249 e.caps |= stack.CapabilitySoftwareGSO 250 } else { 251 e.caps |= stack.CapabilityHardwareGSO 252 } 253 e.gsoMaxSize = opts.GSOMaxSize 254 } 255 } 256 inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket) 257 if err != nil { 258 return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) 259 } 260 e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) 261 } 262 263 // Increment fanoutID to ensure that we don't re-use the same fanoutID for 264 // the next endpoint. 265 fanoutID++ 266 267 return e, nil 268 } 269 270 func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) { 271 // By default use the readv() dispatcher as it works with all kinds of 272 // FDs (tap/tun/unix domain sockets and af_packet). 273 inboundDispatcher, err := newReadVDispatcher(fd, e) 274 if err != nil { 275 return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) 276 } 277 278 if isSocket { 279 sa, err := unix.Getsockname(fd) 280 if err != nil { 281 return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) 282 } 283 switch sa.(type) { 284 case *unix.SockaddrLinklayer: 285 // enable PACKET_FANOUT mode is the underlying socket is 286 // of type AF_PACKET. 287 const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG 288 fanoutArg := fanoutID | fanoutType<<16 289 if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { 290 return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) 291 } 292 } 293 294 switch e.packetDispatchMode { 295 case PacketMMap: 296 inboundDispatcher, err = newPacketMMapDispatcher(fd, e) 297 if err != nil { 298 return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) 299 } 300 case RecvMMsg: 301 // If the provided FD is a socket then we optimize 302 // packet reads by using recvmmsg() instead of read() to 303 // read packets in a batch. 304 inboundDispatcher, err = newRecvMMsgDispatcher(fd, e) 305 if err != nil { 306 return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) 307 } 308 } 309 } 310 return inboundDispatcher, nil 311 } 312 313 func isSocketFD(fd int) (bool, error) { 314 var stat syscall.Stat_t 315 if err := syscall.Fstat(fd, &stat); err != nil { 316 return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err) 317 } 318 return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK, nil 319 } 320 321 // Attach launches the goroutine that reads packets from the file descriptor and 322 // dispatches them via the provided dispatcher. 323 func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { 324 e.dispatcher = dispatcher 325 // Link endpoints are not savable. When transportation endpoints are 326 // saved, they stop sending outgoing packets and all incoming packets 327 // are rejected. 328 for i := range e.inboundDispatchers { 329 e.wg.Add(1) 330 go func(i int) { 331 e.dispatchLoop(e.inboundDispatchers[i]) 332 e.wg.Done() 333 }(i) 334 } 335 } 336 337 // IsAttached implements stack.LinkEndpoint.IsAttached. 338 func (e *endpoint) IsAttached() bool { 339 return e.dispatcher != nil 340 } 341 342 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 343 // during construction. 344 func (e *endpoint) MTU() uint32 { 345 return e.mtu 346 } 347 348 // Capabilities implements stack.LinkEndpoint.Capabilities. 349 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { 350 return e.caps 351 } 352 353 // MaxHeaderLength returns the maximum size of the link-layer header. 354 func (e *endpoint) MaxHeaderLength() uint16 { 355 return uint16(e.hdrSize) 356 } 357 358 // LinkAddress returns the link address of this endpoint. 359 func (e *endpoint) LinkAddress() tcpip.LinkAddress { 360 return e.addr 361 } 362 363 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 364 // reading from its FD. 365 func (e *endpoint) Wait() { 366 e.wg.Wait() 367 } 368 369 // virtioNetHdr is declared in linux/virtio_net.h. 370 type virtioNetHdr struct { 371 flags uint8 372 gsoType uint8 373 hdrLen uint16 374 gsoSize uint16 375 csumStart uint16 376 csumOffset uint16 377 } 378 379 // These constants are declared in linux/virtio_net.h. 380 const ( 381 _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 382 383 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 384 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 385 ) 386 387 // WritePacket writes outbound packets to the file descriptor. If it is not 388 // currently writable, the packet is dropped. 389 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error { 390 if e.hdrSize > 0 { 391 // Add ethernet header if needed. 392 eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize)) 393 pkt.LinkHeader = buffer.View(eth) 394 ethHdr := &header.EthernetFields{ 395 DstAddr: r.RemoteLinkAddress, 396 Type: protocol, 397 } 398 399 // Preserve the src address if it's set in the route. 400 if r.LocalLinkAddress != "" { 401 ethHdr.SrcAddr = r.LocalLinkAddress 402 } else { 403 ethHdr.SrcAddr = e.addr 404 } 405 eth.Encode(ethHdr) 406 } 407 408 if e.Capabilities()&stack.CapabilityHardwareGSO != 0 { 409 vnetHdr := virtioNetHdr{} 410 vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr) 411 if gso != nil { 412 vnetHdr.hdrLen = uint16(pkt.Header.UsedLength()) 413 if gso.NeedsCsum { 414 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 415 vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen 416 vnetHdr.csumOffset = gso.CsumOffset 417 } 418 if gso.Type != stack.GSONone && uint16(pkt.Data.Size()) > gso.MSS { 419 switch gso.Type { 420 case stack.GSOTCPv4: 421 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 422 case stack.GSOTCPv6: 423 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 424 default: 425 panic(fmt.Sprintf("Unknown gso type: %v", gso.Type)) 426 } 427 vnetHdr.gsoSize = gso.MSS 428 } 429 } 430 431 return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView()) 432 } 433 434 if pkt.Data.Size() == 0 { 435 return rawfile.NonBlockingWrite(e.fds[0], pkt.Header.View()) 436 } 437 438 return rawfile.NonBlockingWrite3(e.fds[0], pkt.Header.View(), pkt.Data.ToView(), nil) 439 } 440 441 // WritePackets writes outbound packets to the file descriptor. If it is not 442 // currently writable, the packet is dropped. 443 func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { 444 var ethHdrBuf []byte 445 // hdr + data 446 iovLen := 2 447 if e.hdrSize > 0 { 448 // Add ethernet header if needed. 449 ethHdrBuf = make([]byte, header.EthernetMinimumSize) 450 eth := header.Ethernet(ethHdrBuf) 451 ethHdr := &header.EthernetFields{ 452 DstAddr: r.RemoteLinkAddress, 453 Type: protocol, 454 } 455 456 // Preserve the src address if it's set in the route. 457 if r.LocalLinkAddress != "" { 458 ethHdr.SrcAddr = r.LocalLinkAddress 459 } else { 460 ethHdr.SrcAddr = e.addr 461 } 462 eth.Encode(ethHdr) 463 iovLen++ 464 } 465 466 n := len(hdrs) 467 468 views := payload.Views() 469 /* 470 * Each bondary in views can add one more iovec. 471 * 472 * payload | | | | 473 * ----------------------------- 474 * packets | | | | | | | 475 * ----------------------------- 476 * iovecs | | | | | | | | | 477 */ 478 iovec := make([]syscall.Iovec, n*iovLen+len(views)-1) 479 mmsgHdrs := make([]rawfile.MMsgHdr, n) 480 481 iovecIdx := 0 482 viewIdx := 0 483 viewOff := 0 484 off := 0 485 nextOff := 0 486 for i := range hdrs { 487 prevIovecIdx := iovecIdx 488 mmsgHdr := &mmsgHdrs[i] 489 mmsgHdr.Msg.Iov = &iovec[iovecIdx] 490 packetSize := hdrs[i].Size 491 hdr := &hdrs[i].Hdr 492 493 off = hdrs[i].Off 494 if off != nextOff { 495 // We stop in a different point last time. 496 size := packetSize 497 viewIdx = 0 498 viewOff = 0 499 for size > 0 { 500 if size >= len(views[viewIdx]) { 501 viewIdx++ 502 viewOff = 0 503 size -= len(views[viewIdx]) 504 } else { 505 viewOff = size 506 size = 0 507 } 508 } 509 } 510 nextOff = off + packetSize 511 512 if ethHdrBuf != nil { 513 v := &iovec[iovecIdx] 514 v.Base = ðHdrBuf[0] 515 v.Len = uint64(len(ethHdrBuf)) 516 iovecIdx++ 517 } 518 519 v := &iovec[iovecIdx] 520 hdrView := hdr.View() 521 v.Base = &hdrView[0] 522 v.Len = uint64(len(hdrView)) 523 iovecIdx++ 524 525 for packetSize > 0 { 526 vec := &iovec[iovecIdx] 527 iovecIdx++ 528 529 v := views[viewIdx] 530 vec.Base = &v[viewOff] 531 s := len(v) - viewOff 532 if s <= packetSize { 533 viewIdx++ 534 viewOff = 0 535 } else { 536 s = packetSize 537 viewOff += s 538 } 539 vec.Len = uint64(s) 540 packetSize -= s 541 } 542 543 mmsgHdr.Msg.Iovlen = uint64(iovecIdx - prevIovecIdx) 544 } 545 546 packets := 0 547 for packets < n { 548 sent, err := rawfile.NonBlockingSendMMsg(e.fds[0], mmsgHdrs) 549 if err != nil { 550 return packets, err 551 } 552 packets += sent 553 mmsgHdrs = mmsgHdrs[sent:] 554 } 555 return packets, nil 556 } 557 558 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. 559 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { 560 return rawfile.NonBlockingWrite(e.fds[0], vv.ToView()) 561 } 562 563 // InjectOutobund implements stack.InjectableEndpoint.InjectOutbound. 564 func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error { 565 return rawfile.NonBlockingWrite(e.fds[0], packet) 566 } 567 568 // dispatchLoop reads packets from the file descriptor in a loop and dispatches 569 // them to the network stack. 570 func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error { 571 for { 572 cont, err := inboundDispatcher.dispatch() 573 if err != nil || !cont { 574 if e.closed != nil { 575 e.closed(err) 576 } 577 return err 578 } 579 } 580 } 581 582 // GSOMaxSize returns the maximum GSO packet size. 583 func (e *endpoint) GSOMaxSize() uint32 { 584 return e.gsoMaxSize 585 } 586 587 // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes 588 // to the FD, but does not read from it. All reads come from injected packets. 589 type InjectableEndpoint struct { 590 endpoint 591 592 dispatcher stack.NetworkDispatcher 593 } 594 595 // Attach saves the stack network-layer dispatcher for use later when packets 596 // are injected. 597 func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { 598 e.dispatcher = dispatcher 599 } 600 601 // InjectInbound injects an inbound packet. 602 func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) { 603 e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt) 604 } 605 606 // NewInjectable creates a new fd-based InjectableEndpoint. 607 func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) *InjectableEndpoint { 608 syscall.SetNonblock(fd, true) 609 610 return &InjectableEndpoint{endpoint: endpoint{ 611 fds: []int{fd}, 612 mtu: mtu, 613 caps: capabilities, 614 }} 615 }