github.com/flowerwrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/link/fdbased/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package fdbased provides the implementation of data-link layer endpoints 16 // backed by boundary-preserving file descriptors (e.g., TUN devices, 17 // seqpacket/datagram sockets). 18 // 19 // FD based endpoints can be used in the networking stack by calling New() to 20 // create a new endpoint, and then passing it as an argument to 21 // Stack.CreateNIC(). 22 // 23 // FD based endpoints can use more than one file descriptor to read incoming 24 // packets. If more than one FD is specified and the underlying FD is an 25 // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the 26 // host kernel will consistently hash the packets to the sockets. This ensures 27 // that packets for the same TCP streams are not reordered. 28 // 29 // Similarly if more than one FD is specified where the underlying FD is not 30 // AF_PACKET then it's the caller's responsibility to ensure that all inbound 31 // packets on the descriptors are consistently 5 tuple hashed to one of the 32 // descriptors to prevent TCP reordering. 33 // 34 // Since netstack today does not compute 5 tuple hashes for outgoing packets we 35 // only use the first FD to write outbound packets. 
Once 5 tuple hashes for 36 // all outbound packets are available we will make use of all underlying FD's to 37 // write outbound packets. 38 package fdbased 39 40 import ( 41 "fmt" 42 "sync" 43 "syscall" 44 45 "github.com/FlowerWrong/netstack/tcpip" 46 "github.com/FlowerWrong/netstack/tcpip/buffer" 47 "github.com/FlowerWrong/netstack/tcpip/header" 48 "github.com/FlowerWrong/netstack/tcpip/link/rawfile" 49 "github.com/FlowerWrong/netstack/tcpip/stack" 50 "golang.org/x/sys/unix" 51 ) 52 53 // linkDispatcher reads packets from the link FD and dispatches them to the 54 // NetworkDispatcher. 55 type linkDispatcher interface { 56 dispatch() (bool, *tcpip.Error) 57 } 58 59 // PacketDispatchMode are the various supported methods of receiving and 60 // dispatching packets from the underlying FD. 61 type PacketDispatchMode int 62 63 const ( 64 // Readv is the default dispatch mode and is the least performant of the 65 // dispatch options but the one that is supported by all underlying FD 66 // types. 67 Readv PacketDispatchMode = iota 68 // RecvMMsg enables use of recvmmsg() syscall instead of readv() to 69 // read inbound packets. This reduces # of syscalls needed to process 70 // packets. 71 // 72 // NOTE: recvmmsg() is only supported for sockets, so if the underlying 73 // FD is not a socket then the code will still fall back to the readv() 74 // path. 75 RecvMMsg 76 // PacketMMap enables use of PACKET_RX_RING to receive packets from the 77 // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The 78 // primary use-case for this is runsc which uses an AF_PACKET FD to 79 // receive packets from the veth device. 
80 PacketMMap 81 ) 82 83 func (p PacketDispatchMode) String() string { 84 switch p { 85 case Readv: 86 return "Readv" 87 case RecvMMsg: 88 return "RecvMMsg" 89 case PacketMMap: 90 return "PacketMMap" 91 default: 92 return fmt.Sprintf("unknown packet dispatch mode %v", p) 93 } 94 } 95 96 type endpoint struct { 97 // fds is the set of file descriptors each identifying one inbound/outbound 98 // channel. The endpoint will dispatch from all inbound channels as well as 99 // hash outbound packets to specific channels based on the packet hash. 100 fds []int 101 102 // mtu (maximum transmission unit) is the maximum size of a packet. 103 mtu uint32 104 105 // hdrSize specifies the link-layer header size. If set to 0, no header 106 // is added/removed; otherwise an ethernet header is used. 107 hdrSize int 108 109 // addr is the address of the endpoint. 110 addr tcpip.LinkAddress 111 112 // caps holds the endpoint capabilities. 113 caps stack.LinkEndpointCapabilities 114 115 // closed is a function to be called when the FD's peer (if any) closes 116 // its end of the communication pipe. 117 closed func(*tcpip.Error) 118 119 inboundDispatchers []linkDispatcher 120 dispatcher stack.NetworkDispatcher 121 122 // packetDispatchMode controls the packet dispatcher used by this 123 // endpoint. 124 packetDispatchMode PacketDispatchMode 125 126 // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is 127 // disabled. 128 gsoMaxSize uint32 129 130 // wg keeps track of running goroutines. 131 wg sync.WaitGroup 132 } 133 134 // Options specify the details about the fd-based endpoint to be created. 135 type Options struct { 136 // FDs is a set of FDs used to read/write packets. 137 FDs []int 138 139 // MTU is the mtu to use for this endpoint. 140 MTU uint32 141 142 // EthernetHeader if true, indicates that the endpoint should read/write 143 // ethernet frames instead of IP packets. 
144 EthernetHeader bool 145 146 // ClosedFunc is a function to be called when an endpoint's peer (if 147 // any) closes its end of the communication pipe. 148 ClosedFunc func(*tcpip.Error) 149 150 // Address is the link address for this endpoint. Only used if 151 // EthernetHeader is true. 152 Address tcpip.LinkAddress 153 154 // SaveRestore if true, indicates that this NIC capability set should 155 // include CapabilitySaveRestore 156 SaveRestore bool 157 158 // DisconnectOk if true, indicates that this NIC capability set should 159 // include CapabilityDisconnectOk. 160 DisconnectOk bool 161 162 // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is 163 // disabled. 164 GSOMaxSize uint32 165 166 // PacketDispatchMode specifies the type of inbound dispatcher to be 167 // used for this endpoint. 168 PacketDispatchMode PacketDispatchMode 169 170 // TXChecksumOffload if true, indicates that this endpoints capability 171 // set should include CapabilityTXChecksumOffload. 172 TXChecksumOffload bool 173 174 // RXChecksumOffload if true, indicates that this endpoints capability 175 // set should include CapabilityRXChecksumOffload. 176 RXChecksumOffload bool 177 } 178 179 // New creates a new fd-based endpoint. 180 // 181 // Makes fd non-blocking, but does not take ownership of fd, which must remain 182 // open for the lifetime of the returned endpoint (until after the endpoint has 183 // stopped being using and Wait returns). 
184 func New(opts *Options) (stack.LinkEndpoint, error) { 185 caps := stack.LinkEndpointCapabilities(0) 186 if opts.RXChecksumOffload { 187 caps |= stack.CapabilityRXChecksumOffload 188 } 189 190 if opts.TXChecksumOffload { 191 caps |= stack.CapabilityTXChecksumOffload 192 } 193 194 hdrSize := 0 195 if opts.EthernetHeader { 196 hdrSize = header.EthernetMinimumSize 197 caps |= stack.CapabilityResolutionRequired 198 } 199 200 if opts.SaveRestore { 201 caps |= stack.CapabilitySaveRestore 202 } 203 204 if opts.DisconnectOk { 205 caps |= stack.CapabilityDisconnectOk 206 } 207 208 if len(opts.FDs) == 0 { 209 return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") 210 } 211 212 e := &endpoint{ 213 fds: opts.FDs, 214 mtu: opts.MTU, 215 caps: caps, 216 closed: opts.ClosedFunc, 217 addr: opts.Address, 218 hdrSize: hdrSize, 219 packetDispatchMode: opts.PacketDispatchMode, 220 } 221 222 // Create per channel dispatchers. 223 for i := 0; i < len(e.fds); i++ { 224 fd := e.fds[i] 225 if err := syscall.SetNonblock(fd, true); err != nil { 226 return nil, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err) 227 } 228 229 isSocket, err := isSocketFD(fd) 230 if err != nil { 231 return nil, err 232 } 233 if isSocket { 234 if opts.GSOMaxSize != 0 { 235 e.caps |= stack.CapabilityGSO 236 e.gsoMaxSize = opts.GSOMaxSize 237 } 238 } 239 inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket) 240 if err != nil { 241 return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) 242 } 243 e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) 244 } 245 246 return e, nil 247 } 248 249 func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) { 250 // By default use the readv() dispatcher as it works with all kinds of 251 // FDs (tap/tun/unix domain sockets and af_packet). 
252 inboundDispatcher, err := newReadVDispatcher(fd, e) 253 if err != nil { 254 return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) 255 } 256 257 if isSocket { 258 sa, err := unix.Getsockname(fd) 259 if err != nil { 260 return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) 261 } 262 switch sa.(type) { 263 //case *unix.SockaddrLinklayer: 264 // // enable PACKET_FANOUT mode is the underlying socket is 265 // // of type AF_PACKET. 266 // const fanoutID = 1 267 // const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG 268 // fanoutArg := fanoutID | fanoutType<<16 269 // if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { 270 // return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) 271 // } 272 } 273 274 switch e.packetDispatchMode { 275 case PacketMMap: 276 inboundDispatcher, err = newPacketMMapDispatcher(fd, e) 277 if err != nil { 278 return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) 279 } 280 case RecvMMsg: 281 // If the provided FD is a socket then we optimize 282 // packet reads by using recvmmsg() instead of read() to 283 // read packets in a batch. 284 inboundDispatcher, err = newRecvMMsgDispatcher(fd, e) 285 if err != nil { 286 return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) 287 } 288 } 289 } 290 return inboundDispatcher, nil 291 } 292 293 func isSocketFD(fd int) (bool, error) { 294 var stat syscall.Stat_t 295 if err := syscall.Fstat(fd, &stat); err != nil { 296 return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err) 297 } 298 return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK, nil 299 } 300 301 // Attach launches the goroutine that reads packets from the file descriptor and 302 // dispatches them via the provided dispatcher. 303 func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { 304 e.dispatcher = dispatcher 305 // Link endpoints are not savable. 
When transportation endpoints are 306 // saved, they stop sending outgoing packets and all incoming packets 307 // are rejected. 308 for i := range e.inboundDispatchers { 309 e.wg.Add(1) 310 go func(i int) { 311 e.dispatchLoop(e.inboundDispatchers[i]) 312 e.wg.Done() 313 }(i) 314 } 315 } 316 317 // IsAttached implements stack.LinkEndpoint.IsAttached. 318 func (e *endpoint) IsAttached() bool { 319 return e.dispatcher != nil 320 } 321 322 // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized 323 // during construction. 324 func (e *endpoint) MTU() uint32 { 325 return e.mtu 326 } 327 328 // Capabilities implements stack.LinkEndpoint.Capabilities. 329 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { 330 return e.caps 331 } 332 333 // MaxHeaderLength returns the maximum size of the link-layer header. 334 func (e *endpoint) MaxHeaderLength() uint16 { 335 return uint16(e.hdrSize) 336 } 337 338 // LinkAddress returns the link address of this endpoint. 339 func (e *endpoint) LinkAddress() tcpip.LinkAddress { 340 return e.addr 341 } 342 343 // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop 344 // reading from its FD. 345 func (e *endpoint) Wait() { 346 e.wg.Wait() 347 } 348 349 // virtioNetHdr is declared in linux/virtio_net.h. 350 type virtioNetHdr struct { 351 flags uint8 352 gsoType uint8 353 hdrLen uint16 354 gsoSize uint16 355 csumStart uint16 356 csumOffset uint16 357 } 358 359 // These constants are declared in linux/virtio_net.h. 360 const ( 361 _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 362 363 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 364 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 365 ) 366 367 // WritePacket writes outbound packets to the file descriptor. If it is not 368 // currently writable, the packet is dropped. 
369 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { 370 if e.hdrSize > 0 { 371 // Add ethernet header if needed. 372 eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) 373 ethHdr := &header.EthernetFields{ 374 DstAddr: r.RemoteLinkAddress, 375 Type: protocol, 376 } 377 378 // Preserve the src address if it's set in the route. 379 if r.LocalLinkAddress != "" { 380 ethHdr.SrcAddr = r.LocalLinkAddress 381 } else { 382 ethHdr.SrcAddr = e.addr 383 } 384 eth.Encode(ethHdr) 385 } 386 387 if e.Capabilities()&stack.CapabilityGSO != 0 { 388 vnetHdr := virtioNetHdr{} 389 vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr) 390 if gso != nil { 391 vnetHdr.hdrLen = uint16(hdr.UsedLength()) 392 if gso.NeedsCsum { 393 vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM 394 vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen 395 vnetHdr.csumOffset = gso.CsumOffset 396 } 397 if gso.Type != stack.GSONone && uint16(payload.Size()) > gso.MSS { 398 switch gso.Type { 399 case stack.GSOTCPv4: 400 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 401 case stack.GSOTCPv6: 402 vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 403 default: 404 panic(fmt.Sprintf("Unknown gso type: %v", gso.Type)) 405 } 406 vnetHdr.gsoSize = gso.MSS 407 } 408 } 409 410 return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, hdr.View(), payload.ToView()) 411 } 412 413 if payload.Size() == 0 { 414 return rawfile.NonBlockingWrite(e.fds[0], hdr.View()) 415 } 416 417 return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil) 418 } 419 420 // WriteRawPacket writes a raw packet directly to the file descriptor. 421 func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error { 422 return rawfile.NonBlockingWrite(e.fds[0], packet) 423 } 424 425 // dispatchLoop reads packets from the file descriptor in a loop and dispatches 426 // them to the network stack. 
427 func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error { 428 for { 429 cont, err := inboundDispatcher.dispatch() 430 if err != nil || !cont { 431 if e.closed != nil { 432 e.closed(err) 433 } 434 return err 435 } 436 } 437 } 438 439 // GSOMaxSize returns the maximum GSO packet size. 440 func (e *endpoint) GSOMaxSize() uint32 { 441 return e.gsoMaxSize 442 } 443 444 // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes 445 // to the FD, but does not read from it. All reads come from injected packets. 446 type InjectableEndpoint struct { 447 endpoint 448 449 dispatcher stack.NetworkDispatcher 450 } 451 452 // Attach saves the stack network-layer dispatcher for use later when packets 453 // are injected. 454 func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { 455 e.dispatcher = dispatcher 456 } 457 458 // Inject injects an inbound packet. 459 func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { 460 e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv) 461 } 462 463 // NewInjectable creates a new fd-based InjectableEndpoint. 464 func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) *InjectableEndpoint { 465 syscall.SetNonblock(fd, true) 466 467 return &InjectableEndpoint{endpoint: endpoint{ 468 fds: []int{fd}, 469 mtu: mtu, 470 caps: capabilities, 471 }} 472 }