// Source: github.com/vpnishe/netstack@v1.10.6/tcpip/transport/raw/endpoint.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package raw provides the implementation of raw sockets (see raw(7)). Raw
// sockets allow applications to:
//
//   * manually write and inspect transport layer headers and payloads
//   * receive all traffic of a given transport protocol (e.g. ICMP or UDP)
//   * optionally write and inspect network layer headers of packets
//
// Raw sockets don't have any notion of ports, and incoming packets are
// demultiplexed solely by protocol number. Thus, a raw UDP endpoint will
// receive every UDP packet received by netstack. bind(2) and connect(2) can be
// used to filter incoming packets by source and destination.
package raw

import (
	"sync"

	"github.com/vpnishe/netstack/tcpip"
	"github.com/vpnishe/netstack/tcpip/buffer"
	"github.com/vpnishe/netstack/tcpip/header"
	"github.com/vpnishe/netstack/tcpip/iptables"
	"github.com/vpnishe/netstack/tcpip/stack"
	"github.com/vpnishe/netstack/waiter"
)

// rawPacket is one received packet waiting on an endpoint's receive list. It
// is created by HandlePacket and consumed by Read.
//
// +stateify savable
type rawPacket struct {
	// rawPacketEntry is embedded so that the packet can be stored in a
	// rawPacketList (see endpoint.rcvList).
	rawPacketEntry
	// data holds the actual packet data, including any headers and
	// payload.
	data buffer.VectorisedView
	// timestampNS is the unix time at which the packet was received.
	timestampNS int64
	// senderAddr is the network address of the sender.
	senderAddr tcpip.FullAddress
}

// endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to
// have goroutines make concurrent calls into the endpoint.
//
// Lock order:
//   endpoint.mu
//     endpoint.rcvMu
//
// +stateify savable
type endpoint struct {
	stack.TransportEndpointInfo
	// The following fields are initialized at creation time and are
	// immutable.
	stack       *stack.Stack
	waiterQueue *waiter.Queue
	// associated is false for write-only endpoints whose users supply the
	// IP header themselves; such endpoints are never registered with the
	// stack (see newEndpoint).
	associated bool

	// The following fields are used to manage the receive queue and are
	// protected by rcvMu.
	rcvMu   sync.Mutex
	rcvList rawPacketList
	// rcvBufSizeMax is the limit that rcvBufSize is checked against before
	// a new packet is queued.
	rcvBufSizeMax int
	// rcvBufSize is the total size of the data of all queued packets.
	rcvBufSize int
	// rcvClosed is set by Close; once set, incoming packets are dropped.
	rcvClosed bool

	// The following fields are protected by mu.
	mu         sync.RWMutex
	sndBufSize int
	closed     bool
	connected  bool
	bound      bool
	// route is the route to a remote network endpoint. It is set via
	// Connect(), and is valid only when connected is true.
	route stack.Route
	stats tcpip.TransportEndpointStats
}

// NewEndpoint returns a raw endpoint for the given protocols.
88 func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { 89 return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */) 90 } 91 92 func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { 93 if netProto != header.IPv4ProtocolNumber { 94 return nil, tcpip.ErrUnknownProtocol 95 } 96 97 e := &endpoint{ 98 stack: s, 99 TransportEndpointInfo: stack.TransportEndpointInfo{ 100 NetProto: netProto, 101 TransProto: transProto, 102 }, 103 waiterQueue: waiterQueue, 104 rcvBufSizeMax: 32 * 1024, 105 sndBufSize: 32 * 1024, 106 associated: associated, 107 } 108 109 // Unassociated endpoints are write-only and users call Write() with IP 110 // headers included. Because they're write-only, We don't need to 111 // register with the stack. 112 if !associated { 113 e.rcvBufSizeMax = 0 114 e.waiterQueue = nil 115 return e, nil 116 } 117 118 if err := e.stack.RegisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e); err != nil { 119 return nil, err 120 } 121 122 return e, nil 123 } 124 125 // Close implements tcpip.Endpoint.Close. 126 func (e *endpoint) Close() { 127 e.mu.Lock() 128 defer e.mu.Unlock() 129 130 if e.closed || !e.associated { 131 return 132 } 133 134 e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e) 135 136 e.rcvMu.Lock() 137 defer e.rcvMu.Unlock() 138 139 // Clear the receive list. 
140 e.rcvClosed = true 141 e.rcvBufSize = 0 142 for !e.rcvList.Empty() { 143 e.rcvList.Remove(e.rcvList.Front()) 144 } 145 146 if e.connected { 147 e.route.Release() 148 e.connected = false 149 } 150 151 e.closed = true 152 153 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) 154 } 155 156 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. 157 func (e *endpoint) ModerateRecvBuf(copied int) {} 158 159 // IPTables implements tcpip.Endpoint.IPTables. 160 func (e *endpoint) IPTables() (iptables.IPTables, error) { 161 return e.stack.IPTables(), nil 162 } 163 164 // Read implements tcpip.Endpoint.Read. 165 func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { 166 if !e.associated { 167 return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidOptionValue 168 } 169 170 e.rcvMu.Lock() 171 172 // If there's no data to read, return that read would block or that the 173 // endpoint is closed. 174 if e.rcvList.Empty() { 175 err := tcpip.ErrWouldBlock 176 if e.rcvClosed { 177 e.stats.ReadErrors.ReadClosed.Increment() 178 err = tcpip.ErrClosedForReceive 179 } 180 e.rcvMu.Unlock() 181 return buffer.View{}, tcpip.ControlMessages{}, err 182 } 183 184 pkt := e.rcvList.Front() 185 e.rcvList.Remove(pkt) 186 e.rcvBufSize -= pkt.data.Size() 187 188 e.rcvMu.Unlock() 189 190 if addr != nil { 191 *addr = pkt.senderAddr 192 } 193 194 return pkt.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: pkt.timestampNS}, nil 195 } 196 197 // Write implements tcpip.Endpoint.Write. 
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
	// Delegate the actual work to write and only classify its result into
	// the endpoint's statistics counters here.
	n, ch, err := e.write(p, opts)
	switch err {
	case nil:
		e.stats.PacketsSent.Increment()
	case tcpip.ErrMessageTooLong, tcpip.ErrInvalidOptionValue:
		e.stats.WriteErrors.InvalidArgs.Increment()
	case tcpip.ErrClosedForSend:
		e.stats.WriteErrors.WriteClosed.Increment()
	case tcpip.ErrInvalidEndpointState:
		e.stats.WriteErrors.InvalidEndpointState.Increment()
	case tcpip.ErrNoLinkAddress:
		e.stats.SendErrors.NoLinkAddr.Increment()
	case tcpip.ErrNoRoute, tcpip.ErrBroadcastDisabled, tcpip.ErrNetworkUnreachable:
		// Errors indicating any problem with IP routing of the packet.
		e.stats.SendErrors.NoRoute.Increment()
	default:
		// For all other errors when writing to the network layer.
		e.stats.SendErrors.SendToNetworkFailed.Increment()
	}
	return n, ch, err
}

// write picks the route for an outgoing packet and hands the payload to
// finishWrite. The destination is, in order of preference: opts.To, the
// destination address found in the IP header (unassociated endpoints only),
// or the route saved by Connect.
//
// It returns the number of bytes written, a channel the caller can wait on
// when link address resolution is pending (see finishWrite), and an error.
// e.mu is acquired here and released on every return path.
func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
	// MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op.
	if opts.More {
		return 0, nil, tcpip.ErrInvalidOptionValue
	}

	e.mu.RLock()

	if e.closed {
		e.mu.RUnlock()
		return 0, nil, tcpip.ErrInvalidEndpointState
	}

	payloadBytes, err := p.FullPayload()
	if err != nil {
		e.mu.RUnlock()
		return 0, nil, err
	}

	// If this is an unassociated socket and the caller provided a nonzero
	// destination address in the IP header, route using that address.
	if !e.associated {
		ip := header.IPv4(payloadBytes)
		if !ip.IsValid(len(payloadBytes)) {
			e.mu.RUnlock()
			return 0, nil, tcpip.ErrInvalidOptionValue
		}
		dstAddr := ip.DestinationAddress()
		// Update dstAddr with the address in the IP header, unless
		// opts.To is set (e.g. if sendto specifies a specific
		// address).
		if dstAddr != tcpip.Address([]byte{0, 0, 0, 0}) && opts.To == nil {
			opts.To = &tcpip.FullAddress{
				NIC:  0,       // NIC is unset.
				Addr: dstAddr, // The address from the payload.
				Port: 0,       // There are no ports here.
			}
		}
	}

	// Did the caller provide a destination? If not, use the connected
	// destination.
	if opts.To == nil {
		// If the user doesn't specify a destination, they should have
		// connected to another address.
		if !e.connected {
			e.mu.RUnlock()
			return 0, nil, tcpip.ErrDestinationRequired
		}

		if e.route.IsResolutionRequired() {
			savedRoute := &e.route
			// Promote lock to exclusive if using a shared route,
			// given that it may need to change in finishWrite.
			e.mu.RUnlock()
			e.mu.Lock()

			// Make sure that the route didn't change during the
			// time we didn't hold the lock.
			//
			// NOTE(review): savedRoute and &e.route are both the
			// address of the same struct field, so the pointer
			// comparison below can never be true; only the
			// e.connected check is effective.
			if !e.connected || savedRoute != &e.route {
				e.mu.Unlock()
				return 0, nil, tcpip.ErrInvalidEndpointState
			}

			n, ch, err := e.finishWrite(payloadBytes, savedRoute)
			e.mu.Unlock()
			return n, ch, err
		}

		n, ch, err := e.finishWrite(payloadBytes, &e.route)
		e.mu.RUnlock()
		return n, ch, err
	}

	// The caller provided a destination. Reject destination address if it
	// goes through a different NIC than the endpoint was bound to.
	nic := opts.To.NIC
	if e.bound && nic != 0 && nic != e.BindNICID {
		e.mu.RUnlock()
		return 0, nil, tcpip.ErrNoRoute
	}

	// We don't support IPv6 yet, so this has to be an IPv4 address.
	if len(opts.To.Addr) != header.IPv4AddressSize {
		e.mu.RUnlock()
		return 0, nil, tcpip.ErrInvalidEndpointState
	}

	// Find the route to the destination. If BindAddress is 0,
	// FindRoute will choose an appropriate source address.
	route, err := e.stack.FindRoute(nic, e.BindAddr, opts.To.Addr, e.NetProto, false)
	if err != nil {
		e.mu.RUnlock()
		return 0, nil, err
	}

	// This route is used for this one write only, so it is released as
	// soon as the packet has been handed off.
	n, ch, err := e.finishWrite(payloadBytes, &route)
	route.Release()
	e.mu.RUnlock()
	return n, ch, err
}

// finishWrite writes the payload to a route. It resolves the route if
// necessary. It's really just a helper to make defer unnecessary in Write.
//
// On success it returns len(payloadBytes). If link address resolution would
// block, it returns tcpip.ErrNoLinkAddress along with the channel returned by
// route.Resolve.
func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64, <-chan struct{}, *tcpip.Error) {
	// We may need to resolve the route (match a link layer address to the
	// network address). If that requires blocking (e.g. to use ARP),
	// return a channel on which the caller can wait.
	if route.IsResolutionRequired() {
		if ch, err := route.Resolve(nil); err != nil {
			if err == tcpip.ErrWouldBlock {
				return 0, ch, tcpip.ErrNoLinkAddress
			}
			return 0, nil, err
		}
	}

	switch e.NetProto {
	case header.IPv4ProtocolNumber:
		if !e.associated {
			// The payload of an unassociated endpoint already
			// starts with an IP header, so it is sent as is.
			if err := route.WriteHeaderIncludedPacket(tcpip.PacketBuffer{
				Data: buffer.View(payloadBytes).ToVectorisedView(),
			}); err != nil {
				return 0, nil, err
			}
			break
		}
		hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
			Header: hdr,
			Data:   buffer.View(payloadBytes).ToVectorisedView(),
		}); err != nil {
			return 0, nil, err
		}

	default:
		return 0, nil, tcpip.ErrUnknownProtocol
	}

	return int64(len(payloadBytes)), nil, nil
}

// Peek implements tcpip.Endpoint.Peek. Raw endpoints do not implement
// peeking: it always reports zero bytes and no error.
func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
	return 0, tcpip.ControlMessages{}, nil
}

// Disconnect implements tcpip.Endpoint.Disconnect.
369 func (*endpoint) Disconnect() *tcpip.Error { 370 return tcpip.ErrNotSupported 371 } 372 373 // Connect implements tcpip.Endpoint.Connect. 374 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { 375 e.mu.Lock() 376 defer e.mu.Unlock() 377 378 if e.closed { 379 return tcpip.ErrInvalidEndpointState 380 } 381 382 // We don't support IPv6 yet. 383 if len(addr.Addr) != header.IPv4AddressSize { 384 return tcpip.ErrInvalidEndpointState 385 } 386 387 nic := addr.NIC 388 if e.bound { 389 if e.BindNICID == 0 { 390 // If we're bound, but not to a specific NIC, the NIC 391 // in addr will be used. Nothing to do here. 392 } else if addr.NIC == 0 { 393 // If we're bound to a specific NIC, but addr doesn't 394 // specify a NIC, use the bound NIC. 395 nic = e.BindNICID 396 } else if addr.NIC != e.BindNICID { 397 // We're bound and addr specifies a NIC. They must be 398 // the same. 399 return tcpip.ErrInvalidEndpointState 400 } 401 } 402 403 // Find a route to the destination. 404 route, err := e.stack.FindRoute(nic, tcpip.Address(""), addr.Addr, e.NetProto, false) 405 if err != nil { 406 return err 407 } 408 defer route.Release() 409 410 if e.associated { 411 // Re-register the endpoint with the appropriate NIC. 412 if err := e.stack.RegisterRawTransportEndpoint(addr.NIC, e.NetProto, e.TransProto, e); err != nil { 413 return err 414 } 415 e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e) 416 e.RegisterNICID = nic 417 } 418 419 // Save the route we've connected via. 420 e.route = route.Clone() 421 e.connected = true 422 423 return nil 424 } 425 426 // Shutdown implements tcpip.Endpoint.Shutdown. It's a noop for raw sockets. 427 func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { 428 e.mu.Lock() 429 defer e.mu.Unlock() 430 431 if !e.connected { 432 return tcpip.ErrNotConnected 433 } 434 return nil 435 } 436 437 // Listen implements tcpip.Endpoint.Listen. 
438 func (e *endpoint) Listen(backlog int) *tcpip.Error { 439 return tcpip.ErrNotSupported 440 } 441 442 // Accept implements tcpip.Endpoint.Accept. 443 func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { 444 return nil, nil, tcpip.ErrNotSupported 445 } 446 447 // Bind implements tcpip.Endpoint.Bind. 448 func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { 449 e.mu.Lock() 450 defer e.mu.Unlock() 451 452 // Callers must provide an IPv4 address or no network address (for 453 // binding to a NIC, but not an address). 454 if len(addr.Addr) != 0 && len(addr.Addr) != 4 { 455 return tcpip.ErrInvalidEndpointState 456 } 457 458 // If a local address was specified, verify that it's valid. 459 if len(addr.Addr) == header.IPv4AddressSize && e.stack.CheckLocalAddress(addr.NIC, e.NetProto, addr.Addr) == 0 { 460 return tcpip.ErrBadLocalAddress 461 } 462 463 if e.associated { 464 // Re-register the endpoint with the appropriate NIC. 465 if err := e.stack.RegisterRawTransportEndpoint(addr.NIC, e.NetProto, e.TransProto, e); err != nil { 466 return err 467 } 468 e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e) 469 e.RegisterNICID = addr.NIC 470 e.BindNICID = addr.NIC 471 } 472 473 e.BindAddr = addr.Addr 474 e.bound = true 475 476 return nil 477 } 478 479 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. 480 func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { 481 return tcpip.FullAddress{}, tcpip.ErrNotSupported 482 } 483 484 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. 485 func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { 486 // Even a connected socket doesn't return a remote address. 487 return tcpip.FullAddress{}, tcpip.ErrNotConnected 488 } 489 490 // Readiness implements tcpip.Endpoint.Readiness. 491 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { 492 // The endpoint is always writable. 
493 result := waiter.EventOut & mask 494 495 // Determine whether the endpoint is readable. 496 if (mask & waiter.EventIn) != 0 { 497 e.rcvMu.Lock() 498 if !e.rcvList.Empty() || e.rcvClosed { 499 result |= waiter.EventIn 500 } 501 e.rcvMu.Unlock() 502 } 503 504 return result 505 } 506 507 // SetSockOpt implements tcpip.Endpoint.SetSockOpt. 508 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { 509 return tcpip.ErrUnknownProtocolOption 510 } 511 512 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. 513 func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error { 514 return tcpip.ErrUnknownProtocolOption 515 } 516 517 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 518 func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { 519 switch opt { 520 case tcpip.ReceiveQueueSizeOption: 521 v := 0 522 e.rcvMu.Lock() 523 if !e.rcvList.Empty() { 524 p := e.rcvList.Front() 525 v = p.data.Size() 526 } 527 e.rcvMu.Unlock() 528 return v, nil 529 530 case tcpip.SendBufferSizeOption: 531 e.mu.Lock() 532 v := e.sndBufSize 533 e.mu.Unlock() 534 return v, nil 535 536 case tcpip.ReceiveBufferSizeOption: 537 e.rcvMu.Lock() 538 v := e.rcvBufSizeMax 539 e.rcvMu.Unlock() 540 return v, nil 541 542 } 543 544 return -1, tcpip.ErrUnknownProtocolOption 545 } 546 547 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 548 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { 549 switch o := opt.(type) { 550 case tcpip.ErrorOption: 551 return nil 552 553 case *tcpip.KeepaliveEnabledOption: 554 *o = 0 555 return nil 556 557 default: 558 return tcpip.ErrUnknownProtocolOption 559 } 560 } 561 562 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket. 563 func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) { 564 e.rcvMu.Lock() 565 566 // Drop the packet if our buffer is currently full. 
567 if e.rcvClosed { 568 e.rcvMu.Unlock() 569 e.stack.Stats().DroppedPackets.Increment() 570 e.stats.ReceiveErrors.ClosedReceiver.Increment() 571 return 572 } 573 574 if e.rcvBufSize >= e.rcvBufSizeMax { 575 e.rcvMu.Unlock() 576 e.stack.Stats().DroppedPackets.Increment() 577 e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() 578 return 579 } 580 581 if e.bound { 582 // If bound to a NIC, only accept data for that NIC. 583 if e.BindNICID != 0 && e.BindNICID != route.NICID() { 584 e.rcvMu.Unlock() 585 return 586 } 587 // If bound to an address, only accept data for that address. 588 if e.BindAddr != "" && e.BindAddr != route.RemoteAddress { 589 e.rcvMu.Unlock() 590 return 591 } 592 } 593 594 // If connected, only accept packets from the remote address we 595 // connected to. 596 if e.connected && e.route.RemoteAddress != route.RemoteAddress { 597 e.rcvMu.Unlock() 598 return 599 } 600 601 wasEmpty := e.rcvBufSize == 0 602 603 // Push new packet into receive list and increment the buffer size. 604 packet := &rawPacket{ 605 senderAddr: tcpip.FullAddress{ 606 NIC: route.NICID(), 607 Addr: route.RemoteAddress, 608 }, 609 } 610 611 networkHeader := append(buffer.View(nil), pkt.NetworkHeader...) 612 combinedVV := networkHeader.ToVectorisedView() 613 combinedVV.Append(pkt.Data) 614 packet.data = combinedVV 615 packet.timestampNS = e.stack.NowNanoseconds() 616 617 e.rcvList.PushBack(packet) 618 e.rcvBufSize += packet.data.Size() 619 620 e.rcvMu.Unlock() 621 e.stats.PacketsReceived.Increment() 622 // Notify waiters that there's data to be read. 623 if wasEmpty { 624 e.waiterQueue.Notify(waiter.EventIn) 625 } 626 } 627 628 // State implements socket.Socket.State. 629 func (e *endpoint) State() uint32 { 630 return 0 631 } 632 633 // Info returns a copy of the endpoint info. 634 func (e *endpoint) Info() tcpip.EndpointInfo { 635 e.mu.RLock() 636 // Make a copy of the endpoint info. 
637 ret := e.TransportEndpointInfo 638 e.mu.RUnlock() 639 return &ret 640 } 641 642 // Stats returns a pointer to the endpoint stats. 643 func (e *endpoint) Stats() tcpip.EndpointStats { 644 return &e.stats 645 } 646 647 // Wait implements stack.TransportEndpoint.Wait. 648 func (*endpoint) Wait() {}