// Source: inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/raw/endpoint.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package raw provides the implementation of raw sockets (see raw(7)). Raw
// sockets allow applications to:
//
//   * manually write and inspect transport layer headers and payloads
//   * receive all traffic of a given transport protocol (e.g. ICMP or UDP)
//   * optionally write and inspect network layer headers of packets
//
// Raw sockets don't have any notion of ports, and incoming packets are
// demultiplexed solely by protocol number. Thus, a raw UDP endpoint will
// receive every UDP packet received by netstack. bind(2) and connect(2) can be
// used to filter incoming packets by source and destination.
package raw

import (
	"fmt"
	"io"
	"time"

	"inet.af/netstack/sync"
	"inet.af/netstack/tcpip"
	"inet.af/netstack/tcpip/buffer"
	"inet.af/netstack/tcpip/header"
	"inet.af/netstack/tcpip/stack"
	"inet.af/netstack/tcpip/transport"
	"inet.af/netstack/tcpip/transport/internal/network"
	"inet.af/netstack/waiter"
)

// rawPacket is one received packet queued on an endpoint's receive list until
// it is consumed by Read.
//
// +stateify savable
type rawPacket struct {
	// rawPacketEntry is embedded so the packet can be stored in a
	// rawPacketList. NOTE(review): presumably the generated list linkage;
	// the declaration is not in this file.
	rawPacketEntry
	// data holds the actual packet data, including any headers and
	// payload.
	data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
	// receivedAt is the stack clock reading taken when the packet was queued
	// by HandlePacket; Read reports it as the timestamp control message.
	receivedAt time.Time `state:".(int64)"`
	// senderAddr is the network address of the sender.
	senderAddr tcpip.FullAddress
	// packetInfo records the NIC and destination address the packet arrived
	// on; Read exposes it when the packet-info socket options are enabled.
	packetInfo tcpip.IPPacketInfo
}

// endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to
// have goroutines make concurrent calls into the endpoint.
//
// Lock order:
//   endpoint.mu
//     endpoint.rcvMu
//
// +stateify savable
type endpoint struct {
	tcpip.DefaultSocketOptionsHandler

	// The following fields are initialized at creation time and are
	// immutable.

	// stack is the network stack this endpoint registers with.
	stack *stack.Stack `state:"manual"`
	// transProto is the transport protocol number the endpoint was created
	// for; GetLocalAddress reports it in the port field.
	transProto tcpip.TransportProtocolNumber
	// waiterQueue is notified on readability and on close. It is nil for
	// unassociated endpoints.
	waiterQueue *waiter.Queue
	// associated is false for endpoints that are write-only and are never
	// registered with the stack (see newEndpoint).
	associated bool

	// net holds the shared datagram network-endpoint state (bind/connect
	// state, addresses, write contexts).
	net network.Endpoint
	// stats holds this endpoint's counters, returned by Stats.
	stats tcpip.TransportEndpointStats
	// ops holds the socket options, returned by SocketOptions.
	ops tcpip.SocketOptions

	// The following fields are used to manage the receive queue and are
	// protected by rcvMu.
	rcvMu sync.Mutex `state:"nosave"`
	// rcvList is the queue of received packets awaiting Read.
	rcvList rawPacketList
	// rcvBufSize is the total size in bytes of the packets in rcvList.
	rcvBufSize int
	// rcvClosed is set by Close; once set, new packets are dropped.
	rcvClosed bool

	// The following fields are protected by mu.
	mu sync.RWMutex `state:"nosave"`
	// frozen indicates if the packets should be delivered to the endpoint
	// during restore.
	frozen bool
}

// NewEndpoint returns a raw endpoint for the given protocols.
92 func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { 93 return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */) 94 } 95 96 func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, tcpip.Error) { 97 e := &endpoint{ 98 stack: s, 99 transProto: transProto, 100 waiterQueue: waiterQueue, 101 associated: associated, 102 } 103 e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) 104 e.ops.SetHeaderIncluded(!associated) 105 e.ops.SetSendBufferSize(32*1024, false /* notify */) 106 e.ops.SetReceiveBufferSize(32*1024, false /* notify */) 107 e.net.Init(s, netProto, transProto, &e.ops) 108 109 // Override with stack defaults. 110 var ss tcpip.SendBufferSizeOption 111 if err := s.Option(&ss); err == nil { 112 e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) 113 } 114 115 var rs tcpip.ReceiveBufferSizeOption 116 if err := s.Option(&rs); err == nil { 117 e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) 118 } 119 120 // Unassociated endpoints are write-only and users call Write() with IP 121 // headers included. Because they're write-only, We don't need to 122 // register with the stack. 123 if !associated { 124 e.ops.SetReceiveBufferSize(0, false /* notify */) 125 e.waiterQueue = nil 126 return e, nil 127 } 128 129 if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { 130 return nil, err 131 } 132 133 return e, nil 134 } 135 136 // Abort implements stack.TransportEndpoint.Abort. 137 func (e *endpoint) Abort() { 138 e.Close() 139 } 140 141 // Close implements tcpip.Endpoint.Close. 
142 func (e *endpoint) Close() { 143 e.mu.Lock() 144 defer e.mu.Unlock() 145 146 if e.net.State() == transport.DatagramEndpointStateClosed { 147 return 148 } 149 150 e.net.Close() 151 152 if !e.associated { 153 return 154 } 155 156 e.stack.UnregisterRawTransportEndpoint(e.net.NetProto(), e.transProto, e) 157 158 e.rcvMu.Lock() 159 defer e.rcvMu.Unlock() 160 161 // Clear the receive list. 162 e.rcvClosed = true 163 e.rcvBufSize = 0 164 for !e.rcvList.Empty() { 165 e.rcvList.Remove(e.rcvList.Front()) 166 } 167 168 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 169 } 170 171 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. 172 func (*endpoint) ModerateRecvBuf(int) {} 173 174 func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { 175 e.net.SetOwner(owner) 176 } 177 178 // Read implements tcpip.Endpoint.Read. 179 func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { 180 e.rcvMu.Lock() 181 182 // If there's no data to read, return that read would block or that the 183 // endpoint is closed. 
184 if e.rcvList.Empty() { 185 var err tcpip.Error = &tcpip.ErrWouldBlock{} 186 if e.rcvClosed { 187 e.stats.ReadErrors.ReadClosed.Increment() 188 err = &tcpip.ErrClosedForReceive{} 189 } 190 e.rcvMu.Unlock() 191 return tcpip.ReadResult{}, err 192 } 193 194 pkt := e.rcvList.Front() 195 if !opts.Peek { 196 e.rcvList.Remove(pkt) 197 e.rcvBufSize -= pkt.data.Size() 198 } 199 200 e.rcvMu.Unlock() 201 202 res := tcpip.ReadResult{ 203 Total: pkt.data.Size(), 204 ControlMessages: tcpip.ControlMessages{ 205 HasTimestamp: true, 206 Timestamp: pkt.receivedAt, 207 }, 208 } 209 if opts.NeedRemoteAddr { 210 res.RemoteAddr = pkt.senderAddr 211 } 212 switch netProto := e.net.NetProto(); netProto { 213 case header.IPv4ProtocolNumber: 214 if e.ops.GetReceivePacketInfo() { 215 res.ControlMessages.HasIPPacketInfo = true 216 res.ControlMessages.PacketInfo = pkt.packetInfo 217 } 218 case header.IPv6ProtocolNumber: 219 if e.ops.GetIPv6ReceivePacketInfo() { 220 res.ControlMessages.HasIPv6PacketInfo = true 221 res.ControlMessages.IPv6PacketInfo = tcpip.IPv6PacketInfo{ 222 NIC: pkt.packetInfo.NIC, 223 Addr: pkt.packetInfo.DestinationAddr, 224 } 225 } 226 default: 227 panic(fmt.Sprintf("unrecognized network protocol = %d", netProto)) 228 } 229 230 n, err := pkt.data.ReadTo(dst, opts.Peek) 231 if n == 0 && err != nil { 232 return res, &tcpip.ErrBadBuffer{} 233 } 234 res.Count = n 235 return res, nil 236 } 237 238 // Write implements tcpip.Endpoint.Write. 239 func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { 240 netProto := e.net.NetProto() 241 // We can create, but not write to, unassociated IPv6 endpoints. 242 if !e.associated && netProto == header.IPv6ProtocolNumber { 243 return 0, &tcpip.ErrInvalidOptionValue{} 244 } 245 246 if opts.To != nil { 247 // Raw sockets do not support sending to a IPv4 address on a IPv6 endpoint. 
248 if netProto == header.IPv6ProtocolNumber && len(opts.To.Addr) != header.IPv6AddressSize { 249 return 0, &tcpip.ErrInvalidOptionValue{} 250 } 251 } 252 253 n, err := e.write(p, opts) 254 switch err.(type) { 255 case nil: 256 e.stats.PacketsSent.Increment() 257 case *tcpip.ErrMessageTooLong, *tcpip.ErrInvalidOptionValue: 258 e.stats.WriteErrors.InvalidArgs.Increment() 259 case *tcpip.ErrClosedForSend: 260 e.stats.WriteErrors.WriteClosed.Increment() 261 case *tcpip.ErrInvalidEndpointState: 262 e.stats.WriteErrors.InvalidEndpointState.Increment() 263 case *tcpip.ErrNoRoute, *tcpip.ErrBroadcastDisabled, *tcpip.ErrNetworkUnreachable: 264 // Errors indicating any problem with IP routing of the packet. 265 e.stats.SendErrors.NoRoute.Increment() 266 default: 267 // For all other errors when writing to the network layer. 268 e.stats.SendErrors.SendToNetworkFailed.Increment() 269 } 270 return n, err 271 } 272 273 func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { 274 ctx, err := e.net.AcquireContextForWrite(opts) 275 if err != nil { 276 return 0, err 277 } 278 279 // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. 280 payloadBytes := make([]byte, p.Len()) 281 if _, err := io.ReadFull(p, payloadBytes); err != nil { 282 return 0, &tcpip.ErrBadBuffer{} 283 } 284 285 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 286 ReserveHeaderBytes: int(ctx.PacketInfo().MaxHeaderLength), 287 Data: buffer.View(payloadBytes).ToVectorisedView(), 288 }) 289 defer pkt.DecRef() 290 291 if err := ctx.WritePacket(pkt, e.ops.GetHeaderIncluded()); err != nil { 292 return 0, err 293 } 294 295 return int64(len(payloadBytes)), nil 296 } 297 298 // Disconnect implements tcpip.Endpoint.Disconnect. 299 func (*endpoint) Disconnect() tcpip.Error { 300 return &tcpip.ErrNotSupported{} 301 } 302 303 // Connect implements tcpip.Endpoint.Connect. 
304 func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { 305 netProto := e.net.NetProto() 306 307 // Raw sockets do not support connecting to a IPv4 address on a IPv6 endpoint. 308 if netProto == header.IPv6ProtocolNumber && len(addr.Addr) != header.IPv6AddressSize { 309 return &tcpip.ErrAddressFamilyNotSupported{} 310 } 311 312 return e.net.ConnectAndThen(addr, func(_ tcpip.NetworkProtocolNumber, _, _ stack.TransportEndpointID) tcpip.Error { 313 if e.associated { 314 // Re-register the endpoint with the appropriate NIC. 315 if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { 316 return err 317 } 318 e.stack.UnregisterRawTransportEndpoint(netProto, e.transProto, e) 319 } 320 321 return nil 322 }) 323 } 324 325 // Shutdown implements tcpip.Endpoint.Shutdown. It's a noop for raw sockets. 326 func (e *endpoint) Shutdown(tcpip.ShutdownFlags) tcpip.Error { 327 if e.net.State() != transport.DatagramEndpointStateConnected { 328 return &tcpip.ErrNotConnected{} 329 } 330 return nil 331 } 332 333 // Listen implements tcpip.Endpoint.Listen. 334 func (*endpoint) Listen(int) tcpip.Error { 335 return &tcpip.ErrNotSupported{} 336 } 337 338 // Accept implements tcpip.Endpoint.Accept. 339 func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { 340 return nil, nil, &tcpip.ErrNotSupported{} 341 } 342 343 // Bind implements tcpip.Endpoint.Bind. 344 func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { 345 return e.net.BindAndThen(addr, func(netProto tcpip.NetworkProtocolNumber, _ tcpip.Address) tcpip.Error { 346 if !e.associated { 347 return nil 348 } 349 350 // Re-register the endpoint with the appropriate NIC. 351 if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { 352 return err 353 } 354 e.stack.UnregisterRawTransportEndpoint(netProto, e.transProto, e) 355 return nil 356 }) 357 } 358 359 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. 
360 func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { 361 a := e.net.GetLocalAddress() 362 // Linux returns the protocol in the port field. 363 a.Port = uint16(e.transProto) 364 return a, nil 365 } 366 367 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. 368 func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { 369 // Even a connected socket doesn't return a remote address. 370 return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} 371 } 372 373 // Readiness implements tcpip.Endpoint.Readiness. 374 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { 375 // The endpoint is always writable. 376 result := waiter.WritableEvents & mask 377 378 // Determine whether the endpoint is readable. 379 if (mask & waiter.ReadableEvents) != 0 { 380 e.rcvMu.Lock() 381 if !e.rcvList.Empty() || e.rcvClosed { 382 result |= waiter.ReadableEvents 383 } 384 e.rcvMu.Unlock() 385 } 386 387 return result 388 } 389 390 // SetSockOpt implements tcpip.Endpoint.SetSockOpt. 391 func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { 392 switch opt.(type) { 393 case *tcpip.SocketDetachFilterOption: 394 return nil 395 396 default: 397 return e.net.SetSockOpt(opt) 398 } 399 } 400 401 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { 402 return e.net.SetSockOptInt(opt, v) 403 } 404 405 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 406 func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 407 return e.net.GetSockOpt(opt) 408 } 409 410 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 
411 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { 412 switch opt { 413 case tcpip.ReceiveQueueSizeOption: 414 v := 0 415 e.rcvMu.Lock() 416 if !e.rcvList.Empty() { 417 p := e.rcvList.Front() 418 v = p.data.Size() 419 } 420 e.rcvMu.Unlock() 421 return v, nil 422 423 default: 424 return e.net.GetSockOptInt(opt) 425 } 426 } 427 428 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket. 429 func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { 430 notifyReadableEvents := func() bool { 431 e.mu.RLock() 432 defer e.mu.RUnlock() 433 e.rcvMu.Lock() 434 defer e.rcvMu.Unlock() 435 436 // Drop the packet if our buffer is currently full or if this is an unassociated 437 // endpoint (i.e endpoint created w/ IPPROTO_RAW). Such endpoints are send only 438 // See: https://man7.org/linux/man-pages/man7/raw.7.html 439 // 440 // An IPPROTO_RAW socket is send only. If you really want to receive 441 // all IP packets, use a packet(7) socket with the ETH_P_IP protocol. 442 // Note that packet sockets don't reassemble IP fragments, unlike raw 443 // sockets. 444 if e.rcvClosed || !e.associated { 445 e.stack.Stats().DroppedPackets.Increment() 446 e.stats.ReceiveErrors.ClosedReceiver.Increment() 447 return false 448 } 449 450 rcvBufSize := e.ops.GetReceiveBufferSize() 451 if e.frozen || e.rcvBufSize >= int(rcvBufSize) { 452 e.stack.Stats().DroppedPackets.Increment() 453 e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() 454 return false 455 } 456 457 net := pkt.Network() 458 dstAddr := net.DestinationAddress() 459 srcAddr := net.SourceAddress() 460 info := e.net.Info() 461 462 switch state := e.net.State(); state { 463 case transport.DatagramEndpointStateInitial: 464 case transport.DatagramEndpointStateConnected: 465 // If connected, only accept packets from the remote address we 466 // connected to. 
467 if info.ID.RemoteAddress != srcAddr { 468 return false 469 } 470 471 // Connected sockets may also have been bound to a specific 472 // address/NIC. 473 fallthrough 474 case transport.DatagramEndpointStateBound: 475 // If bound to a NIC, only accept data for that NIC. 476 if info.BindNICID != 0 && info.BindNICID != pkt.NICID { 477 return false 478 } 479 480 // If bound to an address, only accept data for that address. 481 if info.BindAddr != "" && info.BindAddr != dstAddr { 482 return false 483 } 484 default: 485 panic(fmt.Sprintf("unhandled state = %s", state)) 486 } 487 488 wasEmpty := e.rcvBufSize == 0 489 490 // Push new packet into receive list and increment the buffer size. 491 packet := &rawPacket{ 492 senderAddr: tcpip.FullAddress{ 493 NIC: pkt.NICID, 494 Addr: srcAddr, 495 }, 496 packetInfo: tcpip.IPPacketInfo{ 497 // TODO(gvisor.dev/issue/3556): dstAddr may be a multicast or broadcast 498 // address. LocalAddr should hold a unicast address that can be 499 // used to respond to the incoming packet. 500 LocalAddr: dstAddr, 501 DestinationAddr: dstAddr, 502 NIC: pkt.NICID, 503 }, 504 } 505 506 // Raw IPv4 endpoints return the IP header, but IPv6 endpoints do not. 507 // We copy headers' underlying bytes because pkt.*Header may point to 508 // the middle of a slice, and another struct may point to the "outer" 509 // slice. Save/restore doesn't support overlapping slices and will fail. 510 // 511 // TODO(https://gvisor.dev/issue/6517): Avoid the copy once S/R supports 512 // overlapping slices. 513 var combinedVV buffer.VectorisedView 514 if info.NetProto == header.IPv4ProtocolNumber { 515 networkHeader, transportHeader := pkt.NetworkHeader().View(), pkt.TransportHeader().View() 516 headers := make(buffer.View, 0, len(networkHeader)+len(transportHeader)) 517 headers = append(headers, networkHeader...) 518 headers = append(headers, transportHeader...) 
519 combinedVV = headers.ToVectorisedView() 520 } else { 521 combinedVV = append(buffer.View(nil), pkt.TransportHeader().View()...).ToVectorisedView() 522 } 523 combinedVV.Append(pkt.Data().ExtractVV()) 524 packet.data = combinedVV 525 packet.receivedAt = e.stack.Clock().Now() 526 527 e.rcvList.PushBack(packet) 528 e.rcvBufSize += packet.data.Size() 529 e.stats.PacketsReceived.Increment() 530 531 // Notify waiters that there is data to be read now. 532 return wasEmpty 533 }() 534 535 if notifyReadableEvents { 536 e.waiterQueue.Notify(waiter.ReadableEvents) 537 } 538 } 539 540 // State implements socket.Socket.State. 541 func (e *endpoint) State() uint32 { 542 return 0 543 } 544 545 // Info returns a copy of the endpoint info. 546 func (e *endpoint) Info() tcpip.EndpointInfo { 547 ret := e.net.Info() 548 return &ret 549 } 550 551 // Stats returns a pointer to the endpoint stats. 552 func (e *endpoint) Stats() tcpip.EndpointStats { 553 return &e.stats 554 } 555 556 // Wait implements stack.TransportEndpoint.Wait. 557 func (*endpoint) Wait() {} 558 559 // LastError implements tcpip.Endpoint.LastError. 560 func (*endpoint) LastError() tcpip.Error { 561 return nil 562 } 563 564 // SocketOptions implements tcpip.Endpoint.SocketOptions. 565 func (e *endpoint) SocketOptions() *tcpip.SocketOptions { 566 return &e.ops 567 } 568 569 // freeze prevents any more packets from being delivered to the endpoint. 570 func (e *endpoint) freeze() { 571 e.mu.Lock() 572 e.frozen = true 573 e.mu.Unlock() 574 } 575 576 // thaw unfreezes a previously frozen endpoint using endpoint.freeze() allows 577 // new packets to be delivered again. 578 func (e *endpoint) thaw() { 579 e.mu.Lock() 580 e.frozen = false 581 e.mu.Unlock() 582 }