github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/packet/endpoint.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package packet provides the implementation of packet sockets (see 16 // packet(7)). Packet sockets allow applications to: 17 // 18 // * manually write and inspect link, network, and transport headers 19 // * receive all traffic of a given network protocol, or all protocols 20 // 21 // Packet sockets are similar to raw sockets, but provide even more power to 22 // users, letting them effectively talk directly to the network device. 23 // 24 // Packet sockets skip the input and output iptables chains. 25 package packet 26 27 import ( 28 "fmt" 29 "io" 30 "time" 31 32 "github.com/SagerNet/gvisor/pkg/sync" 33 "github.com/SagerNet/gvisor/pkg/tcpip" 34 "github.com/SagerNet/gvisor/pkg/tcpip/buffer" 35 "github.com/SagerNet/gvisor/pkg/tcpip/header" 36 "github.com/SagerNet/gvisor/pkg/tcpip/stack" 37 "github.com/SagerNet/gvisor/pkg/waiter" 38 ) 39 40 // +stateify savable 41 type packet struct { 42 packetEntry 43 // data holds the actual packet data, including any headers and 44 // payload. 45 data buffer.VectorisedView `state:".(buffer.VectorisedView)"` 46 receivedAt time.Time `state:".(int64)"` 47 // senderAddr is the network address of the sender. 48 senderAddr tcpip.FullAddress 49 // packetInfo holds additional information like the protocol 50 // of the packet etc. 51 packetInfo tcpip.LinkPacketInfo 52 } 53 54 // endpoint is the packet socket implementation of tcpip.Endpoint. It is legal 55 // to have goroutines make concurrent calls into the endpoint. 56 // 57 // Lock order: 58 // endpoint.mu 59 // endpoint.rcvMu 60 // 61 // +stateify savable 62 type endpoint struct { 63 stack.TransportEndpointInfo 64 tcpip.DefaultSocketOptionsHandler 65 66 // The following fields are initialized at creation time and are 67 // immutable. 68 stack *stack.Stack `state:"manual"` 69 netProto tcpip.NetworkProtocolNumber 70 waiterQueue *waiter.Queue 71 cooked bool 72 73 // The following fields are used to manage the receive queue and are 74 // protected by rcvMu. 75 rcvMu sync.Mutex `state:"nosave"` 76 rcvList packetList 77 rcvBufSize int 78 rcvClosed bool 79 80 // The following fields are protected by mu. 81 mu sync.RWMutex `state:"nosave"` 82 closed bool 83 stats tcpip.TransportEndpointStats `state:"nosave"` 84 bound bool 85 boundNIC tcpip.NICID 86 87 // lastErrorMu protects lastError. 88 lastErrorMu sync.Mutex `state:"nosave"` 89 lastError tcpip.Error 90 91 // ops is used to get socket level options. 92 ops tcpip.SocketOptions 93 94 // frozen indicates if the packets should be delivered to the endpoint 95 // during restore. 96 frozen bool 97 } 98 99 // NewEndpoint returns a new packet endpoint. 100 func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { 101 ep := &endpoint{ 102 stack: s, 103 TransportEndpointInfo: stack.TransportEndpointInfo{ 104 NetProto: netProto, 105 }, 106 cooked: cooked, 107 netProto: netProto, 108 waiterQueue: waiterQueue, 109 } 110 ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) 111 ep.ops.SetReceiveBufferSize(32*1024, false /* notify */) 112 113 // Override with stack defaults. 114 var ss tcpip.SendBufferSizeOption 115 if err := s.Option(&ss); err == nil { 116 ep.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) 117 } 118 119 var rs tcpip.ReceiveBufferSizeOption 120 if err := s.Option(&rs); err == nil { 121 ep.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) 122 } 123 124 if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil { 125 return nil, err 126 } 127 return ep, nil 128 } 129 130 // Abort implements stack.TransportEndpoint.Abort. 131 func (ep *endpoint) Abort() { 132 ep.Close() 133 } 134 135 // Close implements tcpip.Endpoint.Close. 136 func (ep *endpoint) Close() { 137 ep.mu.Lock() 138 defer ep.mu.Unlock() 139 140 if ep.closed { 141 return 142 } 143 144 ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep) 145 146 ep.rcvMu.Lock() 147 defer ep.rcvMu.Unlock() 148 149 // Clear the receive list. 150 ep.rcvClosed = true 151 ep.rcvBufSize = 0 152 for !ep.rcvList.Empty() { 153 ep.rcvList.Remove(ep.rcvList.Front()) 154 } 155 156 ep.closed = true 157 ep.bound = false 158 ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 159 } 160 161 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. 162 func (*endpoint) ModerateRecvBuf(int) {} 163 164 // Read implements tcpip.Endpoint.Read. 165 func (ep *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { 166 ep.rcvMu.Lock() 167 168 // If there's no data to read, return that read would block or that the 169 // endpoint is closed. 170 if ep.rcvList.Empty() { 171 var err tcpip.Error = &tcpip.ErrWouldBlock{} 172 if ep.rcvClosed { 173 ep.stats.ReadErrors.ReadClosed.Increment() 174 err = &tcpip.ErrClosedForReceive{} 175 } 176 ep.rcvMu.Unlock() 177 return tcpip.ReadResult{}, err 178 } 179 180 packet := ep.rcvList.Front() 181 if !opts.Peek { 182 ep.rcvList.Remove(packet) 183 ep.rcvBufSize -= packet.data.Size() 184 } 185 186 ep.rcvMu.Unlock() 187 188 res := tcpip.ReadResult{ 189 Total: packet.data.Size(), 190 ControlMessages: tcpip.ControlMessages{ 191 HasTimestamp: true, 192 Timestamp: packet.receivedAt.UnixNano(), 193 }, 194 } 195 if opts.NeedRemoteAddr { 196 res.RemoteAddr = packet.senderAddr 197 } 198 if opts.NeedLinkPacketInfo { 199 res.LinkPacketInfo = packet.packetInfo 200 } 201 202 n, err := packet.data.ReadTo(dst, opts.Peek) 203 if n == 0 && err != nil { 204 return res, &tcpip.ErrBadBuffer{} 205 } 206 res.Count = n 207 return res, nil 208 } 209 210 func (*endpoint) Write(tcpip.Payloader, tcpip.WriteOptions) (int64, tcpip.Error) { 211 return 0, &tcpip.ErrInvalidOptionValue{} 212 } 213 214 // Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be 215 // disconnected, and this function always returns tpcip.ErrNotSupported. 216 func (*endpoint) Disconnect() tcpip.Error { 217 return &tcpip.ErrNotSupported{} 218 } 219 220 // Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be 221 // connected, and this function always returnes *tcpip.ErrNotSupported. 222 func (*endpoint) Connect(tcpip.FullAddress) tcpip.Error { 223 return &tcpip.ErrNotSupported{} 224 } 225 226 // Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used 227 // with Shutdown, and this function always returns *tcpip.ErrNotSupported. 228 func (*endpoint) Shutdown(tcpip.ShutdownFlags) tcpip.Error { 229 return &tcpip.ErrNotSupported{} 230 } 231 232 // Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with 233 // Listen, and this function always returns *tcpip.ErrNotSupported. 234 func (*endpoint) Listen(int) tcpip.Error { 235 return &tcpip.ErrNotSupported{} 236 } 237 238 // Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with 239 // Accept, and this function always returns *tcpip.ErrNotSupported. 240 func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { 241 return nil, nil, &tcpip.ErrNotSupported{} 242 } 243 244 // Bind implements tcpip.Endpoint.Bind. 245 func (ep *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { 246 // "By default, all packets of the specified protocol type are passed 247 // to a packet socket. To get packets only from a specific interface 248 // use bind(2) specifying an address in a struct sockaddr_ll to bind 249 // the packet socket to an interface. Fields used for binding are 250 // sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex." 251 // - packet(7). 252 253 ep.mu.Lock() 254 defer ep.mu.Unlock() 255 256 if ep.bound && ep.boundNIC == addr.NIC { 257 // If the NIC being bound is the same then just return success. 258 return nil 259 } 260 261 // Unregister endpoint with all the nics. 262 ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep) 263 ep.bound = false 264 265 // Bind endpoint to receive packets from specific interface. 266 if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil { 267 return err 268 } 269 270 ep.bound = true 271 ep.boundNIC = addr.NIC 272 273 return nil 274 } 275 276 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. 277 func (*endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { 278 return tcpip.FullAddress{}, &tcpip.ErrNotSupported{} 279 } 280 281 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. 282 func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { 283 // Even a connected socket doesn't return a remote address. 284 return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} 285 } 286 287 // Readiness implements tcpip.Endpoint.Readiness. 288 func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { 289 // The endpoint is always writable. 290 result := waiter.WritableEvents & mask 291 292 // Determine whether the endpoint is readable. 293 if (mask & waiter.ReadableEvents) != 0 { 294 ep.rcvMu.Lock() 295 if !ep.rcvList.Empty() || ep.rcvClosed { 296 result |= waiter.ReadableEvents 297 } 298 ep.rcvMu.Unlock() 299 } 300 301 return result 302 } 303 304 // SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be 305 // used with SetSockOpt, and this function always returns 306 // *tcpip.ErrNotSupported. 307 func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { 308 switch opt.(type) { 309 case *tcpip.SocketDetachFilterOption: 310 return nil 311 312 default: 313 return &tcpip.ErrUnknownProtocolOption{} 314 } 315 } 316 317 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. 318 func (*endpoint) SetSockOptInt(tcpip.SockOptInt, int) tcpip.Error { 319 return &tcpip.ErrUnknownProtocolOption{} 320 } 321 322 func (ep *endpoint) LastError() tcpip.Error { 323 ep.lastErrorMu.Lock() 324 defer ep.lastErrorMu.Unlock() 325 326 err := ep.lastError 327 ep.lastError = nil 328 return err 329 } 330 331 // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. 332 func (ep *endpoint) UpdateLastError(err tcpip.Error) { 333 ep.lastErrorMu.Lock() 334 ep.lastError = err 335 ep.lastErrorMu.Unlock() 336 } 337 338 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 339 func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) tcpip.Error { 340 return &tcpip.ErrNotSupported{} 341 } 342 343 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 344 func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { 345 switch opt { 346 case tcpip.ReceiveQueueSizeOption: 347 v := 0 348 ep.rcvMu.Lock() 349 if !ep.rcvList.Empty() { 350 p := ep.rcvList.Front() 351 v = p.data.Size() 352 } 353 ep.rcvMu.Unlock() 354 return v, nil 355 356 default: 357 return -1, &tcpip.ErrUnknownProtocolOption{} 358 } 359 } 360 361 // HandlePacket implements stack.PacketEndpoint.HandlePacket. 362 func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { 363 ep.rcvMu.Lock() 364 365 // Drop the packet if our buffer is currently full. 366 if ep.rcvClosed { 367 ep.rcvMu.Unlock() 368 ep.stack.Stats().DroppedPackets.Increment() 369 ep.stats.ReceiveErrors.ClosedReceiver.Increment() 370 return 371 } 372 373 rcvBufSize := ep.ops.GetReceiveBufferSize() 374 if ep.frozen || ep.rcvBufSize >= int(rcvBufSize) { 375 ep.rcvMu.Unlock() 376 ep.stack.Stats().DroppedPackets.Increment() 377 ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() 378 return 379 } 380 381 wasEmpty := ep.rcvBufSize == 0 382 383 // Push new packet into receive list and increment the buffer size. 384 var packet packet 385 if !pkt.LinkHeader().View().IsEmpty() { 386 // Get info directly from the ethernet header. 387 hdr := header.Ethernet(pkt.LinkHeader().View()) 388 packet.senderAddr = tcpip.FullAddress{ 389 NIC: nicID, 390 Addr: tcpip.Address(hdr.SourceAddress()), 391 } 392 packet.packetInfo.Protocol = netProto 393 packet.packetInfo.PktType = pkt.PktType 394 } else { 395 // Guess the would-be ethernet header. 396 packet.senderAddr = tcpip.FullAddress{ 397 NIC: nicID, 398 Addr: tcpip.Address(localAddr), 399 } 400 packet.packetInfo.Protocol = netProto 401 packet.packetInfo.PktType = pkt.PktType 402 } 403 404 if ep.cooked { 405 // Cooked packets can simply be queued. 406 switch pkt.PktType { 407 case tcpip.PacketHost: 408 packet.data = pkt.Data().ExtractVV() 409 case tcpip.PacketOutgoing: 410 // Strip Link Header. 411 var combinedVV buffer.VectorisedView 412 if v := pkt.NetworkHeader().View(); !v.IsEmpty() { 413 combinedVV.AppendView(v) 414 } 415 if v := pkt.TransportHeader().View(); !v.IsEmpty() { 416 combinedVV.AppendView(v) 417 } 418 combinedVV.Append(pkt.Data().ExtractVV()) 419 packet.data = combinedVV 420 default: 421 panic(fmt.Sprintf("unexpected PktType in pkt: %+v", pkt)) 422 } 423 } else { 424 // Raw packets need their ethernet headers prepended before 425 // queueing. 426 var linkHeader buffer.View 427 if pkt.PktType != tcpip.PacketOutgoing { 428 if pkt.LinkHeader().View().IsEmpty() { 429 // We weren't provided with an actual ethernet header, 430 // so fake one. 431 ethFields := header.EthernetFields{ 432 SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), 433 DstAddr: localAddr, 434 Type: netProto, 435 } 436 fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) 437 fakeHeader.Encode(ðFields) 438 linkHeader = buffer.View(fakeHeader) 439 } else { 440 linkHeader = append(buffer.View(nil), pkt.LinkHeader().View()...) 441 } 442 combinedVV := linkHeader.ToVectorisedView() 443 combinedVV.Append(pkt.Data().ExtractVV()) 444 packet.data = combinedVV 445 } else { 446 packet.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views()) 447 } 448 } 449 packet.receivedAt = ep.stack.Clock().Now() 450 451 ep.rcvList.PushBack(&packet) 452 ep.rcvBufSize += packet.data.Size() 453 454 ep.rcvMu.Unlock() 455 ep.stats.PacketsReceived.Increment() 456 // Notify waiters that there's data to be read. 457 if wasEmpty { 458 ep.waiterQueue.Notify(waiter.ReadableEvents) 459 } 460 } 461 462 // State implements socket.Socket.State. 463 func (*endpoint) State() uint32 { 464 return 0 465 } 466 467 // Info returns a copy of the endpoint info. 468 func (ep *endpoint) Info() tcpip.EndpointInfo { 469 ep.mu.RLock() 470 // Make a copy of the endpoint info. 471 ret := ep.TransportEndpointInfo 472 ep.mu.RUnlock() 473 return &ret 474 } 475 476 // Stats returns a pointer to the endpoint stats. 477 func (ep *endpoint) Stats() tcpip.EndpointStats { 478 return &ep.stats 479 } 480 481 // SetOwner implements tcpip.Endpoint.SetOwner. 482 func (*endpoint) SetOwner(tcpip.PacketOwner) {} 483 484 // SocketOptions implements tcpip.Endpoint.SocketOptions. 485 func (ep *endpoint) SocketOptions() *tcpip.SocketOptions { 486 return &ep.ops 487 } 488 489 // freeze prevents any more packets from being delivered to the endpoint. 490 func (ep *endpoint) freeze() { 491 ep.mu.Lock() 492 ep.frozen = true 493 ep.mu.Unlock() 494 } 495 496 // thaw unfreezes a previously frozen endpoint using endpoint.freeze() allows 497 // new packets to be delivered again. 498 func (ep *endpoint) thaw() { 499 ep.mu.Lock() 500 ep.frozen = false 501 ep.mu.Unlock() 502 }