// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"container/heap"
	"fmt"
	"io"
	"math"
	"runtime"
	"strings"
	"time"

	"github.com/sagernet/gvisor/pkg/atomicbitops"
	"github.com/sagernet/gvisor/pkg/buffer"
	"github.com/sagernet/gvisor/pkg/sleep"
	"github.com/sagernet/gvisor/pkg/sync"
	"github.com/sagernet/gvisor/pkg/tcpip"
	"github.com/sagernet/gvisor/pkg/tcpip/header"
	"github.com/sagernet/gvisor/pkg/tcpip/ports"
	"github.com/sagernet/gvisor/pkg/tcpip/seqnum"
	"github.com/sagernet/gvisor/pkg/tcpip/stack"
	"github.com/sagernet/gvisor/pkg/waiter"
)

// EndpointState represents the state of a TCP endpoint.
type EndpointState tcpip.EndpointState

// Endpoint states. Note that they are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
const (
	_ EndpointState = iota
	// TCP protocol states in sync with the definitions in
	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing

	// Endpoint states internal to netstack.
	StateInitial
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError
)

const (
	// rcvAdvWndScale is used to split the available socket buffer into
	// application buffer and the window to be advertised to the peer. This is
	// currently hard coded to split the available space equally.
	rcvAdvWndScale = 1

	// SegOverheadFactor is used to multiply the value provided by the
	// user on a SetSockOpt for setting the socket send/receive buffer sizes.
	SegOverheadFactor = 2
)

// connDirectionState is a bit mask: Rcv/Snd closure can be combined, and
// connDirectionStateAll is the union of both closed directions.
type connDirectionState uint32

// Connection direction states used for directionState checks in endpoint struct
// to detect half-closed connection and deliver POLLRDHUP
const (
	connDirectionStateOpen      connDirectionState = 0
	connDirectionStateRcvClosed connDirectionState = 1
	connDirectionStateSndClosed connDirectionState = 2
	connDirectionStateAll       connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed
)

// connected returns true when s is one of the states representing an
// endpoint connected to a peer.
func (s EndpointState) connected() bool {
	switch s {
	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
		return true
	default:
		return false
	}
}

// connecting returns true when s is one of the states representing a
// connection in progress, but not yet fully established.
102 func (s EndpointState) connecting() bool { 103 switch s { 104 case StateConnecting, StateSynSent, StateSynRecv: 105 return true 106 default: 107 return false 108 } 109 } 110 111 // internal returns true when the state is netstack internal. 112 func (s EndpointState) internal() bool { 113 switch s { 114 case StateInitial, StateBound, StateConnecting, StateError: 115 return true 116 default: 117 return false 118 } 119 } 120 121 // handshake returns true when s is one of the states representing an endpoint 122 // in the middle of a TCP handshake. 123 func (s EndpointState) handshake() bool { 124 switch s { 125 case StateSynSent, StateSynRecv: 126 return true 127 default: 128 return false 129 } 130 } 131 132 // closed returns true when s is one of the states an endpoint transitions to 133 // when closed or when it encounters an error. This is distinct from a newly 134 // initialized endpoint that was never connected. 135 func (s EndpointState) closed() bool { 136 switch s { 137 case StateClose, StateError: 138 return true 139 default: 140 return false 141 } 142 } 143 144 // String implements fmt.Stringer.String. 145 func (s EndpointState) String() string { 146 switch s { 147 case StateInitial: 148 return "INITIAL" 149 case StateBound: 150 return "BOUND" 151 case StateConnecting: 152 return "CONNECTING" 153 case StateError: 154 return "ERROR" 155 case StateEstablished: 156 return "ESTABLISHED" 157 case StateSynSent: 158 return "SYN-SENT" 159 case StateSynRecv: 160 return "SYN-RCVD" 161 case StateFinWait1: 162 return "FIN-WAIT1" 163 case StateFinWait2: 164 return "FIN-WAIT2" 165 case StateTimeWait: 166 return "TIME-WAIT" 167 case StateClose: 168 return "CLOSED" 169 case StateCloseWait: 170 return "CLOSE-WAIT" 171 case StateLastAck: 172 return "LAST-ACK" 173 case StateListen: 174 return "LISTEN" 175 case StateClosing: 176 return "CLOSING" 177 default: 178 panic("unreachable") 179 } 180 } 181 182 // SACKInfo holds TCP SACK related information for a given endpoint. 
//
// +stateify savable
type SACKInfo struct {
	// Blocks is the maximum number of SACK blocks we track
	// per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// blocks array above.
	NumBlocks int
}

// ReceiveErrors collect segment receive errors within transport layer.
//
// +stateify savable
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}

// SendErrors collect segment send errors within the transport layer.
//
// +stateify savable
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments failed to be sent
	// to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}

// Stats holds statistics about the endpoint.
//
// +stateify savable
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}

// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}

// sndQueueInfo implements a send queue.
//
// +stateify savable
type sndQueueInfo struct {
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}

// CloneState clones sq into other.
// It is not thread safe (sq.sndQueueMu is not acquired here).
func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) {
	other.SndBufSize = sq.SndBufSize
	other.SndBufUsed = sq.SndBufUsed
	other.SndClosed = sq.SndClosed
	other.PacketTooBigCount = sq.PacketTooBigCount
	other.SndMTU = sq.SndMTU
	other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad())
}

// Endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint must be held for all operations except
// in e.Readiness where acquiring it will result in a deadlock in epoll
// implementation.
//
// The following three mutexes can be acquired independent of e.mu but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> Protects e.acceptQueue.
// e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue
// itself.
// e.sndQueueMu -> Protects the e.sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a short while.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type Endpoint struct {
	stack.TCPEndpointStateInner
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue endpoints for processing to a given
	// TCP processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	endpointEntry `state:"nosave"`

	// pendingProcessingMu protects pendingProcessing.
	pendingProcessingMu sync.Mutex `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	// +checklocks:pendingProcessingMu
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	protocol    *protocol     `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is stateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	rcvQueueMu sync.Mutex `state:"nosave"`

	// +checklocks:rcvQueueMu
	stack.TCPRcvBufState

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	rcvMemUsed atomicbitops.Int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	mu          sync.CrossGoroutineMutex `state:"nosave"`
	ownedByUser atomicbitops.Uint32

	// rcvQueue is the queue for ready-for-delivery segments.
	//
	// +checklocks:mu
	rcvQueue segmentList `state:"wait"`

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state atomicbitops.Uint32 `state:".(EndpointState)"`

	// connectionDirectionState holds current state of send and receive,
	// accessed atomically
	connectionDirectionState atomicbitops.Uint32

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to its correct
	// state.
	origEndpointState uint32 `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ipv4TTL           uint8
	ipv6HopLimit      int16
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	// +checklocks:mu
	h *handshake

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (which ever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// recentTSTime is the unix time when we last updated
	// TCPEndpointStateInner.RecentTS.
	recentTSTime tcpip.MonotonicTime

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss recovery algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// sndQueueInfo contains the implementation of the endpoint's send queue.
	sndQueueInfo sndQueueInfo

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibly aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// acceptMu protects acceptQueue.
	acceptMu sync.Mutex `state:"nosave"`

	// acceptQueue is used by a listening endpoint to send newly accepted
	// connections to the endpoint so that they can be read by Accept()
	// calls.
	//
	// +checklocks:acceptMu
	acceptQueue acceptQueue

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso stack.GSO

	stats Stats

	// tcpLingerTimeout is the maximum amount of time a socket
	// stays in TIME_WAIT state before being marked
	// closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called closed on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime tcpip.MonotonicTime

	// finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer
	// is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close()
	// has been called on the socket. This timer is not started for sockets that
	// are waiting for a peer FIN but are not closed.
	finWait2Timer tcpip.Timer `state:"nosave"`

	// timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state
	// for tcp.DefaultTCPTimeWaitTimeout seconds.
	timeWaitTimer tcpip.Timer `state:"nosave"`

	// listenCtx is used by listening endpoints to store state used while listening for
	// connections. Nil otherwise.
	listenCtx *listenContext `state:"nosave"`

	// limRdr is reused to avoid allocations.
	//
	// +checklocks:mu
	limRdr *io.LimitedReader `state:"nosave"`
}

// UniqueID implements stack.TransportEndpoint.UniqueID.
func (e *Endpoint) UniqueID() uint64 {
	return e.uniqueID
}

// calculateAdvertisedMSS calculates the MSS to advertise.
//
// If userMSS is non-zero and is not greater than the maximum possible MSS for
// r, it will be used; otherwise, the maximum possible MSS will be used.
func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
	// The maximum possible MSS is dependent on the route.
	// TODO(b/143359391): Respect TCP Min and Max size.
	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)

	if userMSS != 0 && userMSS < maxMSS {
		return userMSS
	}

	return maxMSS
}

// isOwnedByUser() returns true if the endpoint lock is currently
// held by a user(syscall) goroutine.
func (e *Endpoint) isOwnedByUser() bool {
	return e.ownedByUser.Load() == 1
}

// LockUser tries to lock e.mu and if it fails it will check if the lock is held
// by another syscall goroutine. If yes, then it will go to sleep waiting for the
// lock to be released, if not then it will spin till it acquires the lock or
// another syscall goroutine acquires it in which case it will go to sleep as
// described above.
//
// The assumption behind spinning here being that background packet processing
// should not be holding the lock for long and spinning reduces latency as we
// avoid an expensive sleep/wakeup of the syscall goroutine).
// +checklocksacquire:e.mu
func (e *Endpoint) LockUser() {
	const iterations = 5
	for i := 0; i < iterations; i++ {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if e.ownedByUser.Load() == 1 {
				e.mu.Lock()
				e.ownedByUser.Store(1)
				return
			}
			// Spin but don't yield the processor since the lower half
			// should yield the lock soon.
			continue
		}
		e.ownedByUser.Store(1)
		return
	}

	for i := 0; i < iterations; i++ {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if e.ownedByUser.Load() == 1 {
				e.mu.Lock()
				e.ownedByUser.Store(1)
				return
			}
			// Spin but yield the processor since the lower half
			// should yield the lock soon.
			runtime.Gosched()
			continue
		}
		e.ownedByUser.Store(1)
		return
	}

	// Finally just give up and wait for the Lock.
	e.mu.Lock()
	e.ownedByUser.Store(1)
}

// UnlockUser will check if there are any segments already queued for processing
// and wake up a processor goroutine to process them before unlocking e.mu.
// This is required because when packets arrive and the endpoint lock is already
// held then such packets are queued up to be processed.
//
// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
// +checklocksrelease:e.mu
func (e *Endpoint) UnlockUser() {
	// Lock segment queue before checking so that we avoid a race where
	// segments can be queued between the time we check if queue is empty
	// and actually unlock the endpoint mutex.
	e.segmentQueue.mu.Lock()
	if e.segmentQueue.emptyLocked() {
		if e.ownedByUser.Swap(0) != 1 {
			panic("e.UnlockUser() called without calling e.LockUser()")
		}
		e.mu.Unlock()
		e.segmentQueue.mu.Unlock()
		return
	}
	e.segmentQueue.mu.Unlock()

	// Since we are waking the processor goroutine here just unlock
	// and let it process the queued segments.
	if e.ownedByUser.Swap(0) != 1 {
		panic("e.UnlockUser() called without calling e.LockUser()")
	}
	processor := e.protocol.dispatcher.selectProcessor(e.ID)
	e.mu.Unlock()

	// Wake up the processor for this endpoint to process any queued
	// segments after releasing the lock to avoid the case where if the
	// processor goroutine starts running before we release the lock here
	// then it will fail to process as TryLock() will fail.
	processor.queueEndpoint(e)
	return
}

// StopWork halts packet processing. Only to be used in tests.
// +checklocksacquire:e.mu
func (e *Endpoint) StopWork() {
	e.mu.Lock()
}

// ResumeWork resumes packet processing. Only to be used in tests.
// +checklocksrelease:e.mu
func (e *Endpoint) ResumeWork() {
	e.mu.Unlock()
}

// AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is
// used in places where we know that e.mu is held, but checklocks does not,
// which can happen when creating new locked objects. You must pass the known
// locked endpoint to this function and it must be the same as the caller
// endpoint.
// TODO(b/226403629): Remove this function once checklocks understands local
// variable locks.
// +checklocks:locked.mu
// +checklocksacquire:e.mu
func (e *Endpoint) AssertLockHeld(locked *Endpoint) {
	if e != locked {
		panic("AssertLockHeld failed: locked endpoint != asserting endpoint")
	}
}

// TryLock is a helper that calls TryLock on the endpoint's mutex and
// adds the necessary checklocks annotations.
// TODO(b/226403629): Remove this once checklocks understands TryLock.
// +checklocksacquire:e.mu
func (e *Endpoint) TryLock() bool {
	if e.mu.TryLock() {
		return true // +checklocksforce
	}
	return false // +checklocksignore
}

// setEndpointState updates the state of the endpoint to state atomically.
// This method is unexported as the only place we should update the state is in
// this package but we allow the state to be read freely without holding e.mu.
//
// +checklocks:e.mu
func (e *Endpoint) setEndpointState(state EndpointState) {
	oldstate := EndpointState(e.state.Swap(uint32(state)))
	// Keep the stack-wide TCP connection counters in sync with the
	// transition. Note the deliberate fallthroughs: StateError shares the
	// StateClose accounting, and both fall through to the common
	// CurrentEstablished decrement in default.
	switch state {
	case StateEstablished:
		e.stack.Stats().TCP.CurrentEstablished.Increment()
		e.stack.Stats().TCP.CurrentConnected.Increment()
	case StateError:
		fallthrough
	case StateClose:
		if oldstate == StateCloseWait || oldstate == StateEstablished {
			e.stack.Stats().TCP.EstablishedResets.Increment()
		}
		if oldstate.connected() {
			e.stack.Stats().TCP.CurrentConnected.Decrement()
		}
		fallthrough
	default:
		if oldstate == StateEstablished {
			e.stack.Stats().TCP.CurrentEstablished.Decrement()
		}
	}
}

// EndpointState returns the current state of the endpoint.
func (e *Endpoint) EndpointState() EndpointState {
	return EndpointState(e.state.Load())
}

// setRecentTimestamp sets the recentTS field to the provided value.
func (e *Endpoint) setRecentTimestamp(recentTS uint32) {
	e.RecentTS = recentTS
	e.recentTSTime = e.stack.Clock().NowMonotonic()
}

// recentTimestamp returns the value of the recentTS field.
func (e *Endpoint) recentTimestamp() uint32 {
	return e.RecentTS
}

// TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a
// network.Endpoint, which also defines this function.
// calculateTTL returns the TTL (IPv4) or hop limit (IPv6) to use for the
// route's network protocol, falling back to the route's default when the
// caller passed the protocol's "use default" sentinel. Panics on any other
// network protocol number.
func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 {
	switch netProto := route.NetProto(); netProto {
	case header.IPv4ProtocolNumber:
		if ipv4TTL == tcpip.UseDefaultIPv4TTL {
			return route.DefaultTTL()
		}
		return ipv4TTL
	case header.IPv6ProtocolNumber:
		if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit {
			return route.DefaultTTL()
		}
		return uint8(ipv6HopLimit)
	default:
		panic(fmt.Sprintf("invalid protocol number = %d", netProto))
	}
}

// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	idle       time.Duration
	interval   time.Duration
	count      int
	unacked    int
	// timer should never be a zero timer if the endpoint is not closed.
	timer timer       `state:"nosave"`
	waker sleep.Waker `state:"nosave"`
}

// newEndpoint creates a TCP endpoint in the initial state, applying any
// stack-wide TCP protocol options (buffer sizes, congestion control, linger
// timeout, SYN retries, etc.) as the endpoint's defaults.
func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *Endpoint {
	e := &Endpoint{
		stack:    s,
		protocol: protocol,
		TransportEndpointInfo: stack.TransportEndpointInfo{
			NetProto:   netProto,
			TransProto: header.TCPProtocolNumber,
		},
		sndQueueInfo: sndQueueInfo{
			TCPSndBufState: stack.TCPSndBufState{
				SndMTU: math.MaxInt32,
			},
		},
		waiterQueue: waiterQueue,
		state:       atomicbitops.FromUint32(uint32(StateInitial)),
		keepalive: keepalive{
			idle:     DefaultKeepaliveIdle,
			interval: DefaultKeepaliveInterval,
			count:    DefaultKeepaliveCount,
		},
		uniqueID:     s.UniqueID(),
		ipv4TTL:      tcpip.UseDefaultIPv4TTL,
		ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit,
		// txHash only determines which outgoing queue to use, so
		// InsecureRNG is fine.
		txHash:        s.InsecureRNG().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
		limRdr:        &io.LimitedReader{},
	}
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)

	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		e.RcvAutoParams.Disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e

	// TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes
	// established.
	e.keepalive.timer.init(e.stack.Clock(), timerHandler(e, e.keepaliveTimerExpired))

	return e
}

// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
//
// Note: this deliberately does not acquire e.mu (see the locking comment on
// Endpoint) — it relies on the atomic state and the finer-grained mutexes.
func (e *Endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	result := waiter.EventMask(0)

	switch e.EndpointState() {
	case StateInitial, StateBound:
		// This prevents blocking of new sockets which are not
		// connected when SO_LINGER is set.
		result |= waiter.EventHUp

	case StateConnecting, StateSynSent, StateSynRecv:
		// Ready for nothing.

	case StateClose, StateError, StateTimeWait:
		// Ready for anything.
		result = mask

	case StateListen:
		// Check if there's anything in the accepted queue.
		if (mask & waiter.ReadableEvents) != 0 {
			e.acceptMu.Lock()
			if e.acceptQueue.endpoints.Len() != 0 {
				result |= waiter.ReadableEvents
			}
			e.acceptMu.Unlock()
		}
	}
	if e.EndpointState().connected() {
		// Determine if the endpoint is writable if requested.
		if (mask & waiter.WritableEvents) != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			sndBufSize := e.getSendBufferSize()
			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
				result |= waiter.WritableEvents
			}
			if e.sndQueueInfo.SndClosed {
				e.updateConnDirectionState(connDirectionStateSndClosed)
			}
			e.sndQueueInfo.sndQueueMu.Unlock()
		}

		// Determine if the endpoint is readable if requested.
		if (mask & waiter.ReadableEvents) != 0 {
			e.rcvQueueMu.Lock()
			if e.RcvBufUsed > 0 || e.RcvClosed {
				result |= waiter.ReadableEvents
			}
			if e.RcvClosed {
				e.updateConnDirectionState(connDirectionStateRcvClosed)
			}
			e.rcvQueueMu.Unlock()
		}
	}

	// Determine whether endpoint is half-closed with rcv shutdown
	if e.connDirectionState() == connDirectionStateRcvClosed {
		result |= waiter.EventRdHUp
	}

	return result
}

// purgePendingRcvQueue drops all out-of-order segments held in
// rcv.pendingRcvdSegments. Purging pending rcv segments is only necessary on
// RST.
func (e *Endpoint) purgePendingRcvQueue() {
	if e.rcv != nil {
		for e.rcv.pendingRcvdSegments.Len() > 0 {
			s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment)
			s.DecRef()
		}
	}
}

// purgeReadQueue drops all ready-for-delivery segments and resets the
// receive-buffer accounting.
//
// +checklocks:e.mu
func (e *Endpoint) purgeReadQueue() {
	if e.rcv != nil {
		e.rcvQueueMu.Lock()
		defer e.rcvQueueMu.Unlock()
		for {
			s := e.rcvQueue.Front()
			if s == nil {
				break
			}
			e.rcvQueue.Remove(s)
			s.DecRef()
		}
		e.RcvBufUsed = 0
	}
}

// purgeWriteQueue drops all unsent segments, resets the send-buffer
// accounting and marks the send side closed.
//
// +checklocks:e.mu
func (e *Endpoint) purgeWriteQueue() {
	if e.snd != nil {
		e.sndQueueInfo.sndQueueMu.Lock()
		defer e.sndQueueInfo.sndQueueMu.Unlock()
		e.snd.updateWriteNext(nil)
		for {
			s := e.snd.writeList.Front()
			if s == nil {
				break
			}
			e.snd.writeList.Remove(s)
			s.DecRef()
		}
		e.sndQueueInfo.SndBufUsed = 0
		e.sndQueueInfo.SndClosed = true
	}
}

// Abort implements stack.TransportEndpoint.Abort.
func (e *Endpoint) Abort() {
	defer e.drainClosingSegmentQueue()
	e.LockUser()
	defer e.UnlockUser()
	defer e.purgeReadQueue()
	// Reset all connected endpoints.
1044 switch state := e.EndpointState(); { 1045 case state.connected(): 1046 e.resetConnectionLocked(&tcpip.ErrAborted{}) 1047 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1048 return 1049 } 1050 e.closeLocked() 1051 } 1052 1053 // Close puts the endpoint in a closed state and frees all resources associated 1054 // with it. It must be called only once and with no other concurrent calls to 1055 // the endpoint. 1056 func (e *Endpoint) Close() { 1057 e.LockUser() 1058 if e.closed { 1059 e.UnlockUser() 1060 return 1061 } 1062 1063 // We always want to purge the read queue, but do so after the checks in 1064 // shutdownLocked. 1065 e.closeLocked() 1066 e.purgeReadQueue() 1067 if e.EndpointState() == StateClose || e.EndpointState() == StateError { 1068 // It should be safe to purge the read queue now as the endpoint 1069 // is now closed or in an error state and further reads are not 1070 // permitted. 1071 e.UnlockUser() 1072 e.drainClosingSegmentQueue() 1073 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1074 return 1075 } 1076 e.UnlockUser() 1077 } 1078 1079 // +checklocks:e.mu 1080 func (e *Endpoint) closeLocked() { 1081 linger := e.SocketOptions().GetLinger() 1082 if linger.Enabled && linger.Timeout == 0 { 1083 s := e.EndpointState() 1084 isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv 1085 if isResetState { 1086 // Close the endpoint without doing full shutdown and 1087 // send a RST. 1088 e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) 1089 return 1090 } 1091 } 1092 1093 // Issue a shutdown so that the peer knows we won't send any more data 1094 // if we're connected, or stop accepting if we're listening. 
1095 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1096 e.closeNoShutdownLocked() 1097 } 1098 1099 // closeNoShutdown closes the endpoint without doing a full shutdown. 1100 // +checklocks:e.mu 1101 func (e *Endpoint) closeNoShutdownLocked() { 1102 // For listening sockets, we always release ports inline so that they 1103 // are immediately available for reuse after Close() is called. If also 1104 // registered, we unregister as well otherwise the next user would fail 1105 // in Listen() when trying to register. 1106 if e.EndpointState() == StateListen && e.isPortReserved { 1107 if e.isRegistered { 1108 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1109 e.isRegistered = false 1110 } 1111 1112 portRes := ports.Reservation{ 1113 Networks: e.effectiveNetProtos, 1114 Transport: ProtocolNumber, 1115 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1116 Port: e.TransportEndpointInfo.ID.LocalPort, 1117 Flags: e.boundPortFlags, 1118 BindToDevice: e.boundBindToDevice, 1119 Dest: e.boundDest, 1120 } 1121 e.stack.ReleasePort(portRes) 1122 e.isPortReserved = false 1123 e.boundBindToDevice = 0 1124 e.boundPortFlags = ports.Flags{} 1125 e.boundDest = tcpip.FullAddress{} 1126 } 1127 1128 // Mark endpoint as closed. 1129 e.closed = true 1130 tcpip.AddDanglingEndpoint(e) 1131 1132 eventMask := waiter.ReadableEvents | waiter.WritableEvents 1133 1134 switch e.EndpointState() { 1135 case StateInitial, StateBound, StateListen: 1136 e.setEndpointState(StateClose) 1137 fallthrough 1138 case StateClose, StateError: 1139 eventMask |= waiter.EventHUp 1140 e.cleanupLocked() 1141 case StateConnecting, StateSynSent, StateSynRecv: 1142 // Abort the handshake and set the error. 1143 // Notify that the endpoint is closed. 1144 eventMask |= waiter.EventHUp 1145 e.handshakeFailed(&tcpip.ErrAborted{}) 1146 // Notify that the endpoint is closed. 
1147 eventMask |= waiter.EventHUp 1148 case StateFinWait2: 1149 // The socket has been closed and we are in FIN-WAIT-2 so start 1150 // the FIN-WAIT-2 timer. 1151 if e.finWait2Timer == nil { 1152 e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) 1153 } 1154 } 1155 1156 e.waiterQueue.Notify(eventMask) 1157 } 1158 1159 // closePendingAcceptableConnections closes all connections that have completed 1160 // handshake but not yet been delivered to the application. 1161 func (e *Endpoint) closePendingAcceptableConnectionsLocked() { 1162 e.acceptMu.Lock() 1163 1164 pendingEndpoints := e.acceptQueue.pendingEndpoints 1165 e.acceptQueue.pendingEndpoints = nil 1166 1167 completedEndpoints := make([]*Endpoint, 0, e.acceptQueue.endpoints.Len()) 1168 for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() { 1169 completedEndpoints = append(completedEndpoints, n.Value.(*Endpoint)) 1170 } 1171 e.acceptQueue.endpoints.Init() 1172 e.acceptQueue.capacity = 0 1173 e.acceptMu.Unlock() 1174 1175 // Close any endpoints in SYN-RCVD state. 1176 for n := range pendingEndpoints { 1177 n.Abort() 1178 } 1179 1180 // Reset all connections that are waiting to be accepted. 1181 for _, n := range completedEndpoints { 1182 n.Abort() 1183 } 1184 } 1185 1186 // cleanupLocked frees all resources associated with the endpoint. 1187 // +checklocks:e.mu 1188 func (e *Endpoint) cleanupLocked() { 1189 if e.snd != nil { 1190 e.snd.resendTimer.cleanup() 1191 e.snd.probeTimer.cleanup() 1192 e.snd.reorderTimer.cleanup() 1193 e.snd.corkTimer.cleanup() 1194 } 1195 1196 if e.finWait2Timer != nil { 1197 e.finWait2Timer.Stop() 1198 } 1199 1200 if e.timeWaitTimer != nil { 1201 e.timeWaitTimer.Stop() 1202 } 1203 1204 // Close all endpoints that might have been accepted by TCP but not by 1205 // the client. 
1206 e.closePendingAcceptableConnectionsLocked() 1207 e.keepalive.timer.cleanup() 1208 1209 if e.isRegistered { 1210 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1211 e.isRegistered = false 1212 } 1213 1214 if e.isPortReserved { 1215 portRes := ports.Reservation{ 1216 Networks: e.effectiveNetProtos, 1217 Transport: ProtocolNumber, 1218 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1219 Port: e.TransportEndpointInfo.ID.LocalPort, 1220 Flags: e.boundPortFlags, 1221 BindToDevice: e.boundBindToDevice, 1222 Dest: e.boundDest, 1223 } 1224 e.stack.ReleasePort(portRes) 1225 e.isPortReserved = false 1226 } 1227 e.boundBindToDevice = 0 1228 e.boundPortFlags = ports.Flags{} 1229 e.boundDest = tcpip.FullAddress{} 1230 1231 if e.route != nil { 1232 e.route.Release() 1233 e.route = nil 1234 } 1235 1236 e.purgeWriteQueue() 1237 // Only purge the read queue here if the socket is fully closed by the 1238 // user. 1239 if e.closed { 1240 e.purgeReadQueue() 1241 } 1242 e.stack.CompleteTransportEndpointCleanup(e) 1243 tcpip.DeleteDanglingEndpoint(e) 1244 } 1245 1246 // wndFromSpace returns the window that we can advertise based on the available 1247 // receive buffer space. 1248 func wndFromSpace(space int) int { 1249 return space >> rcvAdvWndScale 1250 } 1251 1252 // initialReceiveWindow returns the initial receive window to advertise in the 1253 // SYN/SYN-ACK. 1254 func (e *Endpoint) initialReceiveWindow() int { 1255 rcvWnd := wndFromSpace(e.receiveBufferAvailable()) 1256 if rcvWnd > math.MaxUint16 { 1257 rcvWnd = math.MaxUint16 1258 } 1259 1260 // Use the user supplied MSS, if available. 1261 routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2 1262 if rcvWnd > routeWnd { 1263 rcvWnd = routeWnd 1264 } 1265 rcvWndScale := e.rcvWndScaleForHandshake() 1266 1267 // Round-down the rcvWnd to a multiple of wndScale. 
This ensures that the 1268 // window offered in SYN won't be reduced due to the loss of precision if 1269 // window scaling is enabled after the handshake. 1270 rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale) 1271 1272 // Ensure we can always accept at least 1 byte if the scale specified 1273 // was too high for the provided rcvWnd. 1274 if rcvWnd == 0 { 1275 rcvWnd = 1 1276 } 1277 1278 return rcvWnd 1279 } 1280 1281 // ModerateRecvBuf adjusts the receive buffer and the advertised window 1282 // based on the number of bytes copied to userspace. 1283 func (e *Endpoint) ModerateRecvBuf(copied int) { 1284 e.LockUser() 1285 defer e.UnlockUser() 1286 1287 sendNonZeroWindowUpdate := false 1288 1289 e.rcvQueueMu.Lock() 1290 if e.RcvAutoParams.Disabled { 1291 e.rcvQueueMu.Unlock() 1292 return 1293 } 1294 now := e.stack.Clock().NowMonotonic() 1295 if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt { 1296 e.RcvAutoParams.CopiedBytes += copied 1297 e.rcvQueueMu.Unlock() 1298 return 1299 } 1300 prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied 1301 prevCopied := e.RcvAutoParams.PrevCopiedBytes 1302 rcvWnd := 0 1303 if prevRTTCopied > prevCopied { 1304 // The minimal receive window based on what was copied by the app 1305 // in the immediate preceding RTT and some extra buffer for 16 1306 // segments to account for variations. 1307 // We multiply by 2 to account for packet losses. 1308 rcvWnd = prevRTTCopied*2 + 16*int(e.amss) 1309 1310 // Scale for slow start based on bytes copied in this RTT vs previous. 1311 grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied 1312 1313 // Multiply growth factor by 2 again to account for sender being 1314 // in slow-start where the sender grows it's congestion window 1315 // by 100% per RTT. 1316 rcvWnd += grow * 2 1317 1318 // Make sure auto tuned buffer size can always receive upto 2x 1319 // the initial window of 10 segments. 
1320 if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd { 1321 rcvWnd = minRcvWnd 1322 } 1323 1324 // Cap the auto tuned buffer size by the maximum permissible 1325 // receive buffer size. 1326 if max := e.maxReceiveBufferSize(); rcvWnd > max { 1327 rcvWnd = max 1328 } 1329 1330 // We do not adjust downwards as that can cause the receiver to 1331 // reject valid data that might already be in flight as the 1332 // acceptable window will shrink. 1333 rcvBufSize := int(e.ops.GetReceiveBufferSize()) 1334 if rcvWnd > rcvBufSize { 1335 availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize)) 1336 e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */) 1337 availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd)) 1338 if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above { 1339 sendNonZeroWindowUpdate = true 1340 } 1341 } 1342 1343 // We only update PrevCopiedBytes when we grow the buffer because in cases 1344 // where PrevCopiedBytes > prevRTTCopied the existing buffer is already big 1345 // enough to handle the current rate and we don't need to do any 1346 // adjustments. 1347 e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied 1348 } 1349 e.RcvAutoParams.MeasureTime = now 1350 e.RcvAutoParams.CopiedBytes = 0 1351 e.rcvQueueMu.Unlock() 1352 1353 // Send the update after unlocking rcvQueueMu as sending a segment acquires 1354 // the lock to calculate the window to be sent. 1355 if e.EndpointState().connected() && sendNonZeroWindowUpdate { 1356 e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu 1357 } 1358 } 1359 1360 // SetOwner implements tcpip.Endpoint.SetOwner. 
func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}

// hardErrorLocked reads and clears the terminal (hard) error, if any.
//
// +checklocks:e.mu
func (e *Endpoint) hardErrorLocked() tcpip.Error {
	err := e.hardError
	e.hardError = nil
	return err
}

// lastErrorLocked reads and clears the most recent soft error, if any.
//
// +checklocks:e.mu
func (e *Endpoint) lastErrorLocked() tcpip.Error {
	e.lastErrorMu.Lock()
	defer e.lastErrorMu.Unlock()
	err := e.lastError
	e.lastError = nil
	return err
}

// LastError implements tcpip.Endpoint.LastError.
func (e *Endpoint) LastError() tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	// Hard errors take precedence over soft errors; both reads clear the
	// stored error.
	if err := e.hardErrorLocked(); err != nil {
		return err
	}
	return e.lastErrorLocked()
}

// LastErrorLocked reads and clears lastError.
// Only to be used in tests.
// +checklocks:e.mu
func (e *Endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}

// UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
func (e *Endpoint) UpdateLastError(err tcpip.Error) {
	e.LockUser()
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()
	e.UnlockUser()
}

// Read implements tcpip.Endpoint.Read.
func (e *Endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if err := e.checkReadLocked(); err != nil {
		if _, ok := err.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, err
	}

	var err error
	done := 0
	// N.B. Here we get the first segment to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold e.mu to ensure we only
	// remove segments from the list through Read() and that new segments
	// cannot be appended.
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// Peek does not consume; just walk to the next segment.
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Release segments whose payload has been fully consumed
			// (payloadSize() == 0 after ReadTo drained them).
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// checkRead checks that endpoint is in a readable state.
//
// +checklocks:e.mu
func (e *Endpoint) checkReadLocked() tcpip.Error {
	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()
	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	bufUsed := e.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			if err := e.hardErrorLocked(); err != nil {
				return err
			}
			return &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return &tcpip.ErrNotConnected{}
	}

	if e.RcvBufUsed == 0 {
		if e.RcvClosed || !e.EndpointState().connected() {
			return &tcpip.ErrClosedForReceive{}
		}
		return &tcpip.ErrWouldBlock{}
	}

	return nil
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	limRdr := e.limRdr
	if !opts.Atomic {
		// Take the cached LimitedReader out of the endpoint while the locks
		// are dropped so a concurrent writer cannot also use it; restore it
		// on return.
		defer func() {
			e.limRdr = limRdr
		}()
		e.limRdr = nil

		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReaderAndLimitedReader(p, int64(avail), limRdr); err != nil {
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
// +checklocks:e.mu
func (e *Endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	buf, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}

	// Do not queue zero length segments.
	if buf.Size() == 0 {
		return nil, 0, nil
	}

	if !opts.Atomic {
		// Since we released locks in between it's possible that the
		// endpoint transitioned to a CLOSED/ERROR states so make
		// sure endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			buf.Release()
			return nil, 0, err
		}

		// A simultaneous call to write on the socket can reduce avail. Discard
		// excess data copied if this is the case.
		if int64(avail) < buf.Size() {
			buf.Truncate(int64(avail))
		}
	}

	// Add data to the send queue.
	size := int(buf.Size())
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf)
	e.sndQueueInfo.SndBufUsed += size
	e.snd.writeList.PushBack(s)

	return s, size, nil
}

// Write writes data to the endpoint's peer.
func (e *Endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.LockUser()
	defer e.UnlockUser()

	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *Endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out socket buffer space before we can fill the
	// previous window we advertised. In cases where we receive MSS sized or close
	// MSS sized segments we will probably run out of window space before we
	// exhaust receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
// +checklocks:e.mu
func (e *Endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large. We also want to ensure a
// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
// receive buffer size. This is chosen arbitrarily.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *Endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked(rcvBufSize))
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size that
	// is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}

	switch {
	case oldAvail < threshold && newAvail >= threshold:
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
func (e *Endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
func (e *Endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
func (e *Endpoint) OnKeepAliveSet(bool) {
	e.LockUser()
	e.resetKeepaliveTimer(true /* receivedData */)
	e.UnlockUser()
}

// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
func (e *Endpoint) OnDelayOptionSet(v bool) {
	if !v {
		e.LockUser()
		defer e.UnlockUser()
		// Handle delayed data.
		if e.EndpointState().connected() {
			e.sendData(nil /* next */)
		}
	}
}

// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
func (e *Endpoint) OnCorkOptionSet(v bool) {
	if !v {
		e.LockUser()
		defer e.UnlockUser()
		if e.snd != nil {
			e.snd.corkTimer.disable()
		}
		// Handle the corked data.
		if e.EndpointState().connected() {
			e.sendData(nil /* next */)
		}
	}
}

// getSendBufferSize returns the current send buffer size from the socket
// options.
func (e *Endpoint) getSendBufferSize() int {
	return int(e.ops.GetSendBufferSize())
}

// OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
func (e *Endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) {
	e.LockUser()

	sendNonZeroWindowUpdate := false
	e.rcvQueueMu.Lock()

	// Make sure the receive buffer size allows us to send a
	// non-zero window size.
	scale := uint8(0)
	if e.rcv != nil {
		scale = e.rcv.RcvWndScale
	}
	if rcvBufSz>>scale == 0 {
		rcvBufSz = 1 << scale
	}

	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
	// An explicit buffer size disables auto-tuning.
	e.RcvAutoParams.Disabled = true

	// Immediately send an ACK to uncork the sender silly window
	// syndrome prevention, when our available space grows above aMSS
	// or half receive buffer, whichever smaller.
	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
		sendNonZeroWindowUpdate = true
	}

	e.rcvQueueMu.Unlock()

	// The window update is deferred to postSet so it runs after the new
	// buffer size has been committed by the caller.
	postSet = func() {
		e.LockUser()
		defer e.UnlockUser()
		if e.EndpointState().connected() && sendNonZeroWindowUpdate {
			e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
		}

	}
	e.UnlockUser()
	return rcvBufSz, postSet
}

// OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize.
func (e *Endpoint) OnSetSendBufferSize(sz int64) int64 {
	// An explicit buffer size disables send-buffer auto-tuning.
	e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1)
	return sz
}

// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters.
func (e *Endpoint) WakeupWriters() {
	e.LockUser()
	defer e.UnlockUser()

	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// SetSockOptInt sets a socket option.
func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
	const inetECNMask = 3

	switch opt {
	case tcpip.KeepaliveCountOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.count = v
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case tcpip.IPv4TOSOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.userMSS = uint16(userMSS)
		e.UnlockUser()

	case tcpip.MTUDiscoverOption:
		// Return not supported if attempting to set this option to
		// anything other than path MTU discovery disabled.
		if v != tcpip.PMTUDiscoveryDont {
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.IPv4TTLOption:
		e.LockUser()
		e.ipv4TTL = uint8(v)
		e.UnlockUser()

	case tcpip.IPv6HopLimitOption:
		e.LockUser()
		e.ipv6HopLimit = int16(v)
		e.UnlockUser()

	case tcpip.TCPSynCountOption:
		if v < 1 || v > 255 {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.maxSynRetries = uint8(v)
		e.UnlockUser()

	case tcpip.TCPWindowClampOption:
		if v == 0 {
			e.LockUser()
			// A zero clamp is only valid before the connection exists.
			switch e.EndpointState() {
			case StateClose, StateInitial:
				e.windowClamp = 0
				e.UnlockUser()
				return nil
			default:
				e.UnlockUser()
				return &tcpip.ErrInvalidOptionValue{}
			}
		}
		// Clamp is not allowed below half of the minimum receive buffer size.
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if v < rs.Min/2 {
				v = rs.Min / 2
			}
		}
		e.LockUser()
		e.windowClamp = uint32(v)
		e.UnlockUser()
	}
	return nil
}

// HasNIC returns true if the NICID is defined in the stack or id is 0.
func (e *Endpoint) HasNIC(id int32) bool {
	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt sets a socket option.
func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		// Re-arm the keepalive timer so the new idle interval applies now.
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.KeepaliveIntervalOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				switch state {
				case StateEstablished:
					// NOTE(review): state was read under e.mu just above, so
					// this inner re-check appears always-true — confirm
					// whether it is vestigial.
					if e.EndpointState() == state {
						// Switch the sender's congestion control on the fly.
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to Stack's default TCP_LINGER2 timeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		// Cap the defer-accept period at the maximum retransmission timeout.
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
func (e *Endpoint) readyReceiveSize() (int, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The endpoint cannot be in listen state.
	if e.EndpointState() == StateListen {
		return 0, &tcpip.ErrInvalidEndpointState{}
	}

	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()

	return e.RcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		v := e.keepalive.count
		e.keepalive.Unlock()
		return v, nil

	case tcpip.IPv4TOSOption:
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6TrafficClassOption:
		// Stored in the same field as the IPv4 TOS byte.
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.MaxSegOption:
		// Linux only returns user_mss value if user_mss is set and the socket is
		// unconnected. Otherwise Linux returns the actual current MSS. Netstack
		// mimics the user_mss behavior, but otherwise just returns the defaultMSS
		// for now.
		v := header.TCPDefaultMSS
		e.LockUser()
		if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) {
			v = int(e.userMSS)
		}
		e.UnlockUser()
		return v, nil

	case tcpip.MTUDiscoverOption:
		// Always return the path MTU discovery disabled setting since
		// it's the only one supported.
		return tcpip.PMTUDiscoveryDont, nil

	case tcpip.ReceiveQueueSizeOption:
		return e.readyReceiveSize()

	case tcpip.IPv4TTLOption:
		e.LockUser()
		v := int(e.ipv4TTL)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6HopLimitOption:
		e.LockUser()
		v := int(e.ipv6HopLimit)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPSynCountOption:
		e.LockUser()
		v := int(e.maxSynRetries)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPWindowClampOption:
		e.LockUser()
		v := int(e.windowClamp)
		e.UnlockUser()
		return v, nil

	case tcpip.MulticastTTLOption:
		return 1, nil

	default:
		return -1, &tcpip.ErrUnknownProtocolOption{}
	}
}

// getTCPInfo returns a point-in-time snapshot of the connection state used
// to service the TCP_INFO socket option.
func (e *Endpoint) getTCPInfo() tcpip.TCPInfoOption {
	info := tcpip.TCPInfoOption{}
	e.LockUser()
	// Internal (netstack-only) states are reported to userspace as CLOSED.
	if state := e.EndpointState(); state.internal() {
		info.State = tcpip.EndpointState(StateClose)
	} else {
		info.State = tcpip.EndpointState(state)
	}
	snd := e.snd
	if snd != nil {
		// We do not calculate RTT before sending the data packets. If
		// the connection did not send and receive data, then RTT will
		// be zero.
		snd.rtt.Lock()
		info.RTT = snd.rtt.TCPRTTState.SRTT
		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
		snd.rtt.Unlock()

		info.RTO = snd.RTO
		info.CcState = snd.state
		info.SndSsthresh = uint32(snd.Ssthresh)
		info.SndCwnd = uint32(snd.SndCwnd)
		info.ReorderSeen = snd.rc.Reord
	}
	e.UnlockUser()
	return info
}

// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
2160 func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 2161 switch o := opt.(type) { 2162 case *tcpip.TCPInfoOption: 2163 *o = e.getTCPInfo() 2164 2165 case *tcpip.KeepaliveIdleOption: 2166 e.keepalive.Lock() 2167 *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) 2168 e.keepalive.Unlock() 2169 2170 case *tcpip.KeepaliveIntervalOption: 2171 e.keepalive.Lock() 2172 *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) 2173 e.keepalive.Unlock() 2174 2175 case *tcpip.TCPUserTimeoutOption: 2176 e.LockUser() 2177 *o = tcpip.TCPUserTimeoutOption(e.userTimeout) 2178 e.UnlockUser() 2179 2180 case *tcpip.CongestionControlOption: 2181 e.LockUser() 2182 *o = e.cc 2183 e.UnlockUser() 2184 2185 case *tcpip.TCPLingerTimeoutOption: 2186 e.LockUser() 2187 *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) 2188 e.UnlockUser() 2189 2190 case *tcpip.TCPDeferAcceptOption: 2191 e.LockUser() 2192 *o = tcpip.TCPDeferAcceptOption(e.deferAccept) 2193 e.UnlockUser() 2194 2195 case *tcpip.OriginalDestinationOption: 2196 e.LockUser() 2197 ipt := e.stack.IPTables() 2198 addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber) 2199 e.UnlockUser() 2200 if err != nil { 2201 return err 2202 } 2203 *o = tcpip.OriginalDestinationOption{ 2204 Addr: addr, 2205 Port: port, 2206 } 2207 2208 default: 2209 return &tcpip.ErrUnknownProtocolOption{} 2210 } 2211 return nil 2212 } 2213 2214 // checkV4MappedLocked determines the effective network protocol and converts 2215 // addr to its canonical form. 2216 // +checklocks:e.mu 2217 func (e *Endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { 2218 unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) 2219 if err != nil { 2220 return tcpip.FullAddress{}, 0, err 2221 } 2222 return unwrapped, netProto, nil 2223 } 2224 2225 // Disconnect implements tcpip.Endpoint.Disconnect. 
func (*Endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	err := e.connect(addr, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// registerEndpoint registers the endpoint with the provided address.
//
// +checklocks:e.mu
func (e *Endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error {
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// Determine whether a port held by a TIME-WAIT endpoint may be
		// reclaimed for this connection.
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		if _, err := e.stack.PickEphemeralPort(e.stack.SecureRNG(), func(p uint16) (bool, tcpip.Error) {
			// Reject a port that would make local == remote address/port.
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID)
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*Endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.transitionToStateCloseLocked()
				tcpEP.drainClosingSegmentQueue()
				tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed: release the reservation taken above
				// before deciding whether to retry with another port.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}
	return nil
}

// connect connects the endpoint to its peer.
// +checklocks:e.mu
func (e *Endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error {
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrHostUnreachable{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	oldState := e.EndpointState()
	e.setEndpointState(StateConnecting)
	if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil {
		// Roll back the state transition on registration failure.
		e.setEndpointState(oldState)
		if _, ok := err.(*tcpip.ErrPortInUse); ok {
			return &tcpip.ErrBadLocalAddress{}
		}
		return err
	}

	e.isRegistered = true
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto}
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		// Re-stamp queued segments with the (possibly updated) endpoint ID.
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.ep.AssertLockHeld(e)
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
		return &tcpip.ErrConnectStarted{}
	}

	// Start a new handshake.
	h := e.newHandshake()
	e.setEndpointState(StateSynSent)
	h.start()
	e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()

	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*Endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *Endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState().connecting() {
		// When calling shutdown(2) on a connecting socket, the endpoint must
		// enter the error state. But this logic cannot belong to the shutdownLocked
		// method because that method is called during a close(2) (and closing a
		// connecting socket is not an error).
		e.handshakeFailed(&tcpip.ErrConnectionReset{})
		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		return nil
	}

	return e.shutdownLocked(flags)
}

// shutdownLocked performs the shutdown of the read and/or write side of an
// already-connected or listening endpoint.
// +checklocks:e.mu
func (e *Endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	// Shutdown flags accumulate across calls.
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that maybe waiting for the stream to become
			// readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, write end won't close and
				// we end up with a half-closed connection
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that maybe waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *Endpoint) Listen(backlog int) tcpip.Error {
	if err := e.listen(backlog); err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
		return err
	}
	return nil
}

// listen transitions the endpoint to the listen state, binding it first if
// necessary and registering it with the demuxer.
func (e *Endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	// An already-listening endpoint only has its backlog adjusted.
	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
		}

		e.shutdownFlags = 0
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		// Roll back the premature state transition.
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
	}
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// addr if not-nil will contain the peer address of the returned endpoint.
func (e *Endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueMu.Lock()
	rcvClosed := e.RcvClosed
	e.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *Endpoint
	e.acceptMu.Lock()
	if element := e.acceptQueue.endpoints.Front(); element != nil {
		n = e.acceptQueue.endpoints.Remove(element).(*Endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
func (e *Endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

// bindLocked reserves a local port (and optionally address) for the endpoint
// and moves it to the bound state.
// +checklocks:e.mu
func (e *Endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *Endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.LocalAddress,
		Port: e.TransportEndpointInfo.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

// getRemoteAddress returns the peer's address without any state checks.
func (e *Endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

// HandlePacket implements stack.TransportEndpoint.HandlePacket.
func (*Endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher which then can either deliver using the
	// worker go routine or directly do the invoke the tcp processing inline
	// based on the state of the endpoint.
}

// enqueueSegment queues s for processing, reporting whether the segment was
// accepted (false means it was dropped because the queue was full).
func (e *Endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

// onICMPError records an ICMP-derived transport error on the endpoint,
// optionally queues it for MSG_ERRQUEUE-style delivery, and aborts a
// still-connecting endpoint.
func (e *Endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	var recvErr bool
	switch pkt.NetworkProtocolNumber {
	case header.IPv4ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv4RecvError()
	case header.IPv6ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv6RecvError()
	default:
		panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber))
	}

	if recvErr {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	// A connecting endpoint is torn down on any ICMP error.
	if e.EndpointState().connecting() {
		e.mu.Lock()
		if lEP := e.h.listenEP; lEP != nil {
			// Remove from listening endpoints pending list.
			lEP.acceptMu.Lock()
			delete(lEP.acceptQueue.pendingEndpoints, e)
			lEP.acceptMu.Unlock()
			lEP.stats.FailedConnectionAttempts.Increment()
		}
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.cleanupLocked()
		e.hardError = err
		e.setEndpointState(StateError)
		e.mu.Unlock()
		e.drainClosingSegmentQueue()
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
	}
}

// HandleError implements stack.TransportEndpoint.
func (e *Endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// handlePacketTooBig lowers the cached send MTU when the reported MTU
	// is smaller and, if it changed, asks the sender to recompute its max
	// payload size.
	handlePacketTooBig := func(mtu uint32) {
		e.sndQueueInfo.sndQueueMu.Lock()
		update := false
		if v := int(mtu); v < e.sndQueueInfo.SndMTU {
			e.sndQueueInfo.SndMTU = v
			update = true
		}
		newMTU := e.sndQueueInfo.SndMTU
		e.sndQueueInfo.sndQueueMu.Unlock()
		if update {
			// Take e.mu only after dropping sndQueueMu.
			e.mu.Lock()
			defer e.mu.Unlock()
			if e.snd != nil {
				e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu
			}
		}
	}

	// TODO(gvisor.dev/issues/5270): Handle all transport errors.
	switch transErr.Kind() {
	case stack.PacketTooBigTransportError:
		handlePacketTooBig(transErr.Info())
	case stack.DestinationHostUnreachableTransportError:
		e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt)
	case stack.DestinationNetworkUnreachableTransportError:
		e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt)
	case stack.DestinationPortUnreachableTransportError:
		e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt)
	case stack.DestinationProtoUnreachableTransportError:
		e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt)
	case stack.SourceRouteFailedTransportError:
		e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt)
	case stack.SourceHostIsolatedTransportError:
		e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt)
	case stack.DestinationHostDownTransportError:
		e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt)
	}
}

// updateSndBufferUsage is called by the protocol goroutine when room opens up
// in the send buffer. The number of newly available bytes is v.
func (e *Endpoint) updateSndBufferUsage(v int) {
	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	// Remember whether the buffer had crossed the half-full mark before
	// releasing v bytes.
	notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1
	e.sndQueueInfo.SndBufUsed -= v

	// Get the new send buffer size with auto tuning, but do not set it
	// unless we decide to notify the writers.
	newSndBufSz := e.computeTCPSendBufferSize()

	// We only notify when there is half the sendBufferSize available after
	// a full buffer event occurs. This ensures that we don't wake up
	// writers to queue just 1-2 segments and go back to sleep.
	notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		// Set the new send buffer size calculated from auto tuning.
		e.ops.SetSendBufferSize(newSndBufSz, false /* notify */)
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
//
// +checklocks:e.mu
func (e *Endpoint) readyToRead(s *segment) {
	e.rcvQueueMu.Lock()
	if s != nil {
		// The queue holds a reference to the segment until it is read.
		e.RcvBufUsed += s.payloadSize()
		s.IncRef()
		e.rcvQueue.PushBack(s)
	} else {
		e.RcvClosed = true
	}
	e.rcvQueueMu.Unlock()
	// Wake readers after dropping rcvQueueMu.
	e.waiterQueue.Notify(waiter.ReadableEvents)
}

// receiveBufferAvailableLocked calculates how many bytes are still available
// in the receive buffer.
// +checklocks:e.rcvQueueMu
func (e *Endpoint) receiveBufferAvailableLocked(rcvBufSize int) int {
	// We may use more bytes than the buffer size when the receive buffer
	// shrinks.
	memUsed := e.receiveMemUsed()
	if memUsed >= rcvBufSize {
		return 0
	}

	return rcvBufSize - memUsed
}

// receiveBufferAvailable calculates how many bytes are still available in the
// receive buffer based on the actual memory used by all segments held in
// receive buffer/pending and segment queue.
func (e *Endpoint) receiveBufferAvailable() int {
	e.rcvQueueMu.Lock()
	available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return available
}

// receiveBufferUsed returns the amount of in-use receive buffer.
func (e *Endpoint) receiveBufferUsed() int {
	e.rcvQueueMu.Lock()
	used := e.RcvBufUsed
	e.rcvQueueMu.Unlock()
	return used
}

// receiveMemUsed returns the total memory in use by segments held by this
// endpoint.
func (e *Endpoint) receiveMemUsed() int {
	return int(e.rcvMemUsed.Load())
}

// updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
func (e *Endpoint) updateReceiveMemUsed(delta int) {
	e.rcvMemUsed.Add(int32(delta))
}

// maxReceiveBufferSize returns the stack wide maximum receive buffer size for
// an endpoint.
func (e *Endpoint) maxReceiveBufferSize() int {
	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
		// As a fallback return the hardcoded max buffer size.
		return MaxBufferSize
	}
	return rs.Max
}

// connDirectionState returns the close state of send and receive part of the
// endpoint.
func (e *Endpoint) connDirectionState() connDirectionState {
	return connDirectionState(e.connectionDirectionState.Load())
}

// updateConnDirectionState ORs state into the close state of the send and
// receive parts of the endpoint and returns the previous value.
//
// NOTE(review): the Load (via connDirectionState) followed by Swap is not a
// single atomic read-modify-write; this assumes concurrent updaters are
// otherwise serialized — confirm against callers.
func (e *Endpoint) updateConnDirectionState(state connDirectionState) connDirectionState {
	return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state)))
}

// rcvWndScaleForHandshake computes the receive window scale to offer to the
// peer when window scaling is enabled (true by default). If auto-tuning is
// disabled then the window scaling factor is based on the size of the
// receiveBuffer otherwise we use the max permissible receive buffer size to
// compute the scale.
func (e *Endpoint) rcvWndScaleForHandshake() int {
	bufSizeForScale := e.ops.GetReceiveBufferSize()

	e.rcvQueueMu.Lock()
	autoTuningDisabled := e.RcvAutoParams.Disabled
	e.rcvQueueMu.Unlock()
	if autoTuningDisabled {
		return FindWndScale(seqnum.Size(bufSizeForScale))
	}

	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
}

// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *Endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
	if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
		e.setRecentTimestamp(tsVal)
	}
}

// maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
// the SYN options indicate that timestamp option was negotiated. It also
// initializes the recentTS with the value provided in synOpts.TSval.
func (e *Endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) {
	if synOpts.TS {
		e.SendTSOk = true
		e.setRecentTimestamp(synOpts.TSVal)
	}
}

// tsVal returns the timestamp value (TSval) for the given monotonic time,
// as computed by the endpoint's timestamp offset.
func (e *Endpoint) tsVal(now tcpip.MonotonicTime) uint32 {
	return e.TSOffset.TSVal(now)
}

// tsValNow returns the TSval for the current monotonic time.
func (e *Endpoint) tsValNow() uint32 {
	return e.tsVal(e.stack.Clock().NowMonotonic())
}

// elapsed returns the duration derived from the echoed timestamp tsEcr and
// now, as computed by the endpoint's timestamp offset state.
func (e *Endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration {
	return e.TSOffset.Elapsed(now, tsEcr)
}

// maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
// if the SYN options indicate that the SACK option was negotiated and the TCP
// stack is configured to enable TCP SACK option.
func (e *Endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) {
	var v tcpip.TCPSACKEnabled
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
		// Stack doesn't support SACK. So just return.
		return
	}
	if bool(v) && synOpts.SACKPermitted {
		e.SACKPermitted = true
		// Best-effort read of the recovery option; error intentionally
		// ignored.
		e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)
	}
}

// maxOptionSize returns the maximum size of TCP options.
func (e *Endpoint) maxOptionSize() (size int) {
	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
	options := e.makeOptions(maxSackBlocks[:])
	size = len(options)
	putOptions(options)

	return size
}

// completeStateLocked copies the endpoint's full state into s. This is used
// before invoking the probe.
//
// +checklocks:e.mu
func (e *Endpoint) completeStateLocked(s *stack.TCPEndpointState) {
	s.TCPEndpointStateInner = e.TCPEndpointStateInner
	s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID)
	s.SegTime = e.stack.Clock().NowMonotonic()
	s.Receiver = e.rcv.TCPReceiverState
	s.Sender = e.snd.TCPSenderState

	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	e.sndQueueInfo.CloneState(&s.SndBufState)
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueMu.Lock()
	s.RcvBufState = e.TCPRcvBufState
	e.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// RTT state is guarded by its own lock.
	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	s.Sender.RetransmitTS = e.snd.retransmitTS
	s.Sender.SpuriousRecovery = e.snd.spuriousRecovery
}

// initHostGSO configures the endpoint's GSO fields for host GSO based on the
// route's network protocol.
func (e *Endpoint) initHostGSO() {
	switch e.route.NetProto() {
	case header.IPv4ProtocolNumber:
		e.gso.Type = stack.GSOTCPv4
		e.gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		e.gso.Type = stack.GSOTCPv6
		e.gso.L3HdrLen = header.IPv6MinimumSize
	default:
		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
	}
	e.gso.NeedsCsum = true
	e.gso.CsumOffset = header.TCPChecksumOffset
	e.gso.MaxSize = e.route.GSOMaxSize()
}

// initGSO selects a GSO mode for the endpoint: host GSO when the route has
// that capability, otherwise gVisor GSO if available, otherwise none.
func (e *Endpoint) initGSO() {
	if e.route.HasHostGSOCapability() {
		e.initHostGSO()
	} else if e.route.HasGVisorGSOCapability() {
		e.gso = stack.GSO{
			MaxSize:   e.route.GSOMaxSize(),
			Type:      stack.GSOGvisor,
			NeedsCsum: false,
		}
	}
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *Endpoint) State() uint32 {
	return uint32(e.EndpointState())
}

// Info returns a copy of the endpoint info.
func (e *Endpoint) Info() tcpip.EndpointInfo {
	e.LockUser()
	// Make a copy of the endpoint info.
	ret := e.TransportEndpointInfo
	e.UnlockUser()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *Endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}

// Wait implements stack.TransportEndpoint.Wait.
func (e *Endpoint) Wait() {
	// Register for EventHUp before checking state so a transition to
	// Close/Error between the check and the wait is not missed.
	waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp)
	e.waiterQueue.EventRegister(&waitEntry)
	defer e.waiterQueue.EventUnregister(&waitEntry)
	switch e.EndpointState() {
	case StateClose, StateError:
		return
	}
	<-notifyCh
}

// SocketOptions implements tcpip.Endpoint.SocketOptions.
func (e *Endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}

// GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
func GetTCPSendBufferLimits(sh tcpip.StackHandler) tcpip.SendBufferSizeOption {
	// This type assertion is safe because only the TCP stack calls this
	// function.
	ss := sh.(*stack.Stack).TCPSendBufferLimits()
	return tcpip.SendBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
func (e *Endpoint) allowOutOfWindowAck() bool {
	now := e.stack.Clock().NowMonotonic()

	// Zero time means no out-of-window ACK has been sent yet, so the rate
	// limit does not apply.
	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
		var limit stack.TCPInvalidRateLimitOption
		if err := e.stack.Option(&limit); err != nil {
			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
		}
		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
			return false
		}
	}

	// NOTE(review): mutates e.lastOutOfWindowAckTime without a visible lock;
	// presumably callers hold e.mu — confirm.
	e.lastOutOfWindowAckTime = now
	return true
}

// GetTCPReceiveBufferLimits is used to get receive buffer size limits for TCP.
func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
	var ss tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.ReceiveBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// computeTCPSendBufferSize implements auto tuning of send buffer size and
// returns the new send buffer size.
func (e *Endpoint) computeTCPSendBufferSize() int64 {
	curSndBufSz := int64(e.getSendBufferSize())

	// Auto tuning is disabled when the user explicitly sets the send
	// buffer size with SO_SNDBUF option.
	if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 {
		return curSndBufSz
	}

	const packetOverheadFactor = 2
	curMSS := e.snd.MaxPayloadSize
	numSeg := InitialCwnd
	if numSeg < e.snd.SndCwnd {
		numSeg = e.snd.SndCwnd
	}

	// SndCwnd indicates the number of segments that can be sent. This means
	// that the sender can send up to #SndCwnd segments and the send buffer
	// size should be set to SndCwnd*MSS to accommodate sending of all the
	// segments.
	newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor)
	if newSndBufSz < curSndBufSz {
		// Auto tuning never shrinks the buffer below its current size.
		return curSndBufSz
	}
	// Clamp to the stack-wide maximum.
	if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz {
		newSndBufSz = int64(ss.Max)
	}

	return newSndBufSz
}

// GetAcceptConn implements tcpip.SocketOptionsHandler.
func (e *Endpoint) GetAcceptConn() bool {
	return EndpointState(e.State()) == StateListen
}