github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/tcpip/transport/tcp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "container/heap" 19 "encoding/binary" 20 "fmt" 21 "io" 22 "math" 23 "runtime" 24 "strings" 25 "time" 26 27 "github.com/MerlinKodo/gvisor/pkg/atomicbitops" 28 "github.com/MerlinKodo/gvisor/pkg/buffer" 29 "github.com/MerlinKodo/gvisor/pkg/sleep" 30 "github.com/MerlinKodo/gvisor/pkg/sync" 31 "github.com/MerlinKodo/gvisor/pkg/tcpip" 32 "github.com/MerlinKodo/gvisor/pkg/tcpip/hash/jenkins" 33 "github.com/MerlinKodo/gvisor/pkg/tcpip/header" 34 "github.com/MerlinKodo/gvisor/pkg/tcpip/ports" 35 "github.com/MerlinKodo/gvisor/pkg/tcpip/seqnum" 36 "github.com/MerlinKodo/gvisor/pkg/tcpip/stack" 37 "github.com/MerlinKodo/gvisor/pkg/waiter" 38 ) 39 40 // EndpointState represents the state of a TCP endpoint. 41 type EndpointState tcpip.EndpointState 42 43 // Endpoint states. Note that are represented in a netstack-specific manner and 44 // may not be meaningful externally. Specifically, they need to be translated to 45 // Linux's representation for these states if presented to userspace. 
46 const ( 47 _ EndpointState = iota 48 // TCP protocol states in sync with the definitions in 49 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 50 StateEstablished 51 StateSynSent 52 StateSynRecv 53 StateFinWait1 54 StateFinWait2 55 StateTimeWait 56 StateClose 57 StateCloseWait 58 StateLastAck 59 StateListen 60 StateClosing 61 62 // Endpoint states internal to netstack. 63 StateInitial 64 StateBound 65 StateConnecting // Connect() called, but the initial SYN hasn't been sent. 66 StateError 67 ) 68 69 const ( 70 // rcvAdvWndScale is used to split the available socket buffer into 71 // application buffer and the window to be advertised to the peer. This is 72 // currently hard coded to split the available space equally. 73 rcvAdvWndScale = 1 74 75 // SegOverheadFactor is used to multiply the value provided by the 76 // user on a SetSockOpt for setting the socket send/receive buffer sizes. 77 SegOverheadFactor = 2 78 ) 79 80 type connDirectionState uint32 81 82 // Connection direction states used for directionState checks in endpoint struct 83 // to detect half-closed connection and deliver POLLRDHUP 84 const ( 85 connDirectionStateOpen connDirectionState = 0 86 connDirectionStateRcvClosed connDirectionState = 1 87 connDirectionStateSndClosed connDirectionState = 2 88 connDirectionStateAll connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed 89 ) 90 91 // connected returns true when s is one of the states representing an 92 // endpoint connected to a peer. 93 func (s EndpointState) connected() bool { 94 switch s { 95 case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: 96 return true 97 default: 98 return false 99 } 100 } 101 102 // connecting returns true when s is one of the states representing a 103 // connection in progress, but not yet fully established. 
104 func (s EndpointState) connecting() bool { 105 switch s { 106 case StateConnecting, StateSynSent, StateSynRecv: 107 return true 108 default: 109 return false 110 } 111 } 112 113 // internal returns true when the state is netstack internal. 114 func (s EndpointState) internal() bool { 115 switch s { 116 case StateInitial, StateBound, StateConnecting, StateError: 117 return true 118 default: 119 return false 120 } 121 } 122 123 // handshake returns true when s is one of the states representing an endpoint 124 // in the middle of a TCP handshake. 125 func (s EndpointState) handshake() bool { 126 switch s { 127 case StateSynSent, StateSynRecv: 128 return true 129 default: 130 return false 131 } 132 } 133 134 // closed returns true when s is one of the states an endpoint transitions to 135 // when closed or when it encounters an error. This is distinct from a newly 136 // initialized endpoint that was never connected. 137 func (s EndpointState) closed() bool { 138 switch s { 139 case StateClose, StateError: 140 return true 141 default: 142 return false 143 } 144 } 145 146 // String implements fmt.Stringer.String. 147 func (s EndpointState) String() string { 148 switch s { 149 case StateInitial: 150 return "INITIAL" 151 case StateBound: 152 return "BOUND" 153 case StateConnecting: 154 return "CONNECTING" 155 case StateError: 156 return "ERROR" 157 case StateEstablished: 158 return "ESTABLISHED" 159 case StateSynSent: 160 return "SYN-SENT" 161 case StateSynRecv: 162 return "SYN-RCVD" 163 case StateFinWait1: 164 return "FIN-WAIT1" 165 case StateFinWait2: 166 return "FIN-WAIT2" 167 case StateTimeWait: 168 return "TIME-WAIT" 169 case StateClose: 170 return "CLOSED" 171 case StateCloseWait: 172 return "CLOSE-WAIT" 173 case StateLastAck: 174 return "LAST-ACK" 175 case StateListen: 176 return "LISTEN" 177 case StateClosing: 178 return "CLOSING" 179 default: 180 panic("unreachable") 181 } 182 } 183 184 // SACKInfo holds TCP SACK related information for a given endpoint. 
185 // 186 // +stateify savable 187 type SACKInfo struct { 188 // Blocks is the maximum number of SACK blocks we track 189 // per endpoint. 190 Blocks [MaxSACKBlocks]header.SACKBlock 191 192 // NumBlocks is the number of valid SACK blocks stored in the 193 // blocks array above. 194 NumBlocks int 195 } 196 197 // ReceiveErrors collect segment receive errors within transport layer. 198 // 199 // +stateify savable 200 type ReceiveErrors struct { 201 tcpip.ReceiveErrors 202 203 // SegmentQueueDropped is the number of segments dropped due to 204 // a full segment queue. 205 SegmentQueueDropped tcpip.StatCounter 206 207 // ChecksumErrors is the number of segments dropped due to bad checksums. 208 ChecksumErrors tcpip.StatCounter 209 210 // ListenOverflowSynDrop is the number of times the listen queue overflowed 211 // and a SYN was dropped. 212 ListenOverflowSynDrop tcpip.StatCounter 213 214 // ListenOverflowAckDrop is the number of times the final ACK 215 // in the handshake was dropped due to overflow. 216 ListenOverflowAckDrop tcpip.StatCounter 217 218 // ZeroRcvWindowState is the number of times we advertised 219 // a zero receive window when rcvQueue is full. 220 ZeroRcvWindowState tcpip.StatCounter 221 222 // WantZeroWindow is the number of times we wanted to advertise a 223 // zero receive window but couldn't because it would have caused 224 // the receive window's right edge to shrink. 225 WantZeroRcvWindow tcpip.StatCounter 226 } 227 228 // SendErrors collect segment send errors within the transport layer. 229 // 230 // +stateify savable 231 type SendErrors struct { 232 tcpip.SendErrors 233 234 // SegmentSendToNetworkFailed is the number of TCP segments failed to be sent 235 // to the network endpoint. 236 SegmentSendToNetworkFailed tcpip.StatCounter 237 238 // SynSendToNetworkFailed is the number of TCP SYNs failed to be sent 239 // to the network endpoint. 
240 SynSendToNetworkFailed tcpip.StatCounter 241 242 // Retransmits is the number of TCP segments retransmitted. 243 Retransmits tcpip.StatCounter 244 245 // FastRetransmit is the number of segments retransmitted in fast 246 // recovery. 247 FastRetransmit tcpip.StatCounter 248 249 // Timeouts is the number of times the RTO expired. 250 Timeouts tcpip.StatCounter 251 } 252 253 // Stats holds statistics about the endpoint. 254 // 255 // +stateify savable 256 type Stats struct { 257 // SegmentsReceived is the number of TCP segments received that 258 // the transport layer successfully parsed. 259 SegmentsReceived tcpip.StatCounter 260 261 // SegmentsSent is the number of TCP segments sent. 262 SegmentsSent tcpip.StatCounter 263 264 // FailedConnectionAttempts is the number of times we saw Connect and 265 // Accept errors. 266 FailedConnectionAttempts tcpip.StatCounter 267 268 // ReceiveErrors collects segment receive errors within the 269 // transport layer. 270 ReceiveErrors ReceiveErrors 271 272 // ReadErrors collects segment read errors from an endpoint read call. 273 ReadErrors tcpip.ReadErrors 274 275 // SendErrors collects segment send errors within the transport layer. 276 SendErrors SendErrors 277 278 // WriteErrors collects segment write errors from an endpoint write call. 279 WriteErrors tcpip.WriteErrors 280 } 281 282 // IsEndpointStats is an empty method to implement the tcpip.EndpointStats 283 // marker interface. 284 func (*Stats) IsEndpointStats() {} 285 286 // sndQueueInfo implements a send queue. 287 // 288 // +stateify savable 289 type sndQueueInfo struct { 290 sndQueueMu sync.Mutex `state:"nosave"` 291 stack.TCPSndBufState 292 293 // sndWaker is used to signal the protocol goroutine when there may be 294 // segments that need to be sent. 295 sndWaker sleep.Waker `state:"manual"` 296 } 297 298 // CloneState clones sq into other. 
It is not thread safe 299 func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) { 300 other.SndBufSize = sq.SndBufSize 301 other.SndBufUsed = sq.SndBufUsed 302 other.SndClosed = sq.SndClosed 303 other.PacketTooBigCount = sq.PacketTooBigCount 304 other.SndMTU = sq.SndMTU 305 other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad()) 306 } 307 308 // endpoint represents a TCP endpoint. This struct serves as the interface 309 // between users of the endpoint and the protocol implementation; it is legal to 310 // have concurrent goroutines make calls into the endpoint, they are properly 311 // synchronized. The protocol implementation, however, runs in a single 312 // goroutine. 313 // 314 // Each endpoint has a few mutexes: 315 // 316 // e.mu -> Primary mutex for an endpoint must be held for all operations except 317 // in e.Readiness where acquiring it will result in a deadlock in epoll 318 // implementation. 319 // 320 // The following three mutexes can be acquired independent of e.mu but if 321 // acquired with e.mu then e.mu must be acquired first. 322 // 323 // e.acceptMu -> Protects e.acceptQueue. 324 // e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue 325 // itself. 326 // e.sndQueueMu -> Protects the e.sndQueue and associated fields. 327 // e.lastErrorMu -> Protects the lastError field. 328 // 329 // LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different 330 // based on the context in which the lock is acquired. In the syscall context 331 // e.LockUser/e.UnlockUser should be used and when doing background processing 332 // e.mu.Lock/e.mu.Unlock should be used. The distinction is described below 333 // in brief. 334 // 335 // The reason for this locking behaviour is to avoid wakeups to handle packets. 
336 // In cases where the endpoint is already locked the background processor can 337 // queue the packet up and go its merry way and the lock owner will eventually 338 // process the backlog when releasing the lock. Similarly when acquiring the 339 // lock from say a syscall goroutine we can implement a bit of spinning if we 340 // know that the lock is not held by another syscall goroutine. Background 341 // processors should never hold the lock for long and we can avoid an expensive 342 // sleep/wakeup by spinning for a shortwhile. 343 // 344 // For more details please see the detailed documentation on 345 // e.LockUser/e.UnlockUser methods. 346 // 347 // +stateify savable 348 type endpoint struct { 349 stack.TCPEndpointStateInner 350 stack.TransportEndpointInfo 351 tcpip.DefaultSocketOptionsHandler 352 353 // endpointEntry is used to queue endpoints for processing to the 354 // a given tcp processor goroutine. 355 // 356 // Precondition: epQueue.mu must be held to read/write this field.. 357 endpointEntry `state:"nosave"` 358 359 // pendingProcessingMu protects pendingProcessing. 360 pendingProcessingMu sync.Mutex `state:"nosave"` 361 362 // pendingProcessing is true if this endpoint is queued for processing 363 // to a TCP processor. 364 // +checklocks:pendingProcessingMu 365 pendingProcessing bool `state:"nosave"` 366 367 // The following fields are initialized at creation time and do not 368 // change throughout the lifetime of the endpoint. 369 stack *stack.Stack `state:"manual"` 370 protocol *protocol `state:"manual"` 371 waiterQueue *waiter.Queue `state:"wait"` 372 uniqueID uint64 373 374 // hardError is meaningful only when state is stateError. It stores the 375 // error to be returned when read/write syscalls are called and the 376 // endpoint is in this state. hardError is protected by endpoint mu. 
377 hardError tcpip.Error 378 379 // lastError represents the last error that the endpoint reported; 380 // access to it is protected by the following mutex. 381 lastErrorMu sync.Mutex `state:"nosave"` 382 lastError tcpip.Error 383 384 rcvQueueMu sync.Mutex `state:"nosave"` 385 386 // +checklocks:rcvQueueMu 387 stack.TCPRcvBufState 388 389 // rcvMemUsed tracks the total amount of memory in use by received segments 390 // held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to 391 // compute the window and the actual available buffer space. This is distinct 392 // from rcvBufUsed above which is the actual number of payload bytes held in 393 // the buffer not including any segment overheads. 394 rcvMemUsed atomicbitops.Int32 395 396 // mu protects all endpoint fields unless documented otherwise. mu must 397 // be acquired before interacting with the endpoint fields. 398 // 399 // During handshake, mu is locked by the protocol listen goroutine and 400 // released by the handshake completion goroutine. 401 mu sync.CrossGoroutineMutex `state:"nosave"` 402 ownedByUser atomicbitops.Uint32 403 404 // rcvQueue is the queue for ready-for-delivery segments. 405 // 406 // +checklocks:mu 407 rcvQueue segmentList `state:"wait"` 408 409 // state must be read/set using the EndpointState()/setEndpointState() 410 // methods. 411 state atomicbitops.Uint32 `state:".(EndpointState)"` 412 413 // connectionDirectionState holds current state of send and receive, 414 // accessed atomically 415 connectionDirectionState atomicbitops.Uint32 416 417 // origEndpointState is only used during a restore phase to save the 418 // endpoint state at restore time as the socket is moved to it's correct 419 // state. 
420 origEndpointState uint32 `state:"nosave"` 421 422 isPortReserved bool `state:"manual"` 423 isRegistered bool `state:"manual"` 424 boundNICID tcpip.NICID 425 route *stack.Route `state:"manual"` 426 ipv4TTL uint8 427 ipv6HopLimit int16 428 isConnectNotified bool 429 430 // h stores a reference to the current handshake state if the endpoint is in 431 // the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep. 432 // nil otherwise. 433 // +checklocks:mu 434 h *handshake 435 436 // portFlags stores the current values of port related flags. 437 portFlags ports.Flags 438 439 // Values used to reserve a port or register a transport endpoint 440 // (which ever happens first). 441 boundBindToDevice tcpip.NICID 442 boundPortFlags ports.Flags 443 boundDest tcpip.FullAddress 444 445 // effectiveNetProtos contains the network protocols actually in use. In 446 // most cases it will only contain "netProto", but in cases like IPv6 447 // endpoints with v6only set to false, this could include multiple 448 // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., 449 // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped 450 // address). 451 effectiveNetProtos []tcpip.NetworkProtocolNumber 452 453 // recentTSTime is the unix time when we last updated 454 // TCPEndpointStateInner.RecentTS. 455 recentTSTime tcpip.MonotonicTime 456 457 // shutdownFlags represent the current shutdown state of the endpoint. 458 shutdownFlags tcpip.ShutdownFlags 459 460 // tcpRecovery is the loss recovery algorithm used by TCP. 461 tcpRecovery tcpip.TCPRecovery 462 463 // sack holds TCP SACK related information for this endpoint. 464 sack SACKInfo 465 466 // delay enables Nagle's algorithm. 467 // 468 // delay is a boolean (0 is false) and must be accessed atomically. 469 delay uint32 470 471 // scoreboard holds TCP SACK Scoreboard information for this endpoint. 
472 scoreboard *SACKScoreboard 473 474 // segmentQueue is used to hand received segments to the protocol 475 // goroutine. Segments are queued as long as the queue is not full, 476 // and dropped when it is. 477 segmentQueue segmentQueue `state:"wait"` 478 479 // userMSS if non-zero is the MSS value explicitly set by the user 480 // for this endpoint using the TCP_MAXSEG setsockopt. 481 userMSS uint16 482 483 // maxSynRetries is the maximum number of SYN retransmits that TCP should 484 // send before aborting the attempt to connect. It cannot exceed 255. 485 // 486 // NOTE: This is currently a no-op and does not change the SYN 487 // retransmissions. 488 maxSynRetries uint8 489 490 // windowClamp is used to bound the size of the advertised window to 491 // this value. 492 windowClamp uint32 493 494 // sndQueueInfo contains the implementation of the endpoint's send queue. 495 sndQueueInfo sndQueueInfo 496 497 // cc stores the name of the Congestion Control algorithm to use for 498 // this endpoint. 499 cc tcpip.CongestionControlOption 500 501 // keepalive manages TCP keepalive state. When the connection is idle 502 // (no data sent or received) for keepaliveIdle, we start sending 503 // keepalives every keepalive.interval. If we send keepalive.count 504 // without hearing a response, the connection is closed. 505 keepalive keepalive 506 507 // userTimeout if non-zero specifies a user specified timeout for 508 // a connection w/ pending data to send. A connection that has pending 509 // unacked data will be forcibily aborted if the timeout is reached 510 // without any data being acked. 511 userTimeout time.Duration 512 513 // deferAccept if non-zero specifies a user specified time during 514 // which the final ACK of a handshake will be dropped provided the 515 // ACK is a bare ACK and carries no data. If the timeout is crossed then 516 // the bare ACK is accepted and the connection is delivered to the 517 // listener. 
518 deferAccept time.Duration 519 520 // acceptMu protects accepQueue 521 acceptMu sync.Mutex `state:"nosave"` 522 523 // acceptQueue is used by a listening endpoint to send newly accepted 524 // connections to the endpoint so that they can be read by Accept() 525 // calls. 526 // 527 // +checklocks:acceptMu 528 acceptQueue acceptQueue 529 530 // The following are only used from the protocol goroutine, and 531 // therefore don't need locks to protect them. 532 rcv *receiver `state:"wait"` 533 snd *sender `state:"wait"` 534 535 // The goroutine drain completion notification channel. 536 drainDone chan struct{} `state:"nosave"` 537 538 // The goroutine undrain notification channel. This is currently used as 539 // a way to block the worker goroutines. Today nothing closes/writes 540 // this channel and this causes any goroutines waiting on this to just 541 // block. This is used during save/restore to prevent worker goroutines 542 // from mutating state as it's being saved. 543 undrain chan struct{} `state:"nosave"` 544 545 // probe if not nil is invoked on every received segment. It is passed 546 // a copy of the current state of the endpoint. 547 probe stack.TCPProbeFunc `state:"nosave"` 548 549 // The following are only used to assist the restore run to re-connect. 550 connectingAddress tcpip.Address 551 552 // amss is the advertised MSS to the peer by this endpoint. 553 amss uint16 554 555 // sendTOS represents IPv4 TOS or IPv6 TrafficClass, 556 // applied while sending packets. Defaults to 0 as on Linux. 557 sendTOS uint8 558 559 gso stack.GSO 560 561 stats Stats 562 563 // tcpLingerTimeout is the maximum amount of a time a socket 564 // a socket stays in TIME_WAIT state before being marked 565 // closed. 566 tcpLingerTimeout time.Duration 567 568 // closed indicates that the user has called closed on the 569 // endpoint and at this point the endpoint is only around 570 // to complete the TCP shutdown. 
571 closed bool 572 573 // txHash is the transport layer hash to be set on outbound packets 574 // emitted by this endpoint. 575 txHash uint32 576 577 // owner is used to get uid and gid of the packet. 578 owner tcpip.PacketOwner 579 580 // ops is used to get socket level options. 581 ops tcpip.SocketOptions 582 583 // lastOutOfWindowAckTime is the time at which the an ACK was sent in response 584 // to an out of window segment being received by this endpoint. 585 lastOutOfWindowAckTime tcpip.MonotonicTime 586 587 // finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer 588 // is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close() 589 // has been called on the socket. This timer is not started for sockets that 590 // are waiting for a peer FIN but are not closed. 591 finWait2Timer tcpip.Timer `state:"nosave"` 592 593 // timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state 594 // for tcp.DefaultTCPTimeWaitTimeout seconds. 595 timeWaitTimer tcpip.Timer `state:"nosave"` 596 597 // listenCtx is used by listening endpoints to store state used while listening for 598 // connections. Nil otherwise. 599 listenCtx *listenContext `state:"nosave"` 600 } 601 602 // UniqueID implements stack.TransportEndpoint.UniqueID. 603 func (e *endpoint) UniqueID() uint64 { 604 return e.uniqueID 605 } 606 607 // calculateAdvertisedMSS calculates the MSS to advertise. 608 // 609 // If userMSS is non-zero and is not greater than the maximum possible MSS for 610 // r, it will be used; otherwise, the maximum possible MSS will be used. 611 func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { 612 // The maximum possible MSS is dependent on the route. 613 // TODO(b/143359391): Respect TCP Min and Max size. 
614 maxMSS := uint16(r.MTU() - header.TCPMinimumSize) 615 616 if userMSS != 0 && userMSS < maxMSS { 617 return userMSS 618 } 619 620 return maxMSS 621 } 622 623 // isOwnedByUser() returns true if the endpoint lock is currently 624 // held by a user(syscall) goroutine. 625 func (e *endpoint) isOwnedByUser() bool { 626 return e.ownedByUser.Load() == 1 627 } 628 629 // LockUser tries to lock e.mu and if it fails it will check if the lock is held 630 // by another syscall goroutine. If yes, then it will goto sleep waiting for the 631 // lock to be released, if not then it will spin till it acquires the lock or 632 // another syscall goroutine acquires it in which case it will goto sleep as 633 // described above. 634 // 635 // The assumption behind spinning here being that background packet processing 636 // should not be holding the lock for long and spinning reduces latency as we 637 // avoid an expensive sleep/wakeup of the syscall goroutine). 638 // +checklocksacquire:e.mu 639 func (e *endpoint) LockUser() { 640 const iterations = 5 641 for i := 0; i < iterations; i++ { 642 // Try first if the sock is locked then check if it's owned 643 // by another user goroutine if not then we spin, otherwise 644 // we just go to sleep on the Lock() and wait. 645 if !e.TryLock() { 646 // If socket is owned by the user then just go to sleep 647 // as the lock could be held for a reasonably long time. 648 if e.ownedByUser.Load() == 1 { 649 e.mu.Lock() 650 e.ownedByUser.Store(1) 651 return 652 } 653 // Spin but don't yield the processor since the lower half 654 // should yield the lock soon. 655 continue 656 } 657 e.ownedByUser.Store(1) 658 return 659 } 660 661 for i := 0; i < iterations; i++ { 662 // Try first if the sock is locked then check if it's owned 663 // by another user goroutine if not then we spin, otherwise 664 // we just go to sleep on the Lock() and wait. 
665 if !e.TryLock() { 666 // If socket is owned by the user then just go to sleep 667 // as the lock could be held for a reasonably long time. 668 if e.ownedByUser.Load() == 1 { 669 e.mu.Lock() 670 e.ownedByUser.Store(1) 671 return 672 } 673 // Spin but yield the processor since the lower half 674 // should yield the lock soon. 675 runtime.Gosched() 676 continue 677 } 678 e.ownedByUser.Store(1) 679 return 680 } 681 682 // Finally just give up and wait for the Lock. 683 e.mu.Lock() 684 e.ownedByUser.Store(1) 685 } 686 687 // UnlockUser will check if there are any segments already queued for processing 688 // and wake up a processor goroutine to process them before unlocking e.mu. 689 // This is required because we when packets arrive and endpoint lock is already 690 // held then such packets are queued up to be processed. 691 // 692 // Precondition: e.LockUser() must have been called before calling e.UnlockUser() 693 // +checklocksrelease:e.mu 694 func (e *endpoint) UnlockUser() { 695 // Lock segment queue before checking so that we avoid a race where 696 // segments can be queued between the time we check if queue is empty 697 // and actually unlock the endpoint mutex. 698 e.segmentQueue.mu.Lock() 699 if e.segmentQueue.emptyLocked() { 700 if e.ownedByUser.Swap(0) != 1 { 701 panic("e.UnlockUser() called without calling e.LockUser()") 702 } 703 e.mu.Unlock() 704 e.segmentQueue.mu.Unlock() 705 return 706 } 707 e.segmentQueue.mu.Unlock() 708 709 // Since we are waking the processor goroutine here just unlock 710 // and let it process the queued segments. 
711 if e.ownedByUser.Swap(0) != 1 { 712 panic("e.UnlockUser() called without calling e.LockUser()") 713 } 714 processor := e.protocol.dispatcher.selectProcessor(e.ID) 715 e.mu.Unlock() 716 717 // Wake up the processor for this endpoint to process any queued 718 // segments after releasing the lock to avoid the case where if the 719 // processor goroutine starts running before we release the lock here 720 // then it will fail to process as TryLock() will fail. 721 processor.queueEndpoint(e) 722 return 723 } 724 725 // StopWork halts packet processing. Only to be used in tests. 726 // +checklocksacquire:e.mu 727 func (e *endpoint) StopWork() { 728 e.mu.Lock() 729 } 730 731 // ResumeWork resumes packet processing. Only to be used in tests. 732 // +checklocksrelease:e.mu 733 func (e *endpoint) ResumeWork() { 734 e.mu.Unlock() 735 } 736 737 // AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is 738 // used in places where we know that e.mu is held, but checklocks does not, 739 // which can happen when creating new locked objects. You must pass the known 740 // locked endpoint to this function and it must be the same as the caller 741 // endpoint. 742 // TODO(b/226403629): Remove this function once checklocks understands local 743 // variable locks. 744 // +checklocks:locked.mu 745 // +checklocksacquire:e.mu 746 func (e *endpoint) AssertLockHeld(locked *endpoint) { 747 if e != locked { 748 panic("AssertLockHeld failed: locked endpoint != asserting endpoint") 749 } 750 } 751 752 // TryLock is a helper that calls TryLock on the endpoint's mutex and 753 // adds the necessary checklocks annotations. 754 // TODO(b/226403629): Remove this once checklocks understands TryLock. 755 // +checklocksacquire:e.mu 756 func (e *endpoint) TryLock() bool { 757 if e.mu.TryLock() { 758 return true // +checklocksforce 759 } 760 return false // +checklocksignore 761 } 762 763 // setEndpointState updates the state of the endpoint to state atomically. 
This 764 // method is unexported as the only place we should update the state is in this 765 // package but we allow the state to be read freely without holding e.mu. 766 // 767 // +checklocks:e.mu 768 func (e *endpoint) setEndpointState(state EndpointState) { 769 oldstate := EndpointState(e.state.Swap(uint32(state))) 770 switch state { 771 case StateEstablished: 772 e.stack.Stats().TCP.CurrentEstablished.Increment() 773 e.stack.Stats().TCP.CurrentConnected.Increment() 774 case StateError: 775 fallthrough 776 case StateClose: 777 if oldstate == StateCloseWait || oldstate == StateEstablished { 778 e.stack.Stats().TCP.EstablishedResets.Increment() 779 } 780 if oldstate.connected() { 781 e.stack.Stats().TCP.CurrentConnected.Decrement() 782 } 783 fallthrough 784 default: 785 if oldstate == StateEstablished { 786 e.stack.Stats().TCP.CurrentEstablished.Decrement() 787 } 788 } 789 } 790 791 // EndpointState returns the current state of the endpoint. 792 func (e *endpoint) EndpointState() EndpointState { 793 return EndpointState(e.state.Load()) 794 } 795 796 // setRecentTimestamp sets the recentTS field to the provided value. 797 func (e *endpoint) setRecentTimestamp(recentTS uint32) { 798 e.RecentTS = recentTS 799 e.recentTSTime = e.stack.Clock().NowMonotonic() 800 } 801 802 // recentTimestamp returns the value of the recentTS field. 803 func (e *endpoint) recentTimestamp() uint32 { 804 return e.RecentTS 805 } 806 807 // TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a 808 // network.Endpoint, which also defines this function. 
809 func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 { 810 switch netProto := route.NetProto(); netProto { 811 case header.IPv4ProtocolNumber: 812 if ipv4TTL == tcpip.UseDefaultIPv4TTL { 813 return route.DefaultTTL() 814 } 815 return ipv4TTL 816 case header.IPv6ProtocolNumber: 817 if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit { 818 return route.DefaultTTL() 819 } 820 return uint8(ipv6HopLimit) 821 default: 822 panic(fmt.Sprintf("invalid protocol number = %d", netProto)) 823 } 824 } 825 826 // keepalive is a synchronization wrapper used to appease stateify. See the 827 // comment in endpoint, where it is used. 828 // 829 // +stateify savable 830 type keepalive struct { 831 sync.Mutex `state:"nosave"` 832 idle time.Duration 833 interval time.Duration 834 count int 835 unacked int 836 // should never be a zero timer if the endpoint is not closed. 837 timer timer `state:"nosave"` 838 waker sleep.Waker `state:"nosave"` 839 } 840 841 func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { 842 e := &endpoint{ 843 stack: s, 844 protocol: protocol, 845 TransportEndpointInfo: stack.TransportEndpointInfo{ 846 NetProto: netProto, 847 TransProto: header.TCPProtocolNumber, 848 }, 849 sndQueueInfo: sndQueueInfo{ 850 TCPSndBufState: stack.TCPSndBufState{ 851 SndMTU: math.MaxInt32, 852 }, 853 }, 854 waiterQueue: waiterQueue, 855 state: atomicbitops.FromUint32(uint32(StateInitial)), 856 keepalive: keepalive{ 857 idle: DefaultKeepaliveIdle, 858 interval: DefaultKeepaliveInterval, 859 count: DefaultKeepaliveCount, 860 }, 861 uniqueID: s.UniqueID(), 862 ipv4TTL: tcpip.UseDefaultIPv4TTL, 863 ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit, 864 txHash: s.Rand().Uint32(), 865 windowClamp: DefaultReceiveBufferSize, 866 maxSynRetries: DefaultSynRetries, 867 } 868 e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits) 869 e.ops.SetMulticastLoop(true) 870 
e.ops.SetQuickAck(true) 871 e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */) 872 e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */) 873 874 var ss tcpip.TCPSendBufferSizeRangeOption 875 if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { 876 e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) 877 } 878 879 var rs tcpip.TCPReceiveBufferSizeRangeOption 880 if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 881 e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) 882 } 883 884 var cs tcpip.CongestionControlOption 885 if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil { 886 e.cc = cs 887 } 888 889 var mrb tcpip.TCPModerateReceiveBufferOption 890 if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { 891 e.RcvAutoParams.Disabled = !bool(mrb) 892 } 893 894 var de tcpip.TCPDelayEnabled 895 if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { 896 e.ops.SetDelayOption(true) 897 } 898 899 var tcpLT tcpip.TCPLingerTimeoutOption 900 if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil { 901 e.tcpLingerTimeout = time.Duration(tcpLT) 902 } 903 904 var synRetries tcpip.TCPSynRetriesOption 905 if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { 906 e.maxSynRetries = uint8(synRetries) 907 } 908 909 if p := s.GetTCPProbe(); p != nil { 910 e.probe = p 911 } 912 913 e.segmentQueue.ep = e 914 915 // TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes 916 // established. 917 e.keepalive.timer.init(e.stack.Clock(), maybeFailTimerHandler(e, e.keepaliveTimerExpired)) 918 919 return e 920 } 921 922 // Readiness returns the current readiness of the endpoint. For example, if 923 // waiter.EventIn is set, the endpoint is immediately readable. 
924 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { 925 result := waiter.EventMask(0) 926 927 switch e.EndpointState() { 928 case StateInitial, StateBound: 929 // This prevents blocking of new sockets which are not 930 // connected when SO_LINGER is set. 931 result |= waiter.EventHUp 932 933 case StateConnecting, StateSynSent, StateSynRecv: 934 // Ready for nothing. 935 936 case StateClose, StateError, StateTimeWait: 937 // Ready for anything. 938 result = mask 939 940 case StateListen: 941 // Check if there's anything in the accepted queue. 942 if (mask & waiter.ReadableEvents) != 0 { 943 e.acceptMu.Lock() 944 if e.acceptQueue.endpoints.Len() != 0 { 945 result |= waiter.ReadableEvents 946 } 947 e.acceptMu.Unlock() 948 } 949 } 950 if e.EndpointState().connected() { 951 // Determine if the endpoint is writable if requested. 952 if (mask & waiter.WritableEvents) != 0 { 953 e.sndQueueInfo.sndQueueMu.Lock() 954 sndBufSize := e.getSendBufferSize() 955 if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize { 956 result |= waiter.WritableEvents 957 } 958 if e.sndQueueInfo.SndClosed { 959 e.updateConnDirectionState(connDirectionStateSndClosed) 960 } 961 e.sndQueueInfo.sndQueueMu.Unlock() 962 } 963 964 // Determine if the endpoint is readable if requested. 965 if (mask & waiter.ReadableEvents) != 0 { 966 e.rcvQueueMu.Lock() 967 if e.RcvBufUsed > 0 || e.RcvClosed { 968 result |= waiter.ReadableEvents 969 } 970 if e.RcvClosed { 971 e.updateConnDirectionState(connDirectionStateRcvClosed) 972 } 973 e.rcvQueueMu.Unlock() 974 } 975 } 976 977 // Determine whether endpoint is half-closed with rcv shutdown 978 if e.connDirectionState() == connDirectionStateRcvClosed { 979 result |= waiter.EventRdHUp 980 } 981 982 return result 983 } 984 985 // Purging pending rcv segments is only necessary on RST. 
986 func (e *endpoint) purgePendingRcvQueue() { 987 if e.rcv != nil { 988 for e.rcv.pendingRcvdSegments.Len() > 0 { 989 s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment) 990 s.DecRef() 991 } 992 } 993 } 994 995 // +checklocks:e.mu 996 func (e *endpoint) purgeReadQueue() { 997 if e.rcv != nil { 998 e.rcvQueueMu.Lock() 999 defer e.rcvQueueMu.Unlock() 1000 for { 1001 s := e.rcvQueue.Front() 1002 if s == nil { 1003 break 1004 } 1005 e.rcvQueue.Remove(s) 1006 s.DecRef() 1007 } 1008 e.RcvBufUsed = 0 1009 } 1010 } 1011 1012 // +checklocks:e.mu 1013 func (e *endpoint) purgeWriteQueue() { 1014 if e.snd != nil { 1015 e.sndQueueInfo.sndQueueMu.Lock() 1016 defer e.sndQueueInfo.sndQueueMu.Unlock() 1017 e.snd.updateWriteNext(nil) 1018 for { 1019 s := e.snd.writeList.Front() 1020 if s == nil { 1021 break 1022 } 1023 e.snd.writeList.Remove(s) 1024 s.DecRef() 1025 } 1026 e.sndQueueInfo.SndBufUsed = 0 1027 e.sndQueueInfo.SndClosed = true 1028 } 1029 } 1030 1031 // Abort implements stack.TransportEndpoint.Abort. 1032 func (e *endpoint) Abort() { 1033 defer e.drainClosingSegmentQueue() 1034 e.LockUser() 1035 defer e.UnlockUser() 1036 defer e.purgeReadQueue() 1037 // Reset all connected endpoints. 1038 switch state := e.EndpointState(); { 1039 case state.connected(): 1040 e.resetConnectionLocked(&tcpip.ErrAborted{}) 1041 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1042 return 1043 } 1044 e.closeLocked() 1045 } 1046 1047 // Close puts the endpoint in a closed state and frees all resources associated 1048 // with it. It must be called only once and with no other concurrent calls to 1049 // the endpoint. 1050 func (e *endpoint) Close() { 1051 e.LockUser() 1052 if e.closed { 1053 e.UnlockUser() 1054 return 1055 } 1056 1057 // We always want to purge the read queue, but do so after the checks in 1058 // shutdownLocked. 
1059 e.closeLocked() 1060 e.purgeReadQueue() 1061 if e.EndpointState() == StateClose || e.EndpointState() == StateError { 1062 // It should be safe to purge the read queue now as the endpoint 1063 // is now closed or in an error state and further reads are not 1064 // permitted. 1065 e.UnlockUser() 1066 e.drainClosingSegmentQueue() 1067 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1068 return 1069 } 1070 e.UnlockUser() 1071 } 1072 1073 // +checklocks:e.mu 1074 func (e *endpoint) closeLocked() { 1075 linger := e.SocketOptions().GetLinger() 1076 if linger.Enabled && linger.Timeout == 0 { 1077 s := e.EndpointState() 1078 isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv 1079 if isResetState { 1080 // Close the endpoint without doing full shutdown and 1081 // send a RST. 1082 e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) 1083 return 1084 } 1085 } 1086 1087 // Issue a shutdown so that the peer knows we won't send any more data 1088 // if we're connected, or stop accepting if we're listening. 1089 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1090 e.closeNoShutdownLocked() 1091 } 1092 1093 // closeNoShutdown closes the endpoint without doing a full shutdown. 1094 // +checklocks:e.mu 1095 func (e *endpoint) closeNoShutdownLocked() { 1096 // For listening sockets, we always release ports inline so that they 1097 // are immediately available for reuse after Close() is called. If also 1098 // registered, we unregister as well otherwise the next user would fail 1099 // in Listen() when trying to register. 
1100 if e.EndpointState() == StateListen && e.isPortReserved { 1101 if e.isRegistered { 1102 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1103 e.isRegistered = false 1104 } 1105 1106 portRes := ports.Reservation{ 1107 Networks: e.effectiveNetProtos, 1108 Transport: ProtocolNumber, 1109 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1110 Port: e.TransportEndpointInfo.ID.LocalPort, 1111 Flags: e.boundPortFlags, 1112 BindToDevice: e.boundBindToDevice, 1113 Dest: e.boundDest, 1114 } 1115 e.stack.ReleasePort(portRes) 1116 e.isPortReserved = false 1117 e.boundBindToDevice = 0 1118 e.boundPortFlags = ports.Flags{} 1119 e.boundDest = tcpip.FullAddress{} 1120 } 1121 1122 // Mark endpoint as closed. 1123 e.closed = true 1124 tcpip.AddDanglingEndpoint(e) 1125 1126 eventMask := waiter.ReadableEvents | waiter.WritableEvents 1127 1128 switch e.EndpointState() { 1129 case StateInitial, StateBound, StateListen: 1130 e.setEndpointState(StateClose) 1131 fallthrough 1132 case StateClose, StateError: 1133 eventMask |= waiter.EventHUp 1134 e.cleanupLocked() 1135 case StateConnecting, StateSynSent, StateSynRecv: 1136 // Abort the handshake and set the error. 1137 // Notify that the endpoint is closed. 1138 eventMask |= waiter.EventHUp 1139 e.handshakeFailed(&tcpip.ErrAborted{}) 1140 // Notify that the endpoint is closed. 1141 eventMask |= waiter.EventHUp 1142 case StateFinWait2: 1143 // The socket has been closed and we are in FIN-WAIT-2 so start 1144 // the FIN-WAIT-2 timer. 1145 if e.finWait2Timer == nil { 1146 e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) 1147 } 1148 } 1149 1150 e.waiterQueue.Notify(eventMask) 1151 } 1152 1153 // closePendingAcceptableConnections closes all connections that have completed 1154 // handshake but not yet been delivered to the application. 
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
	// Snapshot and empty both accept queues under acceptMu, then abort the
	// endpoints outside the lock (Abort acquires each endpoint's own lock).
	e.acceptMu.Lock()

	pendingEndpoints := e.acceptQueue.pendingEndpoints
	e.acceptQueue.pendingEndpoints = nil

	completedEndpoints := make([]*endpoint, 0, e.acceptQueue.endpoints.Len())
	for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() {
		completedEndpoints = append(completedEndpoints, n.Value.(*endpoint))
	}
	e.acceptQueue.endpoints.Init()
	e.acceptQueue.capacity = 0
	e.acceptMu.Unlock()

	// Close any endpoints in SYN-RCVD state.
	for n := range pendingEndpoints {
		n.Abort()
	}

	// Reset all connections that are waiting to be accepted.
	for _, n := range completedEndpoints {
		n.Abort()
	}
}

// cleanupLocked frees all resources associated with the endpoint.
// +checklocks:e.mu
func (e *endpoint) cleanupLocked() {
	// Stop all timers before tearing down state so no callback fires
	// against a partially cleaned-up endpoint.
	if e.snd != nil {
		e.snd.resendTimer.cleanup()
		e.snd.probeTimer.cleanup()
		e.snd.reorderTimer.cleanup()
	}

	if e.finWait2Timer != nil {
		e.finWait2Timer.Stop()
	}

	if e.timeWaitTimer != nil {
		e.timeWaitTimer.Stop()
	}

	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.purgeWriteQueue()
	// Only purge the read queue here if the socket is fully closed by the
	// user.
	if e.closed {
		e.purgeReadQueue()
	}
	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}

// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}

// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
func (e *endpoint) initialReceiveWindow() int {
	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
	// The window field in an unscaled SYN/SYN-ACK is at most 16 bits.
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	rcvWndScale := e.rcvWndScaleForHandshake()

	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
	// window offered in SYN won't be reduced due to the loss of precision if
	// window scaling is enabled after the handshake.
	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)

	// Ensure we can always accept at least 1 byte if the scale specified
	// was too high for the provided rcvWnd.
	if rcvWnd == 0 {
		rcvWnd = 1
	}

	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	sendNonZeroWindowUpdate := false

	e.rcvQueueMu.Lock()
	if e.RcvAutoParams.Disabled {
		e.rcvQueueMu.Unlock()
		return
	}
	// Accumulate copied bytes until at least one RTT has elapsed since the
	// last measurement; adjustments are made once per RTT.
	now := e.stack.Clock().NowMonotonic()
	if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt {
		e.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows it's congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive upto 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				sendNonZeroWindowUpdate = true
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	e.RcvAutoParams.MeasureTime = now
	e.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueMu.Unlock()

	// Send the update after unlocking rcvQueueMu as sending a segment acquires
	// the lock to calculate the window to be sent.
	if e.EndpointState().connected() && sendNonZeroWindowUpdate {
		e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
	}
}

// SetOwner implements tcpip.Endpoint.SetOwner.
func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}

// hardErrorLocked reads and clears the terminal (hard) error, so it is
// reported to the caller at most once.
// +checklocks:e.mu
func (e *endpoint) hardErrorLocked() tcpip.Error {
	err := e.hardError
	e.hardError = nil
	return err
}

// lastErrorLocked reads and clears the last (soft) error, so it is reported
// to the caller at most once.
// +checklocks:e.mu
func (e *endpoint) lastErrorLocked() tcpip.Error {
	e.lastErrorMu.Lock()
	defer e.lastErrorMu.Unlock()
	err := e.lastError
	e.lastError = nil
	return err
}

// LastError implements tcpip.Endpoint.LastError. A hard error, if present,
// takes precedence over the last soft error; both are cleared on read.
func (e *endpoint) LastError() tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	if err := e.hardErrorLocked(); err != nil {
		return err
	}
	return e.lastErrorLocked()
}

// LastErrorLocked reads and clears lastError.
// Only to be used in tests.
// +checklocks:e.mu
func (e *endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}

// UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
func (e *endpoint) UpdateLastError(err tcpip.Error) {
	e.LockUser()
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()
	e.UnlockUser()
}

// Read implements tcpip.Endpoint.Read.
func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if err := e.checkReadLocked(); err != nil {
		if _, ok := err.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, err
	}

	var err error
	done := 0
	// N.B. Here we get the first segment to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold e.mu to ensure we only
	// remove segments from the list through Read() and that new segments
	// cannot be appended.
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// Peeking never removes segments; just walk the list.
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Drop fully-consumed segments from the front of the queue.
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// checkReadLocked checks that endpoint is in a readable state.
//
// +checklocks:e.mu
func (e *endpoint) checkReadLocked() tcpip.Error {
	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()
	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	bufUsed := e.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			if err := e.hardErrorLocked(); err != nil {
				return err
			}
			return &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return &tcpip.ErrNotConnected{}
	}

	if e.RcvBufUsed == 0 {
		if e.RcvClosed || !e.EndpointState().connected() {
			return &tcpip.ErrClosedForReceive{}
		}
		return &tcpip.ErrWouldBlock{}
	}

	return nil
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReader(p, int64(avail)); err != nil {
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
// +checklocks:e.mu
func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	// NOTE: for non-atomic writes readFromPayloader drops and reacquires
	// both e.mu and sndQueueMu while copying, so state may change before
	// the re-check below.
	buf, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}

	// Do not queue zero length segments.
	if buf.Size() == 0 {
		return nil, 0, nil
	}

	if !opts.Atomic {
		// Since we released locks in between it's possible that the
		// endpoint transitioned to a CLOSED/ERROR states so make
		// sure endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			buf.Release()
			return nil, 0, err
		}

		// A simultaneous call to write on the socket can reduce avail. Discard
		// excess data copied if this is the case.
		if int64(avail) < buf.Size() {
			buf.Truncate(int64(avail))
		}
	}

	// Add data to the send queue.
	size := int(buf.Size())
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf)
	e.sndQueueInfo.SndBufUsed += size
	e.snd.writeList.PushBack(s)

	return s, size, nil
}

// Write writes data to the endpoint's peer.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.LockUser()
	defer e.UnlockUser()

	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out socket buffer space before we can fill the
	// previous window we advertised. In cases where we receive MSS sized or close
	// MSS sized segments we will probably run out of window space before we
	// exhaust receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
// +checklocks:e.mu
func (e *endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large. We also want to ensure a
// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
// receive buffer size. This is chosen arbitrarily.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked(rcvBufSize))
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size that
	// is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}

	switch {
	case oldAvail < threshold && newAvail >= threshold:
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
func (e *endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
func (e *endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
1742 func (e *endpoint) OnKeepAliveSet(bool) { 1743 e.LockUser() 1744 e.resetKeepaliveTimer(true /* receivedData */) 1745 e.UnlockUser() 1746 } 1747 1748 // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. 1749 func (e *endpoint) OnDelayOptionSet(v bool) { 1750 if !v { 1751 e.LockUser() 1752 defer e.UnlockUser() 1753 // Handle delayed data. 1754 if e.EndpointState().connected() { 1755 e.sendData(nil /* next */) 1756 } 1757 } 1758 } 1759 1760 // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. 1761 func (e *endpoint) OnCorkOptionSet(v bool) { 1762 if !v { 1763 e.LockUser() 1764 defer e.UnlockUser() 1765 // Handle the corked data. 1766 if e.EndpointState().connected() { 1767 e.sendData(nil /* next */) 1768 } 1769 } 1770 } 1771 1772 func (e *endpoint) getSendBufferSize() int { 1773 return int(e.ops.GetSendBufferSize()) 1774 } 1775 1776 // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize. 1777 func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) { 1778 e.LockUser() 1779 1780 sendNonZeroWindowUpdate := false 1781 e.rcvQueueMu.Lock() 1782 1783 // Make sure the receive buffer size allows us to send a 1784 // non-zero window size. 1785 scale := uint8(0) 1786 if e.rcv != nil { 1787 scale = e.rcv.RcvWndScale 1788 } 1789 if rcvBufSz>>scale == 0 { 1790 rcvBufSz = 1 << scale 1791 } 1792 1793 availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz))) 1794 availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz))) 1795 e.RcvAutoParams.Disabled = true 1796 1797 // Immediately send an ACK to uncork the sender silly window 1798 // syndrome prevetion, when our available space grows above aMSS 1799 // or half receive buffer, whichever smaller. 
1800 if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above { 1801 sendNonZeroWindowUpdate = true 1802 } 1803 1804 e.rcvQueueMu.Unlock() 1805 1806 postSet = func() { 1807 e.LockUser() 1808 defer e.UnlockUser() 1809 if e.EndpointState().connected() && sendNonZeroWindowUpdate { 1810 e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu 1811 } 1812 1813 } 1814 e.UnlockUser() 1815 return rcvBufSz, postSet 1816 } 1817 1818 // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. 1819 func (e *endpoint) OnSetSendBufferSize(sz int64) int64 { 1820 e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1) 1821 return sz 1822 } 1823 1824 // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. 1825 func (e *endpoint) WakeupWriters() { 1826 e.LockUser() 1827 defer e.UnlockUser() 1828 1829 sendBufferSize := e.getSendBufferSize() 1830 e.sndQueueInfo.sndQueueMu.Lock() 1831 notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1 1832 e.sndQueueInfo.sndQueueMu.Unlock() 1833 1834 if notify { 1835 e.waiterQueue.Notify(waiter.WritableEvents) 1836 } 1837 } 1838 1839 // SetSockOptInt sets a socket option. 1840 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { 1841 // Lower 2 bits represents ECN bits. RFC 3168, section 23.1 1842 const inetECNMask = 3 1843 1844 switch opt { 1845 case tcpip.KeepaliveCountOption: 1846 e.LockUser() 1847 e.keepalive.Lock() 1848 e.keepalive.count = v 1849 e.keepalive.Unlock() 1850 e.resetKeepaliveTimer(true /* receivedData */) 1851 e.UnlockUser() 1852 1853 case tcpip.IPv4TOSOption: 1854 e.LockUser() 1855 // TODO(gvisor.dev/issue/995): ECN is not currently supported, 1856 // ignore the bits for now. 
1857 e.sendTOS = uint8(v) & ^uint8(inetECNMask) 1858 e.UnlockUser() 1859 1860 case tcpip.IPv6TrafficClassOption: 1861 e.LockUser() 1862 // TODO(gvisor.dev/issue/995): ECN is not currently supported, 1863 // ignore the bits for now. 1864 e.sendTOS = uint8(v) & ^uint8(inetECNMask) 1865 e.UnlockUser() 1866 1867 case tcpip.MaxSegOption: 1868 userMSS := v 1869 if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { 1870 return &tcpip.ErrInvalidOptionValue{} 1871 } 1872 e.LockUser() 1873 e.userMSS = uint16(userMSS) 1874 e.UnlockUser() 1875 1876 case tcpip.MTUDiscoverOption: 1877 // Return not supported if attempting to set this option to 1878 // anything other than path MTU discovery disabled. 1879 if v != tcpip.PMTUDiscoveryDont { 1880 return &tcpip.ErrNotSupported{} 1881 } 1882 1883 case tcpip.IPv4TTLOption: 1884 e.LockUser() 1885 e.ipv4TTL = uint8(v) 1886 e.UnlockUser() 1887 1888 case tcpip.IPv6HopLimitOption: 1889 e.LockUser() 1890 e.ipv6HopLimit = int16(v) 1891 e.UnlockUser() 1892 1893 case tcpip.TCPSynCountOption: 1894 if v < 1 || v > 255 { 1895 return &tcpip.ErrInvalidOptionValue{} 1896 } 1897 e.LockUser() 1898 e.maxSynRetries = uint8(v) 1899 e.UnlockUser() 1900 1901 case tcpip.TCPWindowClampOption: 1902 if v == 0 { 1903 e.LockUser() 1904 switch e.EndpointState() { 1905 case StateClose, StateInitial: 1906 e.windowClamp = 0 1907 e.UnlockUser() 1908 return nil 1909 default: 1910 e.UnlockUser() 1911 return &tcpip.ErrInvalidOptionValue{} 1912 } 1913 } 1914 var rs tcpip.TCPReceiveBufferSizeRangeOption 1915 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 1916 if v < rs.Min/2 { 1917 v = rs.Min / 2 1918 } 1919 } 1920 e.LockUser() 1921 e.windowClamp = uint32(v) 1922 e.UnlockUser() 1923 } 1924 return nil 1925 } 1926 1927 func (e *endpoint) HasNIC(id int32) bool { 1928 return id == 0 || e.stack.HasNIC(tcpip.NICID(id)) 1929 } 1930 1931 // SetSockOpt sets a socket option. 
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		// Re-arm the keepalive timer so the new idle period takes effect
		// immediately.
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.KeepaliveIntervalOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				switch state {
				case StateEstablished:
					// Only re-initialize the sender's congestion control
					// machinery on an established connection.
					if e.EndpointState() == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to Stack's default TCP_LINGER2 timeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		// The deferred-accept period is capped at the maximum RTO.
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		// Unrecognized options are silently accepted.
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
func (e *endpoint) readyReceiveSize() (int, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The endpoint cannot be in listen state.
	if e.EndpointState() == StateListen {
		return 0, &tcpip.ErrInvalidEndpointState{}
	}

	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()

	return e.RcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		v := e.keepalive.count
		e.keepalive.Unlock()
		return v, nil

	case tcpip.IPv4TOSOption:
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// The same sendTOS field backs both the IPv4 TOS and the IPv6
		// traffic class (see SetSockOptInt).
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.MaxSegOption:
		// Linux only returns user_mss value if user_mss is set and the socket is
		// unconnected. Otherwise Linux returns the actual current MSS. Netstack
		// mimics the user_mss behavior, but otherwise just returns the defaultMSS
		// for now.
		v := header.TCPDefaultMSS
		e.LockUser()
		if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) {
			v = int(e.userMSS)
		}
		e.UnlockUser()
		return v, nil

	case tcpip.MTUDiscoverOption:
		// Always return the path MTU discovery disabled setting since
		// it's the only one supported.
		return tcpip.PMTUDiscoveryDont, nil

	case tcpip.ReceiveQueueSizeOption:
		return e.readyReceiveSize()

	case tcpip.IPv4TTLOption:
		e.LockUser()
		v := int(e.ipv4TTL)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6HopLimitOption:
		e.LockUser()
		v := int(e.ipv6HopLimit)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPSynCountOption:
		e.LockUser()
		v := int(e.maxSynRetries)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPWindowClampOption:
		e.LockUser()
		v := int(e.windowClamp)
		e.UnlockUser()
		return v, nil

	case tcpip.MulticastTTLOption:
		return 1, nil

	default:
		return -1, &tcpip.ErrUnknownProtocolOption{}
	}
}

// getTCPInfo returns a snapshot of connection information (state, RTT
// estimates, RTO and congestion-control counters) for TCP_INFO, taken under
// the user lock.
func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption {
	info := tcpip.TCPInfoOption{}
	e.LockUser()
	if state := e.EndpointState(); state.internal() {
		// Netstack-internal states have no Linux equivalent; report CLOSE.
		info.State = tcpip.EndpointState(StateClose)
	} else {
		info.State = tcpip.EndpointState(state)
	}
	snd := e.snd
	if snd != nil {
		// We do not calculate RTT before sending the data packets. If
		// the connection did not send and receive data, then RTT will
		// be zero.
		snd.rtt.Lock()
		info.RTT = snd.rtt.TCPRTTState.SRTT
		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
		snd.rtt.Unlock()

		info.RTO = snd.RTO
		info.CcState = snd.state
		info.SndSsthresh = uint32(snd.Ssthresh)
		info.SndCwnd = uint32(snd.SndCwnd)
		info.ReorderSeen = snd.rc.Reord
	}
	e.UnlockUser()
	return info
}

// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
	switch o := opt.(type) {
	case *tcpip.TCPInfoOption:
		*o = e.getTCPInfo()

	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
		e.keepalive.Unlock()

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
		e.keepalive.Unlock()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		e.LockUser()
		*o = e.cc
		e.UnlockUser()

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()
		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
		e.UnlockUser()

	case *tcpip.OriginalDestinationOption:
		// Consult iptables for the pre-NAT destination of this flow.
		e.LockUser()
		ipt := e.stack.IPTables()
		addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber)
		e.UnlockUser()
		if err != nil {
			return err
		}
		*o = tcpip.OriginalDestinationOption{
			Addr: addr,
			Port: port,
		}

	default:
		return &tcpip.ErrUnknownProtocolOption{}
	}
	return nil
}

// checkV4MappedLocked determines the effective network protocol and converts
// addr to its canonical form.
// +checklocks:e.mu
func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
	if err != nil {
		return tcpip.FullAddress{}, 0, err
	}
	return unwrapped, netProto, nil
}

// Disconnect implements tcpip.Endpoint.Disconnect.
func (*endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	err := e.connect(addr, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// registerEndpoint registers the endpoint with the provided address.
//
// +checklocks:e.mu
func (e *endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error {
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		// Calculate a port offset based on the destination IP/port and
		// src IP to ensure that for a given tuple (srcIP, destIP,
		// destPort) the offset used as a starting point is the same to
		// ensure that we can cycle through the port space effectively.
		portBuf := make([]byte, 2)
		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)

		h := jenkins.Sum32(e.protocol.portOffsetSecret)
		for _, s := range [][]byte{
			e.ID.LocalAddress.AsSlice(),
			e.ID.RemoteAddress.AsSlice(),
			portBuf,
		} {
			// Per io.Writer.Write:
			//
			// Write must return a non-nil error if it returns n < len(p).
			if _, err := h.Write(s); err != nil {
				panic(err)
			}
		}
		portOffset := h.Sum32()

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// Decide whether TIME-WAIT endpoints may be reclaimed for this
		// connection (globally, or only for loopback<->loopback flows).
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID)
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.transitionToStateCloseLocked()
				tcpEP.drainClosingSegmentQueue()
				tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed; give the reservation back before
				// deciding whether to keep probing.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}
	return nil
}

// connect connects the endpoint to its peer. When handshake is false (the
// save/restore path) the endpoint is moved straight to Established without a
// SYN exchange. On success it returns *tcpip.ErrConnectStarted.
// +checklocks:e.mu
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error {
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrHostUnreachable{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	oldState := e.EndpointState()
	e.setEndpointState(StateConnecting)
	if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil {
		// Roll the state back so a later Connect can retry.
		e.setEndpointState(oldState)
		if _, ok := err.(*tcpip.ErrPortInUse); ok {
			return &tcpip.ErrBadLocalAddress{}
		}
		return err
	}

	e.isRegistered = true
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto}
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		// Re-stamp queued segments with the (possibly updated) ID.
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.ep.AssertLockHeld(e)
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
		return &tcpip.ErrConnectStarted{}
	}

	// Start a new handshake.
	h := e.newHandshake()
	e.setEndpointState(StateSynSent)
	h.start()
	e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()

	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState().connecting() {
		// When calling shutdown(2) on a connecting socket, the endpoint must
		// enter the error state. But this logic cannot belong to the shutdownLocked
		// method because that method is called during a close(2) (and closing a
		// connecting socket is not an error).
		e.handshakeFailed(&tcpip.ErrConnectionReset{})
		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		return nil
	}

	return e.shutdownLocked(flags)
}

// shutdownLocked closes the side(s) of the connection indicated by flags,
// accumulating them into e.shutdownFlags.
// +checklocks:e.mu
func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that maybe waiting for the stream to become
			// readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, write end won't close and
				// we end up with a half-closed connection
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that maybe waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *endpoint) Listen(backlog int) tcpip.Error {
	if err := e.listen(backlog); err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
		return err
	}
	return nil
}

// listen performs the actual work of Listen: binding if necessary,
// registering with the demuxer and initializing the accept queue and
// listening context. It also handles re-Listen on an already-listening
// endpoint (backlog adjustment).
func (e *endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
		}

		// Undo any prior shutdown so the listener accepts again.
		e.shutdownFlags = 0
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
	}
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// addr if not-nil will contain the peer address of the returned endpoint.
func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueMu.Lock()
	rcvClosed := e.RcvClosed
	e.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *endpoint
	e.acceptMu.Lock()
	if element := e.acceptQueue.endpoints.Front(); element != nil {
		n = e.acceptQueue.endpoints.Remove(element).(*endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

// bindLocked reserves a local port (and optionally address) for the endpoint
// and moves it to StateBound.
// +checklocks:e.mu
func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.Rand(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.LocalAddress,
		Port: e.TransportEndpointInfo.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

// getRemoteAddress returns the peer address without any locking or state
// validation.
func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

// HandlePacket implements stack.TransportEndpoint as a no-op.
func (*endpoint) HandlePacket(stack.TransportEndpointID, stack.PacketBufferPtr) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher which then can either deliver using the
	// worker go routine or directly do the invoke the tcp processing inline
	// based on the state of the endpoint.
}

// enqueueSegment queues s for processing by the worker goroutine, returning
// false (and accounting the drop) if the segment queue is full.
func (e *endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

// onICMPError records err as the endpoint's last error, queues a SockError if
// the corresponding RecvError socket option is enabled, and aborts the
// endpoint if it is still connecting.
func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt stack.PacketBufferPtr) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	var recvErr bool
	switch pkt.NetworkProtocolNumber {
	case header.IPv4ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv4RecvError()
	case header.IPv6ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv6RecvError()
	default:
		panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber))
	}

	if recvErr {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	if e.EndpointState().connecting() {
		e.mu.Lock()
		if lEP := e.h.listenEP; lEP != nil {
			// Remove from listening endpoints pending list.
			lEP.acceptMu.Lock()
			delete(lEP.acceptQueue.pendingEndpoints, e)
			lEP.acceptMu.Unlock()
			lEP.stats.FailedConnectionAttempts.Increment()
		}
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.cleanupLocked()
		e.hardError = err
		e.setEndpointState(StateError)
		e.mu.Unlock()
		e.drainClosingSegmentQueue()
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
	}
}

// HandleError implements stack.TransportEndpoint.
2923 func (e *endpoint) HandleError(transErr stack.TransportError, pkt stack.PacketBufferPtr) { 2924 handlePacketTooBig := func(mtu uint32) { 2925 e.sndQueueInfo.sndQueueMu.Lock() 2926 update := false 2927 if v := int(mtu); v < e.sndQueueInfo.SndMTU { 2928 e.sndQueueInfo.SndMTU = v 2929 update = true 2930 } 2931 newMTU := e.sndQueueInfo.SndMTU 2932 e.sndQueueInfo.sndQueueMu.Unlock() 2933 if update { 2934 e.mu.Lock() 2935 defer e.mu.Unlock() 2936 if e.snd != nil { 2937 e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu 2938 } 2939 } 2940 } 2941 2942 // TODO(gvisor.dev/issues/5270): Handle all transport errors. 2943 switch transErr.Kind() { 2944 case stack.PacketTooBigTransportError: 2945 handlePacketTooBig(transErr.Info()) 2946 case stack.DestinationHostUnreachableTransportError: 2947 e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt) 2948 case stack.DestinationNetworkUnreachableTransportError: 2949 e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) 2950 case stack.DestinationPortUnreachableTransportError: 2951 e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) 2952 case stack.DestinationProtoUnreachableTransportError: 2953 e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt) 2954 case stack.SourceRouteFailedTransportError: 2955 e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt) 2956 case stack.SourceHostIsolatedTransportError: 2957 e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt) 2958 case stack.DestinationHostDownTransportError: 2959 e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt) 2960 } 2961 } 2962 2963 // updateSndBufferUsage is called by the protocol goroutine when room opens up 2964 // in the send buffer. The number of newly available bytes is v. 
2965 func (e *endpoint) updateSndBufferUsage(v int) { 2966 sendBufferSize := e.getSendBufferSize() 2967 e.sndQueueInfo.sndQueueMu.Lock() 2968 notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1 2969 e.sndQueueInfo.SndBufUsed -= v 2970 2971 // Get the new send buffer size with auto tuning, but do not set it 2972 // unless we decide to notify the writers. 2973 newSndBufSz := e.computeTCPSendBufferSize() 2974 2975 // We only notify when there is half the sendBufferSize available after 2976 // a full buffer event occurs. This ensures that we don't wake up 2977 // writers to queue just 1-2 segments and go back to sleep. 2978 notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1 2979 e.sndQueueInfo.sndQueueMu.Unlock() 2980 2981 if notify { 2982 // Set the new send buffer size calculated from auto tuning. 2983 e.ops.SetSendBufferSize(newSndBufSz, false /* notify */) 2984 e.waiterQueue.Notify(waiter.WritableEvents) 2985 } 2986 } 2987 2988 // readyToRead is called by the protocol goroutine when a new segment is ready 2989 // to be read, or when the connection is closed for receiving (in which case 2990 // s will be nil). 2991 // 2992 // +checklocks:e.mu 2993 func (e *endpoint) readyToRead(s *segment) { 2994 e.rcvQueueMu.Lock() 2995 if s != nil { 2996 e.RcvBufUsed += s.payloadSize() 2997 s.IncRef() 2998 e.rcvQueue.PushBack(s) 2999 } else { 3000 e.RcvClosed = true 3001 } 3002 e.rcvQueueMu.Unlock() 3003 e.waiterQueue.Notify(waiter.ReadableEvents) 3004 } 3005 3006 // receiveBufferAvailableLocked calculates how many bytes are still available 3007 // in the receive buffer. 3008 // +checklocks:e.rcvQueueMu 3009 func (e *endpoint) receiveBufferAvailableLocked(rcvBufSize int) int { 3010 // We may use more bytes than the buffer size when the receive buffer 3011 // shrinks. 
3012 memUsed := e.receiveMemUsed() 3013 if memUsed >= rcvBufSize { 3014 return 0 3015 } 3016 3017 return rcvBufSize - memUsed 3018 } 3019 3020 // receiveBufferAvailable calculates how many bytes are still available in the 3021 // receive buffer based on the actual memory used by all segments held in 3022 // receive buffer/pending and segment queue. 3023 func (e *endpoint) receiveBufferAvailable() int { 3024 e.rcvQueueMu.Lock() 3025 available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize())) 3026 e.rcvQueueMu.Unlock() 3027 return available 3028 } 3029 3030 // receiveBufferUsed returns the amount of in-use receive buffer. 3031 func (e *endpoint) receiveBufferUsed() int { 3032 e.rcvQueueMu.Lock() 3033 used := e.RcvBufUsed 3034 e.rcvQueueMu.Unlock() 3035 return used 3036 } 3037 3038 // receiveMemUsed returns the total memory in use by segments held by this 3039 // endpoint. 3040 func (e *endpoint) receiveMemUsed() int { 3041 return int(e.rcvMemUsed.Load()) 3042 } 3043 3044 // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. 3045 func (e *endpoint) updateReceiveMemUsed(delta int) { 3046 e.rcvMemUsed.Add(int32(delta)) 3047 } 3048 3049 // maxReceiveBufferSize returns the stack wide maximum receive buffer size for 3050 // an endpoint. 3051 func (e *endpoint) maxReceiveBufferSize() int { 3052 var rs tcpip.TCPReceiveBufferSizeRangeOption 3053 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { 3054 // As a fallback return the hardcoded max buffer size. 
3055 return MaxBufferSize 3056 } 3057 return rs.Max 3058 } 3059 3060 // directionState returns the close state of send and receive part of the endpoint 3061 func (e *endpoint) connDirectionState() connDirectionState { 3062 return connDirectionState(e.connectionDirectionState.Load()) 3063 } 3064 3065 // updateDirectionState updates the close state of send and receive part of the endpoint 3066 func (e *endpoint) updateConnDirectionState(state connDirectionState) connDirectionState { 3067 return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state))) 3068 } 3069 3070 // rcvWndScaleForHandshake computes the receive window scale to offer to the 3071 // peer when window scaling is enabled (true by default). If auto-tuning is 3072 // disabled then the window scaling factor is based on the size of the 3073 // receiveBuffer otherwise we use the max permissible receive buffer size to 3074 // compute the scale. 3075 func (e *endpoint) rcvWndScaleForHandshake() int { 3076 bufSizeForScale := e.ops.GetReceiveBufferSize() 3077 3078 e.rcvQueueMu.Lock() 3079 autoTuningDisabled := e.RcvAutoParams.Disabled 3080 e.rcvQueueMu.Unlock() 3081 if autoTuningDisabled { 3082 return FindWndScale(seqnum.Size(bufSizeForScale)) 3083 } 3084 3085 return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) 3086 } 3087 3088 // updateRecentTimestamp updates the recent timestamp using the algorithm 3089 // described in https://tools.ietf.org/html/rfc7323#section-4.3 3090 func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { 3091 if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { 3092 e.setRecentTimestamp(tsVal) 3093 } 3094 } 3095 3096 // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if 3097 // the SYN options indicate that timestamp option was negotiated. 
It also 3098 // initializes the recentTS with the value provided in synOpts.TSval. 3099 func (e *endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) { 3100 if synOpts.TS { 3101 e.SendTSOk = true 3102 e.setRecentTimestamp(synOpts.TSVal) 3103 } 3104 } 3105 3106 func (e *endpoint) tsVal(now tcpip.MonotonicTime) uint32 { 3107 return e.TSOffset.TSVal(now) 3108 } 3109 3110 func (e *endpoint) tsValNow() uint32 { 3111 return e.tsVal(e.stack.Clock().NowMonotonic()) 3112 } 3113 3114 func (e *endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { 3115 return e.TSOffset.Elapsed(now, tsEcr) 3116 } 3117 3118 // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint 3119 // if the SYN options indicate that the SACK option was negotiated and the TCP 3120 // stack is configured to enable TCP SACK option. 3121 func (e *endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) { 3122 var v tcpip.TCPSACKEnabled 3123 if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { 3124 // Stack doesn't support SACK. So just return. 3125 return 3126 } 3127 if bool(v) && synOpts.SACKPermitted { 3128 e.SACKPermitted = true 3129 e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) 3130 } 3131 } 3132 3133 // maxOptionSize return the maximum size of TCP options. 3134 func (e *endpoint) maxOptionSize() (size int) { 3135 var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock 3136 options := e.makeOptions(maxSackBlocks[:]) 3137 size = len(options) 3138 putOptions(options) 3139 3140 return size 3141 } 3142 3143 // completeStateLocked makes a full copy of the endpoint and returns it. This is 3144 // used before invoking the probe. 
//
// +checklocks:e.mu
func (e *endpoint) completeStateLocked(s *stack.TCPEndpointState) {
	s.TCPEndpointStateInner = e.TCPEndpointStateInner
	s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID)
	s.SegTime = e.stack.Clock().NowMonotonic()
	s.Receiver = e.rcv.TCPReceiverState
	s.Sender = e.snd.TCPSenderState

	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	e.sndQueueInfo.CloneState(&s.SndBufState)
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueMu.Lock()
	s.RcvBufState = e.TCPRcvBufState
	e.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// RTT state is guarded by its own lock, taken after the buffer locks
	// above have been released.
	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	s.Sender.RetransmitTS = e.snd.retransmitTS
	s.Sender.SpuriousRecovery = e.snd.spuriousRecovery
}

// initHostGSO configures the endpoint for host (hardware) GSO based on the
// route's network protocol.
func (e *endpoint) initHostGSO() {
	switch e.route.NetProto() {
	case header.IPv4ProtocolNumber:
		e.gso.Type = stack.GSOTCPv4
		e.gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		e.gso.Type = stack.GSOTCPv6
		e.gso.L3HdrLen = header.IPv6MinimumSize
	default:
		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
	}
	e.gso.NeedsCsum = true
	e.gso.CsumOffset = header.TCPChecksumOffset
	e.gso.MaxSize = e.route.GSOMaxSize()
}

// initGSO selects host GSO when the route supports it, falling back to
// gVisor (software) GSO; otherwise leaves GSO disabled.
func (e *endpoint) initGSO() {
	if e.route.HasHostGSOCapability() {
		e.initHostGSO()
	} else if e.route.HasGvisorGSOCapability() {
		e.gso = stack.GSO{
			MaxSize:   e.route.GSOMaxSize(),
			Type:      stack.GSOGvisor,
			NeedsCsum: false,
		}
	}
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
	return uint32(e.EndpointState())
}

// Info returns a copy of the endpoint info.
func (e *endpoint) Info() tcpip.EndpointInfo {
	e.LockUser()
	// Make a copy of the endpoint info.
	ret := e.TransportEndpointInfo
	e.UnlockUser()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}

// Wait implements stack.TransportEndpoint.Wait. It blocks until the endpoint
// reports a hang-up event, returning immediately if the endpoint is already
// closed or errored.
func (e *endpoint) Wait() {
	waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp)
	e.waiterQueue.EventRegister(&waitEntry)
	defer e.waiterQueue.EventUnregister(&waitEntry)
	// Register before checking state so a concurrent transition to
	// Close/Error cannot be missed.
	switch e.EndpointState() {
	case StateClose, StateError:
		return
	}
	<-notifyCh
}

// SocketOptions implements tcpip.Endpoint.SocketOptions.
func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}

// GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption {
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		// The stack must always support the TCP buffer size option.
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.SendBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
func (e *endpoint) allowOutOfWindowAck() bool {
	now := e.stack.Clock().NowMonotonic()

	// Rate-limit out-of-window ACKs using the stack-wide invalid-rate
	// limit; the zero time means no out-of-window ACK has been sent yet.
	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
		var limit stack.TCPInvalidRateLimitOption
		if err := e.stack.Option(&limit); err != nil {
			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
		}
		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
			return false
		}
	}

	e.lastOutOfWindowAckTime = now
	return true
}

// GetTCPReceiveBufferLimits is used to get receive buffer size limits for
// TCP.
func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
	var ss tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		// The stack must always support the TCP buffer size option.
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.ReceiveBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// computeTCPSendBufferSize implements auto tuning of send buffer size and
// returns the new send buffer size.
func (e *endpoint) computeTCPSendBufferSize() int64 {
	curSndBufSz := int64(e.getSendBufferSize())

	// Auto tuning is disabled when the user explicitly sets the send
	// buffer size with SO_SNDBUF option.
	if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 {
		return curSndBufSz
	}

	const packetOverheadFactor = 2
	curMSS := e.snd.MaxPayloadSize
	// Use at least the initial congestion window worth of segments.
	numSeg := InitialCwnd
	if numSeg < e.snd.SndCwnd {
		numSeg = e.snd.SndCwnd
	}

	// SndCwnd indicates the number of segments that can be sent. This means
	// that the sender can send up to #SndCwnd segments and the send buffer
	// size should be set to SndCwnd*MSS to accommodate sending of all the
	// segments.
	newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor)
	// Never shrink the buffer below its current size, and clamp the growth
	// to the stack-wide maximum.
	if newSndBufSz < curSndBufSz {
		return curSndBufSz
	}
	if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz {
		newSndBufSz = int64(ss.Max)
	}

	return newSndBufSz
}