github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/transport/tcp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "container/heap" 19 "fmt" 20 "io" 21 "math" 22 "runtime" 23 "strings" 24 "time" 25 26 "github.com/metacubex/gvisor/pkg/atomicbitops" 27 "github.com/metacubex/gvisor/pkg/buffer" 28 "github.com/metacubex/gvisor/pkg/sleep" 29 "github.com/metacubex/gvisor/pkg/sync" 30 "github.com/metacubex/gvisor/pkg/tcpip" 31 "github.com/metacubex/gvisor/pkg/tcpip/header" 32 "github.com/metacubex/gvisor/pkg/tcpip/ports" 33 "github.com/metacubex/gvisor/pkg/tcpip/seqnum" 34 "github.com/metacubex/gvisor/pkg/tcpip/stack" 35 "github.com/metacubex/gvisor/pkg/waiter" 36 ) 37 38 // EndpointState represents the state of a TCP endpoint. 39 type EndpointState tcpip.EndpointState 40 41 // Endpoint states. Note that are represented in a netstack-specific manner and 42 // may not be meaningful externally. Specifically, they need to be translated to 43 // Linux's representation for these states if presented to userspace. 
44 const ( 45 _ EndpointState = iota 46 // TCP protocol states in sync with the definitions in 47 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 48 StateEstablished 49 StateSynSent 50 StateSynRecv 51 StateFinWait1 52 StateFinWait2 53 StateTimeWait 54 StateClose 55 StateCloseWait 56 StateLastAck 57 StateListen 58 StateClosing 59 60 // Endpoint states internal to netstack. 61 StateInitial 62 StateBound 63 StateConnecting // Connect() called, but the initial SYN hasn't been sent. 64 StateError 65 ) 66 67 const ( 68 // rcvAdvWndScale is used to split the available socket buffer into 69 // application buffer and the window to be advertised to the peer. This is 70 // currently hard coded to split the available space equally. 71 rcvAdvWndScale = 1 72 73 // SegOverheadFactor is used to multiply the value provided by the 74 // user on a SetSockOpt for setting the socket send/receive buffer sizes. 75 SegOverheadFactor = 2 76 ) 77 78 type connDirectionState uint32 79 80 // Connection direction states used for directionState checks in endpoint struct 81 // to detect half-closed connection and deliver POLLRDHUP 82 const ( 83 connDirectionStateOpen connDirectionState = 0 84 connDirectionStateRcvClosed connDirectionState = 1 85 connDirectionStateSndClosed connDirectionState = 2 86 connDirectionStateAll connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed 87 ) 88 89 // connected returns true when s is one of the states representing an 90 // endpoint connected to a peer. 91 func (s EndpointState) connected() bool { 92 switch s { 93 case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: 94 return true 95 default: 96 return false 97 } 98 } 99 100 // connecting returns true when s is one of the states representing a 101 // connection in progress, but not yet fully established. 
102 func (s EndpointState) connecting() bool { 103 switch s { 104 case StateConnecting, StateSynSent, StateSynRecv: 105 return true 106 default: 107 return false 108 } 109 } 110 111 // internal returns true when the state is netstack internal. 112 func (s EndpointState) internal() bool { 113 switch s { 114 case StateInitial, StateBound, StateConnecting, StateError: 115 return true 116 default: 117 return false 118 } 119 } 120 121 // handshake returns true when s is one of the states representing an endpoint 122 // in the middle of a TCP handshake. 123 func (s EndpointState) handshake() bool { 124 switch s { 125 case StateSynSent, StateSynRecv: 126 return true 127 default: 128 return false 129 } 130 } 131 132 // closed returns true when s is one of the states an endpoint transitions to 133 // when closed or when it encounters an error. This is distinct from a newly 134 // initialized endpoint that was never connected. 135 func (s EndpointState) closed() bool { 136 switch s { 137 case StateClose, StateError: 138 return true 139 default: 140 return false 141 } 142 } 143 144 // String implements fmt.Stringer.String. 145 func (s EndpointState) String() string { 146 switch s { 147 case StateInitial: 148 return "INITIAL" 149 case StateBound: 150 return "BOUND" 151 case StateConnecting: 152 return "CONNECTING" 153 case StateError: 154 return "ERROR" 155 case StateEstablished: 156 return "ESTABLISHED" 157 case StateSynSent: 158 return "SYN-SENT" 159 case StateSynRecv: 160 return "SYN-RCVD" 161 case StateFinWait1: 162 return "FIN-WAIT1" 163 case StateFinWait2: 164 return "FIN-WAIT2" 165 case StateTimeWait: 166 return "TIME-WAIT" 167 case StateClose: 168 return "CLOSED" 169 case StateCloseWait: 170 return "CLOSE-WAIT" 171 case StateLastAck: 172 return "LAST-ACK" 173 case StateListen: 174 return "LISTEN" 175 case StateClosing: 176 return "CLOSING" 177 default: 178 panic("unreachable") 179 } 180 } 181 182 // SACKInfo holds TCP SACK related information for a given endpoint. 
//
// +stateify savable
type SACKInfo struct {
	// Blocks is the maximum number of SACK blocks we track
	// per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}

// ReceiveErrors collects segment receive errors within the transport layer.
//
// +stateify savable
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}

// SendErrors collects segment send errors within the transport layer.
//
// +stateify savable
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments failed to be sent
	// to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}

// Stats holds statistics about the endpoint.
//
// +stateify savable
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}

// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}

// sndQueueInfo implements a send queue.
//
// +stateify savable
type sndQueueInfo struct {
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}

// CloneState clones sq's send-buffer bookkeeping into other. It is not thread
// safe: no lock is taken, and AutoTuneSndBufDisabled is read with RacyLoad,
// so concurrent mutation of sq may yield a torn snapshot.
func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) {
	other.SndBufSize = sq.SndBufSize
	other.SndBufUsed = sq.SndBufUsed
	other.SndClosed = sq.SndClosed
	other.PacketTooBigCount = sq.PacketTooBigCount
	other.SndMTU = sq.SndMTU
	other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad())
}

// Endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint must be held for all operations except
// in e.Readiness where acquiring it will result in a deadlock in epoll
// implementation.
//
// The following mutexes can be acquired independent of e.mu but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> Protects e.acceptQueue.
// e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue
// itself.
// e.sndQueueMu -> Protects the e.sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a short while.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type Endpoint struct {
	stack.TCPEndpointStateInner
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue endpoints for processing to
	// a given tcp processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	endpointEntry `state:"nosave"`

	// pendingProcessingMu protects pendingProcessing.
	pendingProcessingMu sync.Mutex `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	// +checklocks:pendingProcessingMu
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	protocol    *protocol     `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is stateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	rcvQueueMu sync.Mutex `state:"nosave"`

	// +checklocks:rcvQueueMu
	stack.TCPRcvBufState

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	rcvMemUsed atomicbitops.Int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	mu          sync.CrossGoroutineMutex `state:"nosave"`
	ownedByUser atomicbitops.Uint32

	// rcvQueue is the queue for ready-for-delivery segments.
	//
	// +checklocks:mu
	rcvQueue segmentList `state:"wait"`

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state atomicbitops.Uint32 `state:".(EndpointState)"`

	// connectionDirectionState holds current state of send and receive,
	// accessed atomically
	connectionDirectionState atomicbitops.Uint32

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to it's correct
	// state.
	origEndpointState uint32 `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ipv4TTL           uint8
	ipv6HopLimit      int16
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	// +checklocks:mu
	h *handshake

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (which ever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// recentTSTime is the unix time when we last updated
	// TCPEndpointStateInner.RecentTS.
	recentTSTime tcpip.MonotonicTime

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss recovery algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// sndQueueInfo contains the implementation of the endpoint's send queue.
	sndQueueInfo sndQueueInfo

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibly aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// acceptMu protects acceptQueue
	acceptMu sync.Mutex `state:"nosave"`

	// acceptQueue is used by a listening endpoint to send newly accepted
	// connections to the endpoint so that they can be read by Accept()
	// calls.
	//
	// +checklocks:acceptMu
	acceptQueue acceptQueue

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso stack.GSO

	stats Stats

	// tcpLingerTimeout is the maximum amount of time a socket
	// stays in TIME_WAIT state before being marked closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called closed on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime tcpip.MonotonicTime

	// finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer
	// is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close()
	// has been called on the socket. This timer is not started for sockets that
	// are waiting for a peer FIN but are not closed.
	finWait2Timer tcpip.Timer `state:"nosave"`

	// timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state
	// for tcp.DefaultTCPTimeWaitTimeout seconds.
	timeWaitTimer tcpip.Timer `state:"nosave"`

	// listenCtx is used by listening endpoints to store state used while listening for
	// connections. Nil otherwise.
	listenCtx *listenContext `state:"nosave"`
}

// UniqueID implements stack.TransportEndpoint.UniqueID.
func (e *Endpoint) UniqueID() uint64 {
	return e.uniqueID
}

// calculateAdvertisedMSS calculates the MSS to advertise.
//
// If userMSS is non-zero and is not greater than the maximum possible MSS for
// r, it will be used; otherwise, the maximum possible MSS will be used.
func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
	// The maximum possible MSS is dependent on the route.
	// TODO(b/143359391): Respect TCP Min and Max size.
	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)

	if userMSS != 0 && userMSS < maxMSS {
		return userMSS
	}

	return maxMSS
}

// isOwnedByUser() returns true if the endpoint lock is currently
// held by a user(syscall) goroutine.
func (e *Endpoint) isOwnedByUser() bool {
	return e.ownedByUser.Load() == 1
}

// LockUser tries to lock e.mu and if it fails it will check if the lock is held
// by another syscall goroutine. If yes, then it will go to sleep waiting for the
// lock to be released, if not then it will spin till it acquires the lock or
// another syscall goroutine acquires it in which case it will go to sleep as
// described above.
//
// The assumption behind spinning here being that background packet processing
// should not be holding the lock for long and spinning reduces latency as we
// avoid an expensive sleep/wakeup of the syscall goroutine).
// +checklocksacquire:e.mu
func (e *Endpoint) LockUser() {
	const iterations = 5
	// First spin phase: retry the TryLock without yielding the processor,
	// on the expectation that a background processor holding the lock will
	// release it very soon.
	for i := 0; i < iterations; i++ {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if e.ownedByUser.Load() == 1 {
				e.mu.Lock()
				e.ownedByUser.Store(1)
				return
			}
			// Spin but don't yield the processor since the lower half
			// should yield the lock soon.
			continue
		}
		e.ownedByUser.Store(1)
		return
	}

	// Second spin phase: same as above but with runtime.Gosched() between
	// attempts, giving other goroutines a chance to run.
	for i := 0; i < iterations; i++ {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if e.ownedByUser.Load() == 1 {
				e.mu.Lock()
				e.ownedByUser.Store(1)
				return
			}
			// Spin but yield the processor since the lower half
			// should yield the lock soon.
			runtime.Gosched()
			continue
		}
		e.ownedByUser.Store(1)
		return
	}

	// Finally just give up and wait for the Lock.
	e.mu.Lock()
	e.ownedByUser.Store(1)
}

// UnlockUser will check if there are any segments already queued for processing
// and wake up a processor goroutine to process them before unlocking e.mu.
// This is required because when packets arrive and the endpoint lock is already
// held then such packets are queued up to be processed.
//
// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
// +checklocksrelease:e.mu
func (e *Endpoint) UnlockUser() {
	// Lock segment queue before checking so that we avoid a race where
	// segments can be queued between the time we check if queue is empty
	// and actually unlock the endpoint mutex.
	e.segmentQueue.mu.Lock()
	if e.segmentQueue.emptyLocked() {
		if e.ownedByUser.Swap(0) != 1 {
			panic("e.UnlockUser() called without calling e.LockUser()")
		}
		e.mu.Unlock()
		e.segmentQueue.mu.Unlock()
		return
	}
	e.segmentQueue.mu.Unlock()

	// Since we are waking the processor goroutine here just unlock
	// and let it process the queued segments.
	if e.ownedByUser.Swap(0) != 1 {
		panic("e.UnlockUser() called without calling e.LockUser()")
	}
	processor := e.protocol.dispatcher.selectProcessor(e.ID)
	e.mu.Unlock()

	// Wake up the processor for this endpoint to process any queued
	// segments after releasing the lock to avoid the case where if the
	// processor goroutine starts running before we release the lock here
	// then it will fail to process as TryLock() will fail.
	processor.queueEndpoint(e)
	return
}

// StopWork halts packet processing. Only to be used in tests.
// +checklocksacquire:e.mu
func (e *Endpoint) StopWork() {
	e.mu.Lock()
}

// ResumeWork resumes packet processing. Only to be used in tests.
// +checklocksrelease:e.mu
func (e *Endpoint) ResumeWork() {
	e.mu.Unlock()
}

// AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is
// used in places where we know that e.mu is held, but checklocks does not,
// which can happen when creating new locked objects. You must pass the known
// locked endpoint to this function and it must be the same as the caller
// endpoint.
// TODO(b/226403629): Remove this function once checklocks understands local
// variable locks.
// +checklocks:locked.mu
// +checklocksacquire:e.mu
func (e *Endpoint) AssertLockHeld(locked *Endpoint) {
	if e != locked {
		panic("AssertLockHeld failed: locked endpoint != asserting endpoint")
	}
}

// TryLock is a helper that calls TryLock on the endpoint's mutex and
// adds the necessary checklocks annotations.
// TODO(b/226403629): Remove this once checklocks understands TryLock.
// +checklocksacquire:e.mu
func (e *Endpoint) TryLock() bool {
	if e.mu.TryLock() {
		return true // +checklocksforce
	}
	return false // +checklocksignore
}

// setEndpointState updates the state of the endpoint to state atomically.
// This method is unexported as the only place we should update the state is in
// this package but we allow the state to be read freely without holding e.mu.
//
// +checklocks:e.mu
func (e *Endpoint) setEndpointState(state EndpointState) {
	oldstate := EndpointState(e.state.Swap(uint32(state)))
	switch state {
	case StateEstablished:
		e.stack.Stats().TCP.CurrentEstablished.Increment()
		e.stack.Stats().TCP.CurrentConnected.Increment()
	case StateError:
		// StateError is accounted exactly like StateClose below.
		fallthrough
	case StateClose:
		if oldstate == StateCloseWait || oldstate == StateEstablished {
			e.stack.Stats().TCP.EstablishedResets.Increment()
		}
		if oldstate.connected() {
			e.stack.Stats().TCP.CurrentConnected.Decrement()
		}
		// Fall through so that leaving StateEstablished is also
		// accounted for by the default case below.
		fallthrough
	default:
		if oldstate == StateEstablished {
			e.stack.Stats().TCP.CurrentEstablished.Decrement()
		}
	}
}

// EndpointState returns the current state of the endpoint.
func (e *Endpoint) EndpointState() EndpointState {
	return EndpointState(e.state.Load())
}

// setRecentTimestamp sets the recentTS field to the provided value.
func (e *Endpoint) setRecentTimestamp(recentTS uint32) {
	e.RecentTS = recentTS
	e.recentTSTime = e.stack.Clock().NowMonotonic()
}

// recentTimestamp returns the value of the recentTS field.
func (e *Endpoint) recentTimestamp() uint32 {
	return e.RecentTS
}

// calculateTTL returns the TTL (IPv4) or hop limit (IPv6) to use for the
// route's network protocol, falling back to the route's default when the
// caller passed the sentinel "use default" value.
//
// TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a
// network.Endpoint, which also defines this function.
func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 {
	switch netProto := route.NetProto(); netProto {
	case header.IPv4ProtocolNumber:
		if ipv4TTL == tcpip.UseDefaultIPv4TTL {
			return route.DefaultTTL()
		}
		return ipv4TTL
	case header.IPv6ProtocolNumber:
		if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit {
			return route.DefaultTTL()
		}
		return uint8(ipv6HopLimit)
	default:
		panic(fmt.Sprintf("invalid protocol number = %d", netProto))
	}
}

// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	// idle is how long the connection may stay idle before the first
	// keepalive probe is sent.
	idle time.Duration
	// interval is the gap between successive keepalive probes.
	interval time.Duration
	// count is the number of unanswered probes allowed before the
	// connection is closed.
	count int
	// unacked is the number of probes sent without a response so far.
	unacked int
	// timer drives probe transmission; it should never be a zero timer if
	// the endpoint is not closed.
	timer timer       `state:"nosave"`
	waker sleep.Waker `state:"nosave"`
}

// newEndpoint creates and initializes a TCP endpoint for the given stack and
// network protocol, applying stack-wide TCP protocol options (buffer sizes,
// congestion control, linger timeout, SYN retries, ...) as defaults.
func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *Endpoint {
	e := &Endpoint{
		stack:    s,
		protocol: protocol,
		TransportEndpointInfo: stack.TransportEndpointInfo{
			NetProto:   netProto,
			TransProto: header.TCPProtocolNumber,
		},
		sndQueueInfo: sndQueueInfo{
			TCPSndBufState: stack.TCPSndBufState{
				SndMTU: math.MaxInt32,
			},
		},
		waiterQueue: waiterQueue,
		state:       atomicbitops.FromUint32(uint32(StateInitial)),
		keepalive: keepalive{
			idle:     DefaultKeepaliveIdle,
			interval: DefaultKeepaliveInterval,
			count:    DefaultKeepaliveCount,
		},
		uniqueID:     s.UniqueID(),
		ipv4TTL:      tcpip.UseDefaultIPv4TTL,
		ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit,
		// txHash only determines which outgoing queue to use, so
		// InsecureRNG is fine.
		txHash:        s.InsecureRNG().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
	}
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)

	// Stack-wide protocol options override the compile-time defaults when
	// they have been configured.
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		e.RcvAutoParams.Disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e

	// TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes
	// established.
	e.keepalive.timer.init(e.stack.Clock(), timerHandler(e, e.keepaliveTimerExpired))

	return e
}

// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
func (e *Endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	result := waiter.EventMask(0)

	switch e.EndpointState() {
	case StateInitial, StateBound:
		// This prevents blocking of new sockets which are not
		// connected when SO_LINGER is set.
		result |= waiter.EventHUp

	case StateConnecting, StateSynSent, StateSynRecv:
		// Ready for nothing.

	case StateClose, StateError, StateTimeWait:
		// Ready for anything.
		result = mask

	case StateListen:
		// Check if there's anything in the accepted queue.
		if (mask & waiter.ReadableEvents) != 0 {
			e.acceptMu.Lock()
			if e.acceptQueue.endpoints.Len() != 0 {
				result |= waiter.ReadableEvents
			}
			e.acceptMu.Unlock()
		}
	}
	if e.EndpointState().connected() {
		// Determine if the endpoint is writable if requested.
		if (mask & waiter.WritableEvents) != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			sndBufSize := e.getSendBufferSize()
			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
				result |= waiter.WritableEvents
			}
			if e.sndQueueInfo.SndClosed {
				e.updateConnDirectionState(connDirectionStateSndClosed)
			}
			e.sndQueueInfo.sndQueueMu.Unlock()
		}

		// Determine if the endpoint is readable if requested.
		if (mask & waiter.ReadableEvents) != 0 {
			e.rcvQueueMu.Lock()
			if e.RcvBufUsed > 0 || e.RcvClosed {
				result |= waiter.ReadableEvents
			}
			if e.RcvClosed {
				e.updateConnDirectionState(connDirectionStateRcvClosed)
			}
			e.rcvQueueMu.Unlock()
		}
	}

	// Determine whether endpoint is half-closed with rcv shutdown
	if e.connDirectionState() == connDirectionStateRcvClosed {
		result |= waiter.EventRdHUp
	}

	return result
}

// purgePendingRcvQueue drops all out-of-order segments held for reassembly.
// Purging pending rcv segments is only necessary on RST.
func (e *Endpoint) purgePendingRcvQueue() {
	if e.rcv != nil {
		for e.rcv.pendingRcvdSegments.Len() > 0 {
			s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment)
			s.DecRef()
		}
	}
}

// purgeReadQueue drops all ready-for-delivery segments and resets the
// receive-buffer accounting.
//
// +checklocks:e.mu
func (e *Endpoint) purgeReadQueue() {
	if e.rcv != nil {
		e.rcvQueueMu.Lock()
		defer e.rcvQueueMu.Unlock()
		for {
			s := e.rcvQueue.Front()
			if s == nil {
				break
			}
			e.rcvQueue.Remove(s)
			s.DecRef()
		}
		e.RcvBufUsed = 0
	}
}

// purgeWriteQueue drops all unsent segments, resets the send-buffer
// accounting and marks the send side closed.
//
// +checklocks:e.mu
func (e *Endpoint) purgeWriteQueue() {
	if e.snd != nil {
		e.sndQueueInfo.sndQueueMu.Lock()
		defer e.sndQueueInfo.sndQueueMu.Unlock()
		e.snd.updateWriteNext(nil)
		for {
			s := e.snd.writeList.Front()
			if s == nil {
				break
			}
			e.snd.writeList.Remove(s)
			s.DecRef()
		}
		e.sndQueueInfo.SndBufUsed = 0
		e.sndQueueInfo.SndClosed = true
	}
}

// Abort implements stack.TransportEndpoint.Abort.
func (e *Endpoint) Abort() {
	defer e.drainClosingSegmentQueue()
	e.LockUser()
	defer e.UnlockUser()
	defer e.purgeReadQueue()
	// Reset all connected endpoints.
1038 switch state := e.EndpointState(); { 1039 case state.connected(): 1040 e.resetConnectionLocked(&tcpip.ErrAborted{}) 1041 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1042 return 1043 } 1044 e.closeLocked() 1045 } 1046 1047 // Close puts the endpoint in a closed state and frees all resources associated 1048 // with it. It must be called only once and with no other concurrent calls to 1049 // the endpoint. 1050 func (e *Endpoint) Close() { 1051 e.LockUser() 1052 if e.closed { 1053 e.UnlockUser() 1054 return 1055 } 1056 1057 // We always want to purge the read queue, but do so after the checks in 1058 // shutdownLocked. 1059 e.closeLocked() 1060 e.purgeReadQueue() 1061 if e.EndpointState() == StateClose || e.EndpointState() == StateError { 1062 // It should be safe to purge the read queue now as the endpoint 1063 // is now closed or in an error state and further reads are not 1064 // permitted. 1065 e.UnlockUser() 1066 e.drainClosingSegmentQueue() 1067 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1068 return 1069 } 1070 e.UnlockUser() 1071 } 1072 1073 // +checklocks:e.mu 1074 func (e *Endpoint) closeLocked() { 1075 linger := e.SocketOptions().GetLinger() 1076 if linger.Enabled && linger.Timeout == 0 { 1077 s := e.EndpointState() 1078 isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv 1079 if isResetState { 1080 // Close the endpoint without doing full shutdown and 1081 // send a RST. 1082 e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) 1083 return 1084 } 1085 } 1086 1087 // Issue a shutdown so that the peer knows we won't send any more data 1088 // if we're connected, or stop accepting if we're listening. 
1089 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1090 e.closeNoShutdownLocked() 1091 } 1092 1093 // closeNoShutdown closes the endpoint without doing a full shutdown. 1094 // +checklocks:e.mu 1095 func (e *Endpoint) closeNoShutdownLocked() { 1096 // For listening sockets, we always release ports inline so that they 1097 // are immediately available for reuse after Close() is called. If also 1098 // registered, we unregister as well otherwise the next user would fail 1099 // in Listen() when trying to register. 1100 if e.EndpointState() == StateListen && e.isPortReserved { 1101 if e.isRegistered { 1102 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1103 e.isRegistered = false 1104 } 1105 1106 portRes := ports.Reservation{ 1107 Networks: e.effectiveNetProtos, 1108 Transport: ProtocolNumber, 1109 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1110 Port: e.TransportEndpointInfo.ID.LocalPort, 1111 Flags: e.boundPortFlags, 1112 BindToDevice: e.boundBindToDevice, 1113 Dest: e.boundDest, 1114 } 1115 e.stack.ReleasePort(portRes) 1116 e.isPortReserved = false 1117 e.boundBindToDevice = 0 1118 e.boundPortFlags = ports.Flags{} 1119 e.boundDest = tcpip.FullAddress{} 1120 } 1121 1122 // Mark endpoint as closed. 1123 e.closed = true 1124 tcpip.AddDanglingEndpoint(e) 1125 1126 eventMask := waiter.ReadableEvents | waiter.WritableEvents 1127 1128 switch e.EndpointState() { 1129 case StateInitial, StateBound, StateListen: 1130 e.setEndpointState(StateClose) 1131 fallthrough 1132 case StateClose, StateError: 1133 eventMask |= waiter.EventHUp 1134 e.cleanupLocked() 1135 case StateConnecting, StateSynSent, StateSynRecv: 1136 // Abort the handshake and set the error. 1137 // Notify that the endpoint is closed. 1138 eventMask |= waiter.EventHUp 1139 e.handshakeFailed(&tcpip.ErrAborted{}) 1140 // Notify that the endpoint is closed. 
1141 eventMask |= waiter.EventHUp 1142 case StateFinWait2: 1143 // The socket has been closed and we are in FIN-WAIT-2 so start 1144 // the FIN-WAIT-2 timer. 1145 if e.finWait2Timer == nil { 1146 e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) 1147 } 1148 } 1149 1150 e.waiterQueue.Notify(eventMask) 1151 } 1152 1153 // closePendingAcceptableConnections closes all connections that have completed 1154 // handshake but not yet been delivered to the application. 1155 func (e *Endpoint) closePendingAcceptableConnectionsLocked() { 1156 e.acceptMu.Lock() 1157 1158 pendingEndpoints := e.acceptQueue.pendingEndpoints 1159 e.acceptQueue.pendingEndpoints = nil 1160 1161 completedEndpoints := make([]*Endpoint, 0, e.acceptQueue.endpoints.Len()) 1162 for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() { 1163 completedEndpoints = append(completedEndpoints, n.Value.(*Endpoint)) 1164 } 1165 e.acceptQueue.endpoints.Init() 1166 e.acceptQueue.capacity = 0 1167 e.acceptMu.Unlock() 1168 1169 // Close any endpoints in SYN-RCVD state. 1170 for n := range pendingEndpoints { 1171 n.Abort() 1172 } 1173 1174 // Reset all connections that are waiting to be accepted. 1175 for _, n := range completedEndpoints { 1176 n.Abort() 1177 } 1178 } 1179 1180 // cleanupLocked frees all resources associated with the endpoint. 1181 // +checklocks:e.mu 1182 func (e *Endpoint) cleanupLocked() { 1183 if e.snd != nil { 1184 e.snd.resendTimer.cleanup() 1185 e.snd.probeTimer.cleanup() 1186 e.snd.reorderTimer.cleanup() 1187 e.snd.corkTimer.cleanup() 1188 } 1189 1190 if e.finWait2Timer != nil { 1191 e.finWait2Timer.Stop() 1192 } 1193 1194 if e.timeWaitTimer != nil { 1195 e.timeWaitTimer.Stop() 1196 } 1197 1198 // Close all endpoints that might have been accepted by TCP but not by 1199 // the client. 
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.purgeWriteQueue()
	// Only purge the read queue here if the socket is fully closed by the
	// user.
	if e.closed {
		e.purgeReadQueue()
	}
	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}

// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}

// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
func (e *Endpoint) initialReceiveWindow() int {
	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
	// The window field in an unscaled SYN/SYN-ACK is at most 16 bits.
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	rcvWndScale := e.rcvWndScaleForHandshake()

	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
	// window offered in SYN won't be reduced due to the loss of precision if
	// window scaling is enabled after the handshake.
	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)

	// Ensure we can always accept at least 1 byte if the scale specified
	// was too high for the provided rcvWnd.
	if rcvWnd == 0 {
		rcvWnd = 1
	}

	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
func (e *Endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	sendNonZeroWindowUpdate := false

	e.rcvQueueMu.Lock()
	if e.RcvAutoParams.Disabled {
		e.rcvQueueMu.Unlock()
		return
	}
	now := e.stack.Clock().NowMonotonic()
	// Accumulate copied bytes until at least one RTT has elapsed since the
	// last measurement; tuning decisions are made once per RTT.
	if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt {
		e.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				sendNonZeroWindowUpdate = true
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	e.RcvAutoParams.MeasureTime = now
	e.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueMu.Unlock()

	// Send the update after unlocking rcvQueueMu as sending a segment acquires
	// the lock to calculate the window to be sent.
	if e.EndpointState().connected() && sendNonZeroWindowUpdate {
		e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
	}
}

// SetOwner implements tcpip.Endpoint.SetOwner.
1355 func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) { 1356 e.owner = owner 1357 } 1358 1359 // +checklocks:e.mu 1360 func (e *Endpoint) hardErrorLocked() tcpip.Error { 1361 err := e.hardError 1362 e.hardError = nil 1363 return err 1364 } 1365 1366 // +checklocks:e.mu 1367 func (e *Endpoint) lastErrorLocked() tcpip.Error { 1368 e.lastErrorMu.Lock() 1369 defer e.lastErrorMu.Unlock() 1370 err := e.lastError 1371 e.lastError = nil 1372 return err 1373 } 1374 1375 // LastError implements tcpip.Endpoint.LastError. 1376 func (e *Endpoint) LastError() tcpip.Error { 1377 e.LockUser() 1378 defer e.UnlockUser() 1379 if err := e.hardErrorLocked(); err != nil { 1380 return err 1381 } 1382 return e.lastErrorLocked() 1383 } 1384 1385 // LastErrorLocked reads and clears lastError. 1386 // Only to be used in tests. 1387 // +checklocks:e.mu 1388 func (e *Endpoint) LastErrorLocked() tcpip.Error { 1389 return e.lastErrorLocked() 1390 } 1391 1392 // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. 1393 func (e *Endpoint) UpdateLastError(err tcpip.Error) { 1394 e.LockUser() 1395 e.lastErrorMu.Lock() 1396 e.lastError = err 1397 e.lastErrorMu.Unlock() 1398 e.UnlockUser() 1399 } 1400 1401 // Read implements tcpip.Endpoint.Read. 1402 func (e *Endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { 1403 e.LockUser() 1404 defer e.UnlockUser() 1405 1406 if err := e.checkReadLocked(); err != nil { 1407 if _, ok := err.(*tcpip.ErrClosedForReceive); ok { 1408 e.stats.ReadErrors.ReadClosed.Increment() 1409 } 1410 return tcpip.ReadResult{}, err 1411 } 1412 1413 var err error 1414 done := 0 1415 // N.B. Here we get the first segment to be processed. It is safe to not 1416 // hold rcvQueueMu when processing, since we hold e.mu to ensure we only 1417 // remove segments from the list through Read() and that new segments 1418 // cannot be appended. 
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// Peeking leaves the queue untouched; just walk forward.
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Drop fully-consumed segments from the head of the queue.
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// checkRead checks that endpoint is in a readable state.
//
// +checklocks:e.mu
func (e *Endpoint) checkReadLocked() tcpip.Error {
	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()
	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	bufUsed := e.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			if err := e.hardErrorLocked(); err != nil {
				return err
			}
			return &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return &tcpip.ErrNotConnected{}
	}

	if e.RcvBufUsed == 0 {
		if e.RcvClosed || !e.EndpointState().connected() {
			return &tcpip.ErrClosedForReceive{}
		}
		return &tcpip.ErrWouldBlock{}
	}

	return nil
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReader(p, int64(avail)); err != nil {
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
1587 // +checklocks:e.mu 1588 func (e *Endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) { 1589 e.sndQueueInfo.sndQueueMu.Lock() 1590 defer e.sndQueueInfo.sndQueueMu.Unlock() 1591 1592 avail, err := e.isEndpointWritableLocked() 1593 if err != nil { 1594 e.stats.WriteErrors.WriteClosed.Increment() 1595 return nil, 0, err 1596 } 1597 1598 buf, err := e.readFromPayloader(p, opts, avail) 1599 if err != nil { 1600 return nil, 0, err 1601 } 1602 1603 // Do not queue zero length segments. 1604 if buf.Size() == 0 { 1605 return nil, 0, nil 1606 } 1607 1608 if !opts.Atomic { 1609 // Since we released locks in between it's possible that the 1610 // endpoint transitioned to a CLOSED/ERROR states so make 1611 // sure endpoint is still writable before trying to write. 1612 avail, err := e.isEndpointWritableLocked() 1613 if err != nil { 1614 e.stats.WriteErrors.WriteClosed.Increment() 1615 buf.Release() 1616 return nil, 0, err 1617 } 1618 1619 // A simultaneous call to write on the socket can reduce avail. Discard 1620 // excess data copied if this is the case. 1621 if int64(avail) < buf.Size() { 1622 buf.Truncate(int64(avail)) 1623 } 1624 } 1625 1626 // Add data to the send queue. 1627 size := int(buf.Size()) 1628 s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf) 1629 e.sndQueueInfo.SndBufUsed += size 1630 e.snd.writeList.PushBack(s) 1631 1632 return s, size, nil 1633 } 1634 1635 // Write writes data to the endpoint's peer. 1636 func (e *Endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { 1637 // Linux completely ignores any address passed to sendto(2) for TCP sockets 1638 // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More 1639 // and opts.EndOfRecord are also ignored. 1640 1641 e.LockUser() 1642 defer e.UnlockUser() 1643 1644 // Return if either we didn't queue anything or if an error occurred while 1645 // attempting to queue data. 
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *Endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out socket buffer space before we can fill the
	// previous window we advertised. In cases where we receive MSS sized or close
	// MSS sized segments we will probably run out of window space before we
	// exhaust receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
// +checklocks:e.mu
func (e *Endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large. We also want to ensure a
// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
// receive buffer size. This is chosen arbitrarily.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *Endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked(rcvBufSize))
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size that
	// is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}

	switch {
	case oldAvail < threshold && newAvail >= threshold:
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
func (e *Endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
func (e *Endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
func (e *Endpoint) OnKeepAliveSet(bool) {
	e.LockUser()
	e.resetKeepaliveTimer(true /* receivedData */)
	e.UnlockUser()
}

// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
func (e *Endpoint) OnDelayOptionSet(v bool) {
	if !v {
		e.LockUser()
		defer e.UnlockUser()
		// Handle delayed data.
		if e.EndpointState().connected() {
			e.sendData(nil /* next */)
		}
	}
}

// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
func (e *Endpoint) OnCorkOptionSet(v bool) {
	if !v {
		e.LockUser()
		defer e.UnlockUser()
		if e.snd != nil {
			e.snd.corkTimer.disable()
		}
		// Handle the corked data.
		if e.EndpointState().connected() {
			e.sendData(nil /* next */)
		}
	}
}

// getSendBufferSize returns the current send buffer size, in bytes.
func (e *Endpoint) getSendBufferSize() int {
	return int(e.ops.GetSendBufferSize())
}

// OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
func (e *Endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) {
	e.LockUser()

	sendNonZeroWindowUpdate := false
	e.rcvQueueMu.Lock()

	// Make sure the receive buffer size allows us to send a
	// non-zero window size.
	scale := uint8(0)
	if e.rcv != nil {
		scale = e.rcv.RcvWndScale
	}
	if rcvBufSz>>scale == 0 {
		rcvBufSz = 1 << scale
	}

	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
	// An explicit buffer size disables auto-tuning.
	e.RcvAutoParams.Disabled = true

	// Immediately send an ACK to uncork the sender silly window
	// syndrome prevention, when our available space grows above aMSS
	// or half receive buffer, whichever smaller.
	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
		sendNonZeroWindowUpdate = true
	}

	e.rcvQueueMu.Unlock()

	// The window update is deferred to postSet, which runs after e.mu has
	// been released, since sending a segment re-acquires endpoint locks.
	postSet = func() {
		e.LockUser()
		defer e.UnlockUser()
		if e.EndpointState().connected() && sendNonZeroWindowUpdate {
			e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
		}

	}
	e.UnlockUser()
	return rcvBufSz, postSet
}

// OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize.
func (e *Endpoint) OnSetSendBufferSize(sz int64) int64 {
	// An explicit buffer size disables send-buffer auto-tuning.
	e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1)
	return sz
}

// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters.
func (e *Endpoint) WakeupWriters() {
	e.LockUser()
	defer e.UnlockUser()

	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// SetSockOptInt sets a socket option.
func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
	const inetECNMask = 3

	switch opt {
	case tcpip.KeepaliveCountOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.count = v
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case tcpip.IPv4TOSOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.userMSS = uint16(userMSS)
		e.UnlockUser()

	case tcpip.MTUDiscoverOption:
		// Return not supported if attempting to set this option to
		// anything other than path MTU discovery disabled.
		if v != tcpip.PMTUDiscoveryDont {
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.IPv4TTLOption:
		e.LockUser()
		e.ipv4TTL = uint8(v)
		e.UnlockUser()

	case tcpip.IPv6HopLimitOption:
		e.LockUser()
		e.ipv6HopLimit = int16(v)
		e.UnlockUser()

	case tcpip.TCPSynCountOption:
		if v < 1 || v > 255 {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.maxSynRetries = uint8(v)
		e.UnlockUser()

	case tcpip.TCPWindowClampOption:
		if v == 0 {
			e.LockUser()
			// A zero clamp is only valid before the connection is
			// established.
			switch e.EndpointState() {
			case StateClose, StateInitial:
				e.windowClamp = 0
				e.UnlockUser()
				return nil
			default:
				e.UnlockUser()
				return &tcpip.ErrInvalidOptionValue{}
			}
		}
		// Clamp the value to at least half the minimum receive buffer
		// size, mirroring Linux behavior for TCP_WINDOW_CLAMP.
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if v < rs.Min/2 {
				v = rs.Min / 2
			}
		}
		e.LockUser()
		e.windowClamp = uint32(v)
		e.UnlockUser()
	}
	return nil
}

// HasNIC returns true if the NICID is defined in the stack or id is 0.
func (e *Endpoint) HasNIC(id int32) bool {
	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt sets a socket option.
func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.KeepaliveIntervalOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				switch state {
				case StateEstablished:
					// Reinitialize the sender's congestion
					// control only on an already-established
					// connection; otherwise the new algorithm
					// takes effect when the connection is set up.
					if e.EndpointState() == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to Stack's default TCP_LINGER2 timeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		// Cap the defer-accept timeout to the maximum retransmission
		// timeout, mirroring the SYN-retry bound.
		e.LockUser()
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
func (e *Endpoint) readyReceiveSize() (int, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The endpoint cannot be in listen state.
	if e.EndpointState() == StateListen {
		return 0, &tcpip.ErrInvalidEndpointState{}
	}

	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()

	return e.RcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		v := e.keepalive.count
		e.keepalive.Unlock()
		return v, nil

	case tcpip.IPv4TOSOption:
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6TrafficClassOption:
		// Note: the same sendTOS field backs both the v4 TOS and the
		// v6 traffic class options.
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.MaxSegOption:
		// Linux only returns user_mss value if user_mss is set and the socket is
		// unconnected. Otherwise Linux returns the actual current MSS. Netstack
		// mimics the user_mss behavior, but otherwise just returns the defaultMSS
		// for now.
		v := header.TCPDefaultMSS
		e.LockUser()
		if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) {
			v = int(e.userMSS)
		}
		e.UnlockUser()
		return v, nil

	case tcpip.MTUDiscoverOption:
		// Always return the path MTU discovery disabled setting since
		// it's the only one supported.
		return tcpip.PMTUDiscoveryDont, nil

	case tcpip.ReceiveQueueSizeOption:
		return e.readyReceiveSize()

	case tcpip.IPv4TTLOption:
		e.LockUser()
		v := int(e.ipv4TTL)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6HopLimitOption:
		e.LockUser()
		v := int(e.ipv6HopLimit)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPSynCountOption:
		e.LockUser()
		v := int(e.maxSynRetries)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPWindowClampOption:
		e.LockUser()
		v := int(e.windowClamp)
		e.UnlockUser()
		return v, nil

	case tcpip.MulticastTTLOption:
		return 1, nil

	default:
		return -1, &tcpip.ErrUnknownProtocolOption{}
	}
}

// getTCPInfo returns a snapshot of the endpoint's state in the form consumed
// by the TCP_INFO socket option. Internal netstack states are reported as
// StateClose since they have no external meaning.
func (e *Endpoint) getTCPInfo() tcpip.TCPInfoOption {
	info := tcpip.TCPInfoOption{}
	e.LockUser()
	if state := e.EndpointState(); state.internal() {
		info.State = tcpip.EndpointState(StateClose)
	} else {
		info.State = tcpip.EndpointState(state)
	}
	snd := e.snd
	if snd != nil {
		// We do not calculate RTT before sending the data packets. If
		// the connection did not send and receive data, then RTT will
		// be zero.
		snd.rtt.Lock()
		info.RTT = snd.rtt.TCPRTTState.SRTT
		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
		snd.rtt.Unlock()

		info.RTO = snd.RTO
		info.CcState = snd.state
		info.SndSsthresh = uint32(snd.Ssthresh)
		info.SndCwnd = uint32(snd.SndCwnd)
		info.ReorderSeen = snd.rc.Reord
	}
	e.UnlockUser()
	return info
}

// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
2148 func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 2149 switch o := opt.(type) { 2150 case *tcpip.TCPInfoOption: 2151 *o = e.getTCPInfo() 2152 2153 case *tcpip.KeepaliveIdleOption: 2154 e.keepalive.Lock() 2155 *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) 2156 e.keepalive.Unlock() 2157 2158 case *tcpip.KeepaliveIntervalOption: 2159 e.keepalive.Lock() 2160 *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) 2161 e.keepalive.Unlock() 2162 2163 case *tcpip.TCPUserTimeoutOption: 2164 e.LockUser() 2165 *o = tcpip.TCPUserTimeoutOption(e.userTimeout) 2166 e.UnlockUser() 2167 2168 case *tcpip.CongestionControlOption: 2169 e.LockUser() 2170 *o = e.cc 2171 e.UnlockUser() 2172 2173 case *tcpip.TCPLingerTimeoutOption: 2174 e.LockUser() 2175 *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) 2176 e.UnlockUser() 2177 2178 case *tcpip.TCPDeferAcceptOption: 2179 e.LockUser() 2180 *o = tcpip.TCPDeferAcceptOption(e.deferAccept) 2181 e.UnlockUser() 2182 2183 case *tcpip.OriginalDestinationOption: 2184 e.LockUser() 2185 ipt := e.stack.IPTables() 2186 addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber) 2187 e.UnlockUser() 2188 if err != nil { 2189 return err 2190 } 2191 *o = tcpip.OriginalDestinationOption{ 2192 Addr: addr, 2193 Port: port, 2194 } 2195 2196 default: 2197 return &tcpip.ErrUnknownProtocolOption{} 2198 } 2199 return nil 2200 } 2201 2202 // checkV4MappedLocked determines the effective network protocol and converts 2203 // addr to its canonical form. 2204 // +checklocks:e.mu 2205 func (e *Endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { 2206 unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) 2207 if err != nil { 2208 return tcpip.FullAddress{}, 0, err 2209 } 2210 return unwrapped, netProto, nil 2211 } 2212 2213 // Disconnect implements tcpip.Endpoint.Disconnect. 
func (*Endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	err := e.connect(addr, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// registerEndpoint registers the endpoint with the provided address.
//
// +checklocks:e.mu
func (e *Endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error {
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// TIME-WAIT reuse may be enabled globally or restricted to
		// loopback-to-loopback connections only.
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		if _, err := e.stack.PickEphemeralPort(e.stack.SecureRNG(), func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID)
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*Endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.transitionToStateCloseLocked()
				tcpEP.drainClosingSegmentQueue()
				tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed; release the reservation made above
				// before deciding whether to retry with another port.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}
	return nil
}

// connect connects the endpoint to its peer.
// +checklocks:e.mu
func (e *Endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error {
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrHostUnreachable{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	// Move to Connecting before registering so the demuxer never sees a
	// registered endpoint still in Bound; roll back on failure.
	oldState := e.EndpointState()
	e.setEndpointState(StateConnecting)
	if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil {
		e.setEndpointState(oldState)
		if _, ok := err.(*tcpip.ErrPortInUse); ok {
			return &tcpip.ErrBadLocalAddress{}
		}
		return err
	}

	e.isRegistered = true
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto}
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.ep.AssertLockHeld(e)
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
		return &tcpip.ErrConnectStarted{}
	}

	// Start a new handshake.
	h := e.newHandshake()
	e.setEndpointState(StateSynSent)
	h.start()
	e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()

	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*Endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *Endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState().connecting() {
		// When calling shutdown(2) on a connecting socket, the endpoint must
		// enter the error state. But this logic cannot belong to the shutdownLocked
		// method because that method is called during a close(2) (and closing a
		// connecting socket is not an error).
		e.handshakeFailed(&tcpip.ErrConnectionReset{})
		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		return nil
	}

	return e.shutdownLocked(flags)
}

// shutdownLocked performs the shutdown for a connected or listening endpoint;
// flags accumulate across calls via e.shutdownFlags.
// +checklocks:e.mu
func (e *Endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that maybe waiting for the stream to become
			// readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, write end won't close and
				// we end up with a half-closed connection
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that maybe waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *Endpoint) Listen(backlog int) tcpip.Error {
	if err := e.listen(backlog); err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
		return err
	}
	return nil
}

// listen transitions the endpoint to the listen state, binding it first if
// necessary, and registers it with the demuxer. Calling listen on an
// already-listening endpoint only adjusts the backlog.
func (e *Endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
		}

		// Re-listening clears any prior shutdown state.
		e.shutdownFlags = 0
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
	}
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// addr if not-nil will contain the peer address of the returned endpoint.
func (e *Endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueMu.Lock()
	rcvClosed := e.RcvClosed
	e.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *Endpoint
	e.acceptMu.Lock()
	if element := e.acceptQueue.endpoints.Front(); element != nil {
		n = e.acceptQueue.endpoints.Remove(element).(*Endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
func (e *Endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

// bindLocked reserves a local port (and optionally address) for the endpoint
// and moves it to the bound state.
// +checklocks:e.mu
func (e *Endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *Endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.LocalAddress,
		Port: e.TransportEndpointInfo.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

// getRemoteAddress returns the peer address from the endpoint ID without any
// state checks or locking; callers are responsible for both.
func (e *Endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

// HandlePacket implements stack.TransportEndpoint.HandlePacket.
func (*Endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher which then can either deliver using the
	// worker go routine or directly do the invoke the tcp processing inline
	// based on the state of the endpoint.
}

// enqueueSegment queues s for processing, returning false (and incrementing
// drop statistics) if the segment queue is full.
func (e *Endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

// onICMPError records err as the endpoint's last error, optionally queues a
// socket error for SO_ERROR-style delivery, and aborts an in-progress
// connection attempt.
func (e *Endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	var recvErr bool
	switch pkt.NetworkProtocolNumber {
	case header.IPv4ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv4RecvError()
	case header.IPv6ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv6RecvError()
	default:
		panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber))
	}

	if recvErr {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	if e.EndpointState().connecting() {
		// Abort the in-flight handshake: detach from any listener's
		// pending set, record the hard error and move to the error state.
		e.mu.Lock()
		if lEP := e.h.listenEP; lEP != nil {
			// Remove from listening endpoints pending list.
			lEP.acceptMu.Lock()
			delete(lEP.acceptQueue.pendingEndpoints, e)
			lEP.acceptMu.Unlock()
			lEP.stats.FailedConnectionAttempts.Increment()
		}
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.cleanupLocked()
		e.hardError = err
		e.setEndpointState(StateError)
		e.mu.Unlock()
		e.drainClosingSegmentQueue()
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
	}
}

// HandleError implements stack.TransportEndpoint.
2907 func (e *Endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) { 2908 handlePacketTooBig := func(mtu uint32) { 2909 e.sndQueueInfo.sndQueueMu.Lock() 2910 update := false 2911 if v := int(mtu); v < e.sndQueueInfo.SndMTU { 2912 e.sndQueueInfo.SndMTU = v 2913 update = true 2914 } 2915 newMTU := e.sndQueueInfo.SndMTU 2916 e.sndQueueInfo.sndQueueMu.Unlock() 2917 if update { 2918 e.mu.Lock() 2919 defer e.mu.Unlock() 2920 if e.snd != nil { 2921 e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu 2922 } 2923 } 2924 } 2925 2926 // TODO(gvisor.dev/issues/5270): Handle all transport errors. 2927 switch transErr.Kind() { 2928 case stack.PacketTooBigTransportError: 2929 handlePacketTooBig(transErr.Info()) 2930 case stack.DestinationHostUnreachableTransportError: 2931 e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt) 2932 case stack.DestinationNetworkUnreachableTransportError: 2933 e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) 2934 case stack.DestinationPortUnreachableTransportError: 2935 e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) 2936 case stack.DestinationProtoUnreachableTransportError: 2937 e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt) 2938 case stack.SourceRouteFailedTransportError: 2939 e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt) 2940 case stack.SourceHostIsolatedTransportError: 2941 e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt) 2942 case stack.DestinationHostDownTransportError: 2943 e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt) 2944 } 2945 } 2946 2947 // updateSndBufferUsage is called by the protocol goroutine when room opens up 2948 // in the send buffer. The number of newly available bytes is v. 
func (e *Endpoint) updateSndBufferUsage(v int) {
	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	// A "full buffer" event occurred if at least half of the send buffer
	// was in use before these v bytes were released.
	notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1
	e.sndQueueInfo.SndBufUsed -= v

	// Get the new send buffer size with auto tuning, but do not set it
	// unless we decide to notify the writers.
	newSndBufSz := e.computeTCPSendBufferSize()

	// We only notify when there is half the sendBufferSize available after
	// a full buffer event occurs. This ensures that we don't wake up
	// writers to queue just 1-2 segments and go back to sleep.
	notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		// Set the new send buffer size calculated from auto tuning.
		e.ops.SetSendBufferSize(newSndBufSz, false /* notify */)
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
//
// +checklocks:e.mu
func (e *Endpoint) readyToRead(s *segment) {
	e.rcvQueueMu.Lock()
	if s != nil {
		// Account for the payload and take a reference before queuing
		// the segment for the reader.
		e.RcvBufUsed += s.payloadSize()
		s.IncRef()
		e.rcvQueue.PushBack(s)
	} else {
		e.RcvClosed = true
	}
	e.rcvQueueMu.Unlock()
	e.waiterQueue.Notify(waiter.ReadableEvents)
}

// receiveBufferAvailableLocked calculates how many bytes are still available
// in the receive buffer.
// +checklocks:e.rcvQueueMu
func (e *Endpoint) receiveBufferAvailableLocked(rcvBufSize int) int {
	// We may use more bytes than the buffer size when the receive buffer
	// shrinks.
	memUsed := e.receiveMemUsed()
	if memUsed >= rcvBufSize {
		return 0
	}

	return rcvBufSize - memUsed
}

// receiveBufferAvailable calculates how many bytes are still available in the
// receive buffer based on the actual memory used by all segments held in
// receive buffer/pending and segment queue.
func (e *Endpoint) receiveBufferAvailable() int {
	e.rcvQueueMu.Lock()
	available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return available
}

// receiveBufferUsed returns the amount of in-use receive buffer.
func (e *Endpoint) receiveBufferUsed() int {
	e.rcvQueueMu.Lock()
	used := e.RcvBufUsed
	e.rcvQueueMu.Unlock()
	return used
}

// receiveMemUsed returns the total memory in use by segments held by this
// endpoint.
func (e *Endpoint) receiveMemUsed() int {
	return int(e.rcvMemUsed.Load())
}

// updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
func (e *Endpoint) updateReceiveMemUsed(delta int) {
	e.rcvMemUsed.Add(int32(delta))
}

// maxReceiveBufferSize returns the stack wide maximum receive buffer size for
// an endpoint.
func (e *Endpoint) maxReceiveBufferSize() int {
	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
		// As a fallback return the hardcoded max buffer size.
		return MaxBufferSize
	}
	return rs.Max
}

// connDirectionState returns the close state of send and receive part of the endpoint
func (e *Endpoint) connDirectionState() connDirectionState {
	return connDirectionState(e.connectionDirectionState.Load())
}

// updateConnDirectionState updates the close state of send and receive part of the endpoint
func (e *Endpoint) updateConnDirectionState(state connDirectionState) connDirectionState {
	// NOTE(review): the Load (inside connDirectionState()) and the Swap are
	// two separate atomic operations, not a single atomic read-modify-write;
	// a concurrent caller could have its bits overwritten. Confirm callers
	// serialize these updates (e.g. under e.mu).
	return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state)))
}

// rcvWndScaleForHandshake computes the receive window scale to offer to the
// peer when window scaling is enabled (true by default). If auto-tuning is
// disabled then the window scaling factor is based on the size of the
// receiveBuffer otherwise we use the max permissible receive buffer size to
// compute the scale.
func (e *Endpoint) rcvWndScaleForHandshake() int {
	bufSizeForScale := e.ops.GetReceiveBufferSize()

	e.rcvQueueMu.Lock()
	autoTuningDisabled := e.RcvAutoParams.Disabled
	e.rcvQueueMu.Unlock()
	if autoTuningDisabled {
		return FindWndScale(seqnum.Size(bufSizeForScale))
	}

	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
}

// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *Endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
	// Only advance the recent timestamp when timestamps are in use, tsVal
	// moves forward, and the segment does not start beyond the last ACK we
	// sent (per RFC 7323 recency checks).
	if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
		e.setRecentTimestamp(tsVal)
	}
}

// maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
// the SYN options indicate that timestamp option was negotiated.
It also
// initializes the recentTS with the value provided in synOpts.TSval.
func (e *Endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) {
	if synOpts.TS {
		e.SendTSOk = true
		e.setRecentTimestamp(synOpts.TSVal)
	}
}

// tsVal returns the timestamp value (TSVal) to place in outgoing segments for
// the given instant, offset by the endpoint's TSOffset.
func (e *Endpoint) tsVal(now tcpip.MonotonicTime) uint32 {
	return e.TSOffset.TSVal(now)
}

// tsValNow returns the timestamp value for the current monotonic time.
func (e *Endpoint) tsValNow() uint32 {
	return e.tsVal(e.stack.Clock().NowMonotonic())
}

// elapsed returns the time elapsed between now and the instant encoded in the
// echoed timestamp tsEcr.
func (e *Endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration {
	return e.TSOffset.Elapsed(now, tsEcr)
}

// maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
// if the SYN options indicate that the SACK option was negotiated and the TCP
// stack is configured to enable TCP SACK option.
func (e *Endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) {
	var v tcpip.TCPSACKEnabled
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
		// Stack doesn't support SACK. So just return.
		return
	}
	if bool(v) && synOpts.SACKPermitted {
		e.SACKPermitted = true
		// Best-effort read of the configured recovery option; the error is
		// deliberately ignored since SACK has already been enabled above.
		e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)
	}
}

// maxOptionSize return the maximum size of TCP options.
func (e *Endpoint) maxOptionSize() (size int) {
	// Build options with the worst case (maximum number of SACK blocks)
	// and measure the encoded length.
	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
	options := e.makeOptions(maxSackBlocks[:])
	size = len(options)
	putOptions(options)

	return size
}

// completeStateLocked makes a full copy of the endpoint state into s. This is
// used before invoking the probe.
//
// +checklocks:e.mu
func (e *Endpoint) completeStateLocked(s *stack.TCPEndpointState) {
	s.TCPEndpointStateInner = e.TCPEndpointStateInner
	s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID)
	s.SegTime = e.stack.Clock().NowMonotonic()
	s.Receiver = e.rcv.TCPReceiverState
	s.Sender = e.snd.TCPSenderState

	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	e.sndQueueInfo.CloneState(&s.SndBufState)
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueMu.Lock()
	s.RcvBufState = e.TCPRcvBufState
	e.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// RTT state is guarded by its own lock.
	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	s.Sender.RetransmitTS = e.snd.retransmitTS
	s.Sender.SpuriousRecovery = e.snd.spuriousRecovery
}

// initHostGSO configures the endpoint's GSO state for host (hardware) GSO
// based on the route's network protocol.
func (e *Endpoint) initHostGSO() {
	switch e.route.NetProto() {
	case header.IPv4ProtocolNumber:
		e.gso.Type = stack.GSOTCPv4
		e.gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		e.gso.Type = stack.GSOTCPv6
		e.gso.L3HdrLen = header.IPv6MinimumSize
	default:
		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
	}
	e.gso.NeedsCsum = true
	e.gso.CsumOffset = header.TCPChecksumOffset
	e.gso.MaxSize = e.route.GSOMaxSize()
}

// initGSO selects host GSO when the route supports it and otherwise falls
// back to gVisor GSO if available; with neither capability GSO stays unset.
func (e *Endpoint) initGSO() {
	if e.route.HasHostGSOCapability() {
		e.initHostGSO()
	} else if e.route.HasGvisorGSOCapability() {
		e.gso = stack.GSO{
			MaxSize:   e.route.GSOMaxSize(),
			Type:      stack.GSOGvisor,
			NeedsCsum: false,
		}
	}
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *Endpoint) State() uint32 {
	return uint32(e.EndpointState())
}

// Info returns a copy of the endpoint info.
func (e *Endpoint) Info() tcpip.EndpointInfo {
	e.LockUser()
	// Make a copy of the endpoint info.
	ret := e.TransportEndpointInfo
	e.UnlockUser()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *Endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}

// Wait implements stack.TransportEndpoint.Wait.
func (e *Endpoint) Wait() {
	// Register for the hang-up event before checking state so that a state
	// transition between the check and the wait is not missed.
	waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp)
	e.waiterQueue.EventRegister(&waitEntry)
	defer e.waiterQueue.EventUnregister(&waitEntry)
	switch e.EndpointState() {
	case StateClose, StateError:
		return
	}
	<-notifyCh
}

// SocketOptions implements tcpip.Endpoint.SocketOptions.
func (e *Endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}

// GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
func GetTCPSendBufferLimits(sh tcpip.StackHandler) tcpip.SendBufferSizeOption {
	// This type assertion is safe because only the TCP stack calls this
	// function.
	ss := sh.(*stack.Stack).TCPSendBufferLimits()
	return tcpip.SendBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
3247 func (e *Endpoint) allowOutOfWindowAck() bool { 3248 now := e.stack.Clock().NowMonotonic() 3249 3250 if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) { 3251 var limit stack.TCPInvalidRateLimitOption 3252 if err := e.stack.Option(&limit); err != nil { 3253 panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) 3254 } 3255 if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { 3256 return false 3257 } 3258 } 3259 3260 e.lastOutOfWindowAckTime = now 3261 return true 3262 } 3263 3264 // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP. 3265 func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { 3266 var ss tcpip.TCPReceiveBufferSizeRangeOption 3267 if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { 3268 panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) 3269 } 3270 3271 return tcpip.ReceiveBufferSizeOption{ 3272 Min: ss.Min, 3273 Default: ss.Default, 3274 Max: ss.Max, 3275 } 3276 } 3277 3278 // computeTCPSendBufferSize implements auto tuning of send buffer size and 3279 // returns the new send buffer size. 3280 func (e *Endpoint) computeTCPSendBufferSize() int64 { 3281 curSndBufSz := int64(e.getSendBufferSize()) 3282 3283 // Auto tuning is disabled when the user explicitly sets the send 3284 // buffer size with SO_SNDBUF option. 3285 if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 { 3286 return curSndBufSz 3287 } 3288 3289 const packetOverheadFactor = 2 3290 curMSS := e.snd.MaxPayloadSize 3291 numSeg := InitialCwnd 3292 if numSeg < e.snd.SndCwnd { 3293 numSeg = e.snd.SndCwnd 3294 } 3295 3296 // SndCwnd indicates the number of segments that can be sent. This means 3297 // that the sender can send upto #SndCwnd segments and the send buffer 3298 // size should be set to SndCwnd*MSS to accommodate sending of all the 3299 // segments. 
3300 newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor) 3301 if newSndBufSz < curSndBufSz { 3302 return curSndBufSz 3303 } 3304 if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz { 3305 newSndBufSz = int64(ss.Max) 3306 } 3307 3308 return newSndBufSz 3309 } 3310 3311 // GetAcceptConn implements tcpip.SocketOptionsHandler. 3312 func (e *Endpoint) GetAcceptConn() bool { 3313 return EndpointState(e.State()) == StateListen 3314 }