gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "container/heap" 19 "fmt" 20 "io" 21 "math" 22 "runtime" 23 "strings" 24 "time" 25 26 "gvisor.dev/gvisor/pkg/atomicbitops" 27 "gvisor.dev/gvisor/pkg/buffer" 28 "gvisor.dev/gvisor/pkg/sleep" 29 "gvisor.dev/gvisor/pkg/sync" 30 "gvisor.dev/gvisor/pkg/tcpip" 31 "gvisor.dev/gvisor/pkg/tcpip/header" 32 "gvisor.dev/gvisor/pkg/tcpip/ports" 33 "gvisor.dev/gvisor/pkg/tcpip/seqnum" 34 "gvisor.dev/gvisor/pkg/tcpip/stack" 35 "gvisor.dev/gvisor/pkg/waiter" 36 ) 37 38 // EndpointState represents the state of a TCP endpoint. 39 type EndpointState tcpip.EndpointState 40 41 // Endpoint states. Note that are represented in a netstack-specific manner and 42 // may not be meaningful externally. Specifically, they need to be translated to 43 // Linux's representation for these states if presented to userspace. 44 const ( 45 _ EndpointState = iota 46 // TCP protocol states in sync with the definitions in 47 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 48 StateEstablished 49 StateSynSent 50 StateSynRecv 51 StateFinWait1 52 StateFinWait2 53 StateTimeWait 54 StateClose 55 StateCloseWait 56 StateLastAck 57 StateListen 58 StateClosing 59 60 // Endpoint states internal to netstack. 
61 StateInitial 62 StateBound 63 StateConnecting // Connect() called, but the initial SYN hasn't been sent. 64 StateError 65 ) 66 67 const ( 68 // rcvAdvWndScale is used to split the available socket buffer into 69 // application buffer and the window to be advertised to the peer. This is 70 // currently hard coded to split the available space equally. 71 rcvAdvWndScale = 1 72 73 // SegOverheadFactor is used to multiply the value provided by the 74 // user on a SetSockOpt for setting the socket send/receive buffer sizes. 75 SegOverheadFactor = 2 76 ) 77 78 type connDirectionState uint32 79 80 // Connection direction states used for directionState checks in endpoint struct 81 // to detect half-closed connection and deliver POLLRDHUP 82 const ( 83 connDirectionStateOpen connDirectionState = 0 84 connDirectionStateRcvClosed connDirectionState = 1 85 connDirectionStateSndClosed connDirectionState = 2 86 connDirectionStateAll connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed 87 ) 88 89 // connected returns true when s is one of the states representing an 90 // endpoint connected to a peer. 91 func (s EndpointState) connected() bool { 92 switch s { 93 case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: 94 return true 95 default: 96 return false 97 } 98 } 99 100 // connecting returns true when s is one of the states representing a 101 // connection in progress, but not yet fully established. 102 func (s EndpointState) connecting() bool { 103 switch s { 104 case StateConnecting, StateSynSent, StateSynRecv: 105 return true 106 default: 107 return false 108 } 109 } 110 111 // internal returns true when the state is netstack internal. 
112 func (s EndpointState) internal() bool { 113 switch s { 114 case StateInitial, StateBound, StateConnecting, StateError: 115 return true 116 default: 117 return false 118 } 119 } 120 121 // handshake returns true when s is one of the states representing an endpoint 122 // in the middle of a TCP handshake. 123 func (s EndpointState) handshake() bool { 124 switch s { 125 case StateSynSent, StateSynRecv: 126 return true 127 default: 128 return false 129 } 130 } 131 132 // closed returns true when s is one of the states an endpoint transitions to 133 // when closed or when it encounters an error. This is distinct from a newly 134 // initialized endpoint that was never connected. 135 func (s EndpointState) closed() bool { 136 switch s { 137 case StateClose, StateError: 138 return true 139 default: 140 return false 141 } 142 } 143 144 // String implements fmt.Stringer.String. 145 func (s EndpointState) String() string { 146 switch s { 147 case StateInitial: 148 return "INITIAL" 149 case StateBound: 150 return "BOUND" 151 case StateConnecting: 152 return "CONNECTING" 153 case StateError: 154 return "ERROR" 155 case StateEstablished: 156 return "ESTABLISHED" 157 case StateSynSent: 158 return "SYN-SENT" 159 case StateSynRecv: 160 return "SYN-RCVD" 161 case StateFinWait1: 162 return "FIN-WAIT1" 163 case StateFinWait2: 164 return "FIN-WAIT2" 165 case StateTimeWait: 166 return "TIME-WAIT" 167 case StateClose: 168 return "CLOSED" 169 case StateCloseWait: 170 return "CLOSE-WAIT" 171 case StateLastAck: 172 return "LAST-ACK" 173 case StateListen: 174 return "LISTEN" 175 case StateClosing: 176 return "CLOSING" 177 default: 178 panic("unreachable") 179 } 180 } 181 182 // SACKInfo holds TCP SACK related information for a given endpoint. 183 // 184 // +stateify savable 185 type SACKInfo struct { 186 // Blocks is the maximum number of SACK blocks we track 187 // per endpoint. 
188 Blocks [MaxSACKBlocks]header.SACKBlock 189 190 // NumBlocks is the number of valid SACK blocks stored in the 191 // blocks array above. 192 NumBlocks int 193 } 194 195 // ReceiveErrors collect segment receive errors within transport layer. 196 // 197 // +stateify savable 198 type ReceiveErrors struct { 199 tcpip.ReceiveErrors 200 201 // SegmentQueueDropped is the number of segments dropped due to 202 // a full segment queue. 203 SegmentQueueDropped tcpip.StatCounter 204 205 // ChecksumErrors is the number of segments dropped due to bad checksums. 206 ChecksumErrors tcpip.StatCounter 207 208 // ListenOverflowSynDrop is the number of times the listen queue overflowed 209 // and a SYN was dropped. 210 ListenOverflowSynDrop tcpip.StatCounter 211 212 // ListenOverflowAckDrop is the number of times the final ACK 213 // in the handshake was dropped due to overflow. 214 ListenOverflowAckDrop tcpip.StatCounter 215 216 // ZeroRcvWindowState is the number of times we advertised 217 // a zero receive window when rcvQueue is full. 218 ZeroRcvWindowState tcpip.StatCounter 219 220 // WantZeroWindow is the number of times we wanted to advertise a 221 // zero receive window but couldn't because it would have caused 222 // the receive window's right edge to shrink. 223 WantZeroRcvWindow tcpip.StatCounter 224 } 225 226 // SendErrors collect segment send errors within the transport layer. 227 // 228 // +stateify savable 229 type SendErrors struct { 230 tcpip.SendErrors 231 232 // SegmentSendToNetworkFailed is the number of TCP segments failed to be sent 233 // to the network endpoint. 234 SegmentSendToNetworkFailed tcpip.StatCounter 235 236 // SynSendToNetworkFailed is the number of TCP SYNs failed to be sent 237 // to the network endpoint. 238 SynSendToNetworkFailed tcpip.StatCounter 239 240 // Retransmits is the number of TCP segments retransmitted. 241 Retransmits tcpip.StatCounter 242 243 // FastRetransmit is the number of segments retransmitted in fast 244 // recovery. 
245 FastRetransmit tcpip.StatCounter 246 247 // Timeouts is the number of times the RTO expired. 248 Timeouts tcpip.StatCounter 249 } 250 251 // Stats holds statistics about the endpoint. 252 // 253 // +stateify savable 254 type Stats struct { 255 // SegmentsReceived is the number of TCP segments received that 256 // the transport layer successfully parsed. 257 SegmentsReceived tcpip.StatCounter 258 259 // SegmentsSent is the number of TCP segments sent. 260 SegmentsSent tcpip.StatCounter 261 262 // FailedConnectionAttempts is the number of times we saw Connect and 263 // Accept errors. 264 FailedConnectionAttempts tcpip.StatCounter 265 266 // ReceiveErrors collects segment receive errors within the 267 // transport layer. 268 ReceiveErrors ReceiveErrors 269 270 // ReadErrors collects segment read errors from an endpoint read call. 271 ReadErrors tcpip.ReadErrors 272 273 // SendErrors collects segment send errors within the transport layer. 274 SendErrors SendErrors 275 276 // WriteErrors collects segment write errors from an endpoint write call. 277 WriteErrors tcpip.WriteErrors 278 } 279 280 // IsEndpointStats is an empty method to implement the tcpip.EndpointStats 281 // marker interface. 282 func (*Stats) IsEndpointStats() {} 283 284 // sndQueueInfo implements a send queue. 285 // 286 // +stateify savable 287 type sndQueueInfo struct { 288 sndQueueMu sync.Mutex `state:"nosave"` 289 stack.TCPSndBufState 290 291 // sndWaker is used to signal the protocol goroutine when there may be 292 // segments that need to be sent. 293 sndWaker sleep.Waker `state:"manual"` 294 } 295 296 // CloneState clones sq into other. 
It is not thread safe 297 func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) { 298 other.SndBufSize = sq.SndBufSize 299 other.SndBufUsed = sq.SndBufUsed 300 other.SndClosed = sq.SndClosed 301 other.PacketTooBigCount = sq.PacketTooBigCount 302 other.SndMTU = sq.SndMTU 303 other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad()) 304 } 305 306 // Endpoint represents a TCP endpoint. This struct serves as the interface 307 // between users of the endpoint and the protocol implementation; it is legal to 308 // have concurrent goroutines make calls into the endpoint, they are properly 309 // synchronized. The protocol implementation, however, runs in a single 310 // goroutine. 311 // 312 // Each endpoint has a few mutexes: 313 // 314 // e.mu -> Primary mutex for an endpoint must be held for all operations except 315 // in e.Readiness where acquiring it will result in a deadlock in epoll 316 // implementation. 317 // 318 // The following three mutexes can be acquired independent of e.mu but if 319 // acquired with e.mu then e.mu must be acquired first. 320 // 321 // e.acceptMu -> Protects e.acceptQueue. 322 // e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue 323 // itself. 324 // e.sndQueueMu -> Protects the e.sndQueue and associated fields. 325 // e.lastErrorMu -> Protects the lastError field. 326 // 327 // LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different 328 // based on the context in which the lock is acquired. In the syscall context 329 // e.LockUser/e.UnlockUser should be used and when doing background processing 330 // e.mu.Lock/e.mu.Unlock should be used. The distinction is described below 331 // in brief. 332 // 333 // The reason for this locking behaviour is to avoid wakeups to handle packets. 
334 // In cases where the endpoint is already locked the background processor can 335 // queue the packet up and go its merry way and the lock owner will eventually 336 // process the backlog when releasing the lock. Similarly when acquiring the 337 // lock from say a syscall goroutine we can implement a bit of spinning if we 338 // know that the lock is not held by another syscall goroutine. Background 339 // processors should never hold the lock for long and we can avoid an expensive 340 // sleep/wakeup by spinning for a shortwhile. 341 // 342 // For more details please see the detailed documentation on 343 // e.LockUser/e.UnlockUser methods. 344 // 345 // +stateify savable 346 type Endpoint struct { 347 stack.TCPEndpointStateInner 348 stack.TransportEndpointInfo 349 tcpip.DefaultSocketOptionsHandler 350 351 // EndpointEntry is used to queue endpoints for processing to the 352 // a given tcp processor goroutine. 353 // 354 // Precondition: epQueue.mu must be held to read/write this field.. 355 endpointEntry `state:"nosave"` 356 357 // pendingProcessingMu protects pendingProcessing. 358 pendingProcessingMu sync.Mutex `state:"nosave"` 359 360 // pendingProcessing is true if this endpoint is queued for processing 361 // to a TCP processor. 362 // +checklocks:pendingProcessingMu 363 pendingProcessing bool `state:"nosave"` 364 365 // The following fields are initialized at creation time and do not 366 // change throughout the lifetime of the endpoint. 367 stack *stack.Stack `state:"manual"` 368 protocol *protocol `state:"manual"` 369 waiterQueue *waiter.Queue `state:"wait"` 370 uniqueID uint64 371 372 // hardError is meaningful only when state is stateError. It stores the 373 // error to be returned when read/write syscalls are called and the 374 // endpoint is in this state. hardError is protected by endpoint mu. 
375 hardError tcpip.Error 376 377 // lastError represents the last error that the endpoint reported; 378 // access to it is protected by the following mutex. 379 lastErrorMu sync.Mutex `state:"nosave"` 380 lastError tcpip.Error 381 382 rcvQueueMu sync.Mutex `state:"nosave"` 383 384 // +checklocks:rcvQueueMu 385 stack.TCPRcvBufState 386 387 // rcvMemUsed tracks the total amount of memory in use by received segments 388 // held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to 389 // compute the window and the actual available buffer space. This is distinct 390 // from rcvBufUsed above which is the actual number of payload bytes held in 391 // the buffer not including any segment overheads. 392 rcvMemUsed atomicbitops.Int32 393 394 // mu protects all endpoint fields unless documented otherwise. mu must 395 // be acquired before interacting with the endpoint fields. 396 // 397 // During handshake, mu is locked by the protocol listen goroutine and 398 // released by the handshake completion goroutine. 399 mu sync.CrossGoroutineMutex `state:"nosave"` 400 ownedByUser atomicbitops.Uint32 401 402 // rcvQueue is the queue for ready-for-delivery segments. 403 // 404 // +checklocks:mu 405 rcvQueue segmentList `state:"wait"` 406 407 // state must be read/set using the EndpointState()/setEndpointState() 408 // methods. 409 state atomicbitops.Uint32 `state:".(EndpointState)"` 410 411 // connectionDirectionState holds current state of send and receive, 412 // accessed atomically 413 connectionDirectionState atomicbitops.Uint32 414 415 // origEndpointState is only used during a restore phase to save the 416 // endpoint state at restore time as the socket is moved to it's correct 417 // state. 
418 origEndpointState uint32 `state:"nosave"` 419 420 isPortReserved bool `state:"manual"` 421 isRegistered bool `state:"manual"` 422 boundNICID tcpip.NICID 423 route *stack.Route `state:"manual"` 424 ipv4TTL uint8 425 ipv6HopLimit int16 426 isConnectNotified bool 427 428 // h stores a reference to the current handshake state if the endpoint is in 429 // the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep. 430 // nil otherwise. 431 // +checklocks:mu 432 h *handshake 433 434 // portFlags stores the current values of port related flags. 435 portFlags ports.Flags 436 437 // Values used to reserve a port or register a transport endpoint 438 // (which ever happens first). 439 boundBindToDevice tcpip.NICID 440 boundPortFlags ports.Flags 441 boundDest tcpip.FullAddress 442 443 // effectiveNetProtos contains the network protocols actually in use. In 444 // most cases it will only contain "netProto", but in cases like IPv6 445 // endpoints with v6only set to false, this could include multiple 446 // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., 447 // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped 448 // address). 449 effectiveNetProtos []tcpip.NetworkProtocolNumber 450 451 // recentTSTime is the unix time when we last updated 452 // TCPEndpointStateInner.RecentTS. 453 recentTSTime tcpip.MonotonicTime 454 455 // shutdownFlags represent the current shutdown state of the endpoint. 456 shutdownFlags tcpip.ShutdownFlags 457 458 // tcpRecovery is the loss recovery algorithm used by TCP. 459 tcpRecovery tcpip.TCPRecovery 460 461 // sack holds TCP SACK related information for this endpoint. 462 sack SACKInfo 463 464 // delay enables Nagle's algorithm. 465 // 466 // delay is a boolean (0 is false) and must be accessed atomically. 467 delay uint32 468 469 // scoreboard holds TCP SACK Scoreboard information for this endpoint. 
470 scoreboard *SACKScoreboard 471 472 // segmentQueue is used to hand received segments to the protocol 473 // goroutine. Segments are queued as long as the queue is not full, 474 // and dropped when it is. 475 segmentQueue segmentQueue `state:"wait"` 476 477 // userMSS if non-zero is the MSS value explicitly set by the user 478 // for this endpoint using the TCP_MAXSEG setsockopt. 479 userMSS uint16 480 481 // maxSynRetries is the maximum number of SYN retransmits that TCP should 482 // send before aborting the attempt to connect. It cannot exceed 255. 483 // 484 // NOTE: This is currently a no-op and does not change the SYN 485 // retransmissions. 486 maxSynRetries uint8 487 488 // windowClamp is used to bound the size of the advertised window to 489 // this value. 490 windowClamp uint32 491 492 // sndQueueInfo contains the implementation of the endpoint's send queue. 493 sndQueueInfo sndQueueInfo 494 495 // cc stores the name of the Congestion Control algorithm to use for 496 // this endpoint. 497 cc tcpip.CongestionControlOption 498 499 // keepalive manages TCP keepalive state. When the connection is idle 500 // (no data sent or received) for keepaliveIdle, we start sending 501 // keepalives every keepalive.interval. If we send keepalive.count 502 // without hearing a response, the connection is closed. 503 keepalive keepalive 504 505 // userTimeout if non-zero specifies a user specified timeout for 506 // a connection w/ pending data to send. A connection that has pending 507 // unacked data will be forcibily aborted if the timeout is reached 508 // without any data being acked. 509 userTimeout time.Duration 510 511 // deferAccept if non-zero specifies a user specified time during 512 // which the final ACK of a handshake will be dropped provided the 513 // ACK is a bare ACK and carries no data. If the timeout is crossed then 514 // the bare ACK is accepted and the connection is delivered to the 515 // listener. 
516 deferAccept time.Duration 517 518 // acceptMu protects accepQueue 519 acceptMu sync.Mutex `state:"nosave"` 520 521 // acceptQueue is used by a listening endpoint to send newly accepted 522 // connections to the endpoint so that they can be read by Accept() 523 // calls. 524 // 525 // +checklocks:acceptMu 526 acceptQueue acceptQueue 527 528 // The following are only used from the protocol goroutine, and 529 // therefore don't need locks to protect them. 530 rcv *receiver `state:"wait"` 531 snd *sender `state:"wait"` 532 533 // The goroutine drain completion notification channel. 534 drainDone chan struct{} `state:"nosave"` 535 536 // The goroutine undrain notification channel. This is currently used as 537 // a way to block the worker goroutines. Today nothing closes/writes 538 // this channel and this causes any goroutines waiting on this to just 539 // block. This is used during save/restore to prevent worker goroutines 540 // from mutating state as it's being saved. 541 undrain chan struct{} `state:"nosave"` 542 543 // probe if not nil is invoked on every received segment. It is passed 544 // a copy of the current state of the endpoint. 545 probe stack.TCPProbeFunc `state:"nosave"` 546 547 // The following are only used to assist the restore run to re-connect. 548 connectingAddress tcpip.Address 549 550 // amss is the advertised MSS to the peer by this endpoint. 551 amss uint16 552 553 // sendTOS represents IPv4 TOS or IPv6 TrafficClass, 554 // applied while sending packets. Defaults to 0 as on Linux. 555 sendTOS uint8 556 557 gso stack.GSO 558 559 stats Stats 560 561 // tcpLingerTimeout is the maximum amount of a time a socket 562 // a socket stays in TIME_WAIT state before being marked 563 // closed. 564 tcpLingerTimeout time.Duration 565 566 // closed indicates that the user has called closed on the 567 // endpoint and at this point the endpoint is only around 568 // to complete the TCP shutdown. 
569 closed bool 570 571 // txHash is the transport layer hash to be set on outbound packets 572 // emitted by this endpoint. 573 txHash uint32 574 575 // owner is used to get uid and gid of the packet. 576 owner tcpip.PacketOwner 577 578 // ops is used to get socket level options. 579 ops tcpip.SocketOptions 580 581 // lastOutOfWindowAckTime is the time at which the an ACK was sent in response 582 // to an out of window segment being received by this endpoint. 583 lastOutOfWindowAckTime tcpip.MonotonicTime 584 585 // finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer 586 // is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close() 587 // has been called on the socket. This timer is not started for sockets that 588 // are waiting for a peer FIN but are not closed. 589 finWait2Timer tcpip.Timer `state:"nosave"` 590 591 // timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state 592 // for tcp.DefaultTCPTimeWaitTimeout seconds. 593 timeWaitTimer tcpip.Timer `state:"nosave"` 594 595 // listenCtx is used by listening endpoints to store state used while listening for 596 // connections. Nil otherwise. 597 listenCtx *listenContext `state:"nosave"` 598 599 // limRdr is reused to avoid allocations. 600 // 601 // +checklocks:mu 602 limRdr *io.LimitedReader `state:"nosave"` 603 604 // pmtud is the PMTUD strategy to use. 605 // 606 // +checklocks:mu 607 pmtud tcpip.PMTUDStrategy 608 } 609 610 // UniqueID implements stack.TransportEndpoint.UniqueID. 611 func (e *Endpoint) UniqueID() uint64 { 612 return e.uniqueID 613 } 614 615 // calculateAdvertisedMSS calculates the MSS to advertise. 616 // 617 // If userMSS is non-zero and is not greater than the maximum possible MSS for 618 // r, it will be used; otherwise, the maximum possible MSS will be used. 619 func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { 620 // The maximum possible MSS is dependent on the route. 
621 // TODO(b/143359391): Respect TCP Min and Max size. 622 maxMSS := uint16(r.MTU() - header.TCPMinimumSize) 623 624 if userMSS != 0 && userMSS < maxMSS { 625 return userMSS 626 } 627 628 return maxMSS 629 } 630 631 // isOwnedByUser() returns true if the endpoint lock is currently 632 // held by a user(syscall) goroutine. 633 func (e *Endpoint) isOwnedByUser() bool { 634 return e.ownedByUser.Load() == 1 635 } 636 637 // LockUser tries to lock e.mu and if it fails it will check if the lock is held 638 // by another syscall goroutine. If yes, then it will goto sleep waiting for the 639 // lock to be released, if not then it will spin till it acquires the lock or 640 // another syscall goroutine acquires it in which case it will goto sleep as 641 // described above. 642 // 643 // The assumption behind spinning here being that background packet processing 644 // should not be holding the lock for long and spinning reduces latency as we 645 // avoid an expensive sleep/wakeup of the syscall goroutine). 646 // +checklocksacquire:e.mu 647 func (e *Endpoint) LockUser() { 648 const iterations = 5 649 for i := 0; i < iterations; i++ { 650 // Try first if the sock is locked then check if it's owned 651 // by another user goroutine if not then we spin, otherwise 652 // we just go to sleep on the Lock() and wait. 653 if !e.TryLock() { 654 // If socket is owned by the user then just go to sleep 655 // as the lock could be held for a reasonably long time. 656 if e.ownedByUser.Load() == 1 { 657 e.mu.Lock() 658 e.ownedByUser.Store(1) 659 return 660 } 661 // Spin but don't yield the processor since the lower half 662 // should yield the lock soon. 663 continue 664 } 665 e.ownedByUser.Store(1) 666 return 667 } 668 669 for i := 0; i < iterations; i++ { 670 // Try first if the sock is locked then check if it's owned 671 // by another user goroutine if not then we spin, otherwise 672 // we just go to sleep on the Lock() and wait. 
673 if !e.TryLock() { 674 // If socket is owned by the user then just go to sleep 675 // as the lock could be held for a reasonably long time. 676 if e.ownedByUser.Load() == 1 { 677 e.mu.Lock() 678 e.ownedByUser.Store(1) 679 return 680 } 681 // Spin but yield the processor since the lower half 682 // should yield the lock soon. 683 runtime.Gosched() 684 continue 685 } 686 e.ownedByUser.Store(1) 687 return 688 } 689 690 // Finally just give up and wait for the Lock. 691 e.mu.Lock() 692 e.ownedByUser.Store(1) 693 } 694 695 // UnlockUser will check if there are any segments already queued for processing 696 // and wake up a processor goroutine to process them before unlocking e.mu. 697 // This is required because we when packets arrive and endpoint lock is already 698 // held then such packets are queued up to be processed. 699 // 700 // Precondition: e.LockUser() must have been called before calling e.UnlockUser() 701 // +checklocksrelease:e.mu 702 func (e *Endpoint) UnlockUser() { 703 // Lock segment queue before checking so that we avoid a race where 704 // segments can be queued between the time we check if queue is empty 705 // and actually unlock the endpoint mutex. 706 e.segmentQueue.mu.Lock() 707 if e.segmentQueue.emptyLocked() { 708 if e.ownedByUser.Swap(0) != 1 { 709 panic("e.UnlockUser() called without calling e.LockUser()") 710 } 711 e.mu.Unlock() 712 e.segmentQueue.mu.Unlock() 713 return 714 } 715 e.segmentQueue.mu.Unlock() 716 717 // Since we are waking the processor goroutine here just unlock 718 // and let it process the queued segments. 
719 if e.ownedByUser.Swap(0) != 1 { 720 panic("e.UnlockUser() called without calling e.LockUser()") 721 } 722 processor := e.protocol.dispatcher.selectProcessor(e.ID) 723 e.mu.Unlock() 724 725 // Wake up the processor for this endpoint to process any queued 726 // segments after releasing the lock to avoid the case where if the 727 // processor goroutine starts running before we release the lock here 728 // then it will fail to process as TryLock() will fail. 729 processor.queueEndpoint(e) 730 return 731 } 732 733 // StopWork halts packet processing. Only to be used in tests. 734 // +checklocksacquire:e.mu 735 func (e *Endpoint) StopWork() { 736 e.mu.Lock() 737 } 738 739 // ResumeWork resumes packet processing. Only to be used in tests. 740 // +checklocksrelease:e.mu 741 func (e *Endpoint) ResumeWork() { 742 e.mu.Unlock() 743 } 744 745 // AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is 746 // used in places where we know that e.mu is held, but checklocks does not, 747 // which can happen when creating new locked objects. You must pass the known 748 // locked endpoint to this function and it must be the same as the caller 749 // endpoint. 750 // TODO(b/226403629): Remove this function once checklocks understands local 751 // variable locks. 752 // +checklocks:locked.mu 753 // +checklocksacquire:e.mu 754 func (e *Endpoint) AssertLockHeld(locked *Endpoint) { 755 if e != locked { 756 panic("AssertLockHeld failed: locked endpoint != asserting endpoint") 757 } 758 } 759 760 // TryLock is a helper that calls TryLock on the endpoint's mutex and 761 // adds the necessary checklocks annotations. 762 // TODO(b/226403629): Remove this once checklocks understands TryLock. 763 // +checklocksacquire:e.mu 764 func (e *Endpoint) TryLock() bool { 765 if e.mu.TryLock() { 766 return true // +checklocksforce 767 } 768 return false // +checklocksignore 769 } 770 771 // setEndpointState updates the state of the endpoint to state atomically. 
This 772 // method is unexported as the only place we should update the state is in this 773 // package but we allow the state to be read freely without holding e.mu. 774 // 775 // +checklocks:e.mu 776 func (e *Endpoint) setEndpointState(state EndpointState) { 777 oldstate := EndpointState(e.state.Swap(uint32(state))) 778 switch state { 779 case StateEstablished: 780 e.stack.Stats().TCP.CurrentEstablished.Increment() 781 e.stack.Stats().TCP.CurrentConnected.Increment() 782 case StateError: 783 fallthrough 784 case StateClose: 785 if oldstate == StateCloseWait || oldstate == StateEstablished { 786 e.stack.Stats().TCP.EstablishedResets.Increment() 787 } 788 if oldstate.connected() { 789 e.stack.Stats().TCP.CurrentConnected.Decrement() 790 } 791 fallthrough 792 default: 793 if oldstate == StateEstablished { 794 e.stack.Stats().TCP.CurrentEstablished.Decrement() 795 } 796 } 797 } 798 799 // EndpointState returns the current state of the endpoint. 800 func (e *Endpoint) EndpointState() EndpointState { 801 return EndpointState(e.state.Load()) 802 } 803 804 // setRecentTimestamp sets the recentTS field to the provided value. 805 func (e *Endpoint) setRecentTimestamp(recentTS uint32) { 806 e.RecentTS = recentTS 807 e.recentTSTime = e.stack.Clock().NowMonotonic() 808 } 809 810 // recentTimestamp returns the value of the recentTS field. 811 func (e *Endpoint) recentTimestamp() uint32 { 812 return e.RecentTS 813 } 814 815 // TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a 816 // network.Endpoint, which also defines this function. 
817 func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 { 818 switch netProto := route.NetProto(); netProto { 819 case header.IPv4ProtocolNumber: 820 if ipv4TTL == tcpip.UseDefaultIPv4TTL { 821 return route.DefaultTTL() 822 } 823 return ipv4TTL 824 case header.IPv6ProtocolNumber: 825 if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit { 826 return route.DefaultTTL() 827 } 828 return uint8(ipv6HopLimit) 829 default: 830 panic(fmt.Sprintf("invalid protocol number = %d", netProto)) 831 } 832 } 833 834 // keepalive is a synchronization wrapper used to appease stateify. See the 835 // comment in endpoint, where it is used. 836 // 837 // +stateify savable 838 type keepalive struct { 839 sync.Mutex `state:"nosave"` 840 idle time.Duration 841 interval time.Duration 842 count int 843 unacked int 844 // should never be a zero timer if the endpoint is not closed. 845 timer timer `state:"nosave"` 846 waker sleep.Waker `state:"nosave"` 847 } 848 849 func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *Endpoint { 850 e := &Endpoint{ 851 stack: s, 852 protocol: protocol, 853 TransportEndpointInfo: stack.TransportEndpointInfo{ 854 NetProto: netProto, 855 TransProto: header.TCPProtocolNumber, 856 }, 857 sndQueueInfo: sndQueueInfo{ 858 TCPSndBufState: stack.TCPSndBufState{ 859 SndMTU: math.MaxInt32, 860 }, 861 }, 862 waiterQueue: waiterQueue, 863 state: atomicbitops.FromUint32(uint32(StateInitial)), 864 keepalive: keepalive{ 865 idle: DefaultKeepaliveIdle, 866 interval: DefaultKeepaliveInterval, 867 count: DefaultKeepaliveCount, 868 }, 869 uniqueID: s.UniqueID(), 870 ipv4TTL: tcpip.UseDefaultIPv4TTL, 871 ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit, 872 // txHash only determines which outgoing queue to use, so 873 // InsecureRNG is fine. 
874 txHash: s.InsecureRNG().Uint32(), 875 windowClamp: DefaultReceiveBufferSize, 876 maxSynRetries: DefaultSynRetries, 877 limRdr: &io.LimitedReader{}, 878 } 879 e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits) 880 e.ops.SetMulticastLoop(true) 881 e.ops.SetQuickAck(true) 882 e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */) 883 e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */) 884 885 var ss tcpip.TCPSendBufferSizeRangeOption 886 if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { 887 e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) 888 } 889 890 var rs tcpip.TCPReceiveBufferSizeRangeOption 891 if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 892 e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) 893 } 894 895 var cs tcpip.CongestionControlOption 896 if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil { 897 e.cc = cs 898 } 899 900 var mrb tcpip.TCPModerateReceiveBufferOption 901 if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { 902 e.RcvAutoParams.Disabled = !bool(mrb) 903 } 904 905 var de tcpip.TCPDelayEnabled 906 if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { 907 e.ops.SetDelayOption(true) 908 } 909 910 var tcpLT tcpip.TCPLingerTimeoutOption 911 if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil { 912 e.tcpLingerTimeout = time.Duration(tcpLT) 913 } 914 915 var synRetries tcpip.TCPSynRetriesOption 916 if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { 917 e.maxSynRetries = uint8(synRetries) 918 } 919 920 if p := s.GetTCPProbe(); p != nil { 921 e.probe = p 922 } 923 924 e.segmentQueue.ep = e 925 926 // TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes 927 // established. 
928 e.keepalive.timer.init(e.stack.Clock(), timerHandler(e, e.keepaliveTimerExpired)) 929 930 return e 931 } 932 933 // Readiness returns the current readiness of the endpoint. For example, if 934 // waiter.EventIn is set, the endpoint is immediately readable. 935 func (e *Endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { 936 result := waiter.EventMask(0) 937 938 switch e.EndpointState() { 939 case StateInitial, StateBound: 940 // This prevents blocking of new sockets which are not 941 // connected when SO_LINGER is set. 942 result |= waiter.EventHUp 943 944 case StateConnecting, StateSynSent, StateSynRecv: 945 // Ready for nothing. 946 947 case StateClose, StateError, StateTimeWait: 948 // Ready for anything. 949 result = mask 950 951 case StateListen: 952 // Check if there's anything in the accepted queue. 953 if (mask & waiter.ReadableEvents) != 0 { 954 e.acceptMu.Lock() 955 if e.acceptQueue.endpoints.Len() != 0 { 956 result |= waiter.ReadableEvents 957 } 958 e.acceptMu.Unlock() 959 } 960 } 961 if e.EndpointState().connected() { 962 // Determine if the endpoint is writable if requested. 963 if (mask & waiter.WritableEvents) != 0 { 964 e.sndQueueInfo.sndQueueMu.Lock() 965 sndBufSize := e.getSendBufferSize() 966 if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize { 967 result |= waiter.WritableEvents 968 } 969 if e.sndQueueInfo.SndClosed { 970 e.updateConnDirectionState(connDirectionStateSndClosed) 971 } 972 e.sndQueueInfo.sndQueueMu.Unlock() 973 } 974 975 // Determine if the endpoint is readable if requested. 
976 if (mask & waiter.ReadableEvents) != 0 { 977 e.rcvQueueMu.Lock() 978 if e.RcvBufUsed > 0 || e.RcvClosed { 979 result |= waiter.ReadableEvents 980 } 981 if e.RcvClosed { 982 e.updateConnDirectionState(connDirectionStateRcvClosed) 983 } 984 e.rcvQueueMu.Unlock() 985 } 986 } 987 988 // Determine whether endpoint is half-closed with rcv shutdown 989 if e.connDirectionState() == connDirectionStateRcvClosed { 990 result |= waiter.EventRdHUp 991 } 992 993 return result 994 } 995 996 // Purging pending rcv segments is only necessary on RST. 997 func (e *Endpoint) purgePendingRcvQueue() { 998 if e.rcv != nil { 999 for e.rcv.pendingRcvdSegments.Len() > 0 { 1000 s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment) 1001 s.DecRef() 1002 } 1003 } 1004 } 1005 1006 // +checklocks:e.mu 1007 func (e *Endpoint) purgeReadQueue() { 1008 if e.rcv != nil { 1009 e.rcvQueueMu.Lock() 1010 defer e.rcvQueueMu.Unlock() 1011 for { 1012 s := e.rcvQueue.Front() 1013 if s == nil { 1014 break 1015 } 1016 e.rcvQueue.Remove(s) 1017 s.DecRef() 1018 } 1019 e.RcvBufUsed = 0 1020 } 1021 } 1022 1023 // +checklocks:e.mu 1024 func (e *Endpoint) purgeWriteQueue() { 1025 if e.snd != nil { 1026 e.sndQueueInfo.sndQueueMu.Lock() 1027 defer e.sndQueueInfo.sndQueueMu.Unlock() 1028 e.snd.updateWriteNext(nil) 1029 for { 1030 s := e.snd.writeList.Front() 1031 if s == nil { 1032 break 1033 } 1034 e.snd.writeList.Remove(s) 1035 s.DecRef() 1036 } 1037 e.sndQueueInfo.SndBufUsed = 0 1038 e.sndQueueInfo.SndClosed = true 1039 } 1040 } 1041 1042 // Abort implements stack.TransportEndpoint.Abort. 1043 func (e *Endpoint) Abort() { 1044 defer e.drainClosingSegmentQueue() 1045 e.LockUser() 1046 defer e.UnlockUser() 1047 defer e.purgeReadQueue() 1048 // Reset all connected endpoints. 
1049 switch state := e.EndpointState(); { 1050 case state.connected(): 1051 e.resetConnectionLocked(&tcpip.ErrAborted{}) 1052 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1053 return 1054 } 1055 e.closeLocked() 1056 } 1057 1058 // Close puts the endpoint in a closed state and frees all resources associated 1059 // with it. It must be called only once and with no other concurrent calls to 1060 // the endpoint. 1061 func (e *Endpoint) Close() { 1062 e.LockUser() 1063 if e.closed { 1064 e.UnlockUser() 1065 return 1066 } 1067 1068 // We always want to purge the read queue, but do so after the checks in 1069 // shutdownLocked. 1070 e.closeLocked() 1071 e.purgeReadQueue() 1072 if e.EndpointState() == StateClose || e.EndpointState() == StateError { 1073 // It should be safe to purge the read queue now as the endpoint 1074 // is now closed or in an error state and further reads are not 1075 // permitted. 1076 e.UnlockUser() 1077 e.drainClosingSegmentQueue() 1078 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1079 return 1080 } 1081 e.UnlockUser() 1082 } 1083 1084 // +checklocks:e.mu 1085 func (e *Endpoint) closeLocked() { 1086 linger := e.SocketOptions().GetLinger() 1087 if linger.Enabled && linger.Timeout == 0 { 1088 s := e.EndpointState() 1089 isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv 1090 if isResetState { 1091 // Close the endpoint without doing full shutdown and 1092 // send a RST. 1093 e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) 1094 return 1095 } 1096 } 1097 1098 // Issue a shutdown so that the peer knows we won't send any more data 1099 // if we're connected, or stop accepting if we're listening. 
1100 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1101 e.closeNoShutdownLocked() 1102 } 1103 1104 // closeNoShutdown closes the endpoint without doing a full shutdown. 1105 // +checklocks:e.mu 1106 func (e *Endpoint) closeNoShutdownLocked() { 1107 // For listening sockets, we always release ports inline so that they 1108 // are immediately available for reuse after Close() is called. If also 1109 // registered, we unregister as well otherwise the next user would fail 1110 // in Listen() when trying to register. 1111 if e.EndpointState() == StateListen && e.isPortReserved { 1112 if e.isRegistered { 1113 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1114 e.isRegistered = false 1115 } 1116 1117 portRes := ports.Reservation{ 1118 Networks: e.effectiveNetProtos, 1119 Transport: ProtocolNumber, 1120 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1121 Port: e.TransportEndpointInfo.ID.LocalPort, 1122 Flags: e.boundPortFlags, 1123 BindToDevice: e.boundBindToDevice, 1124 Dest: e.boundDest, 1125 } 1126 e.stack.ReleasePort(portRes) 1127 e.isPortReserved = false 1128 e.boundBindToDevice = 0 1129 e.boundPortFlags = ports.Flags{} 1130 e.boundDest = tcpip.FullAddress{} 1131 } 1132 1133 // Mark endpoint as closed. 1134 e.closed = true 1135 tcpip.AddDanglingEndpoint(e) 1136 1137 eventMask := waiter.ReadableEvents | waiter.WritableEvents 1138 1139 switch e.EndpointState() { 1140 case StateInitial, StateBound, StateListen: 1141 e.setEndpointState(StateClose) 1142 fallthrough 1143 case StateClose, StateError: 1144 eventMask |= waiter.EventHUp 1145 e.cleanupLocked() 1146 case StateConnecting, StateSynSent, StateSynRecv: 1147 // Abort the handshake and set the error. 1148 // Notify that the endpoint is closed. 1149 eventMask |= waiter.EventHUp 1150 e.handshakeFailed(&tcpip.ErrAborted{}) 1151 // Notify that the endpoint is closed. 
1152 eventMask |= waiter.EventHUp 1153 case StateFinWait2: 1154 // The socket has been closed and we are in FIN-WAIT-2 so start 1155 // the FIN-WAIT-2 timer. 1156 if e.finWait2Timer == nil { 1157 e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) 1158 } 1159 } 1160 1161 e.waiterQueue.Notify(eventMask) 1162 } 1163 1164 // closePendingAcceptableConnections closes all connections that have completed 1165 // handshake but not yet been delivered to the application. 1166 func (e *Endpoint) closePendingAcceptableConnectionsLocked() { 1167 e.acceptMu.Lock() 1168 1169 pendingEndpoints := e.acceptQueue.pendingEndpoints 1170 e.acceptQueue.pendingEndpoints = nil 1171 1172 completedEndpoints := make([]*Endpoint, 0, e.acceptQueue.endpoints.Len()) 1173 for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() { 1174 completedEndpoints = append(completedEndpoints, n.Value.(*Endpoint)) 1175 } 1176 e.acceptQueue.endpoints.Init() 1177 e.acceptQueue.capacity = 0 1178 e.acceptMu.Unlock() 1179 1180 // Close any endpoints in SYN-RCVD state. 1181 for n := range pendingEndpoints { 1182 n.Abort() 1183 } 1184 1185 // Reset all connections that are waiting to be accepted. 1186 for _, n := range completedEndpoints { 1187 n.Abort() 1188 } 1189 } 1190 1191 // cleanupLocked frees all resources associated with the endpoint. 1192 // +checklocks:e.mu 1193 func (e *Endpoint) cleanupLocked() { 1194 if e.snd != nil { 1195 e.snd.resendTimer.cleanup() 1196 e.snd.probeTimer.cleanup() 1197 e.snd.reorderTimer.cleanup() 1198 e.snd.corkTimer.cleanup() 1199 } 1200 1201 if e.finWait2Timer != nil { 1202 e.finWait2Timer.Stop() 1203 } 1204 1205 if e.timeWaitTimer != nil { 1206 e.timeWaitTimer.Stop() 1207 } 1208 1209 // Close all endpoints that might have been accepted by TCP but not by 1210 // the client. 
1211 e.closePendingAcceptableConnectionsLocked() 1212 e.keepalive.timer.cleanup() 1213 1214 if e.isRegistered { 1215 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1216 e.isRegistered = false 1217 } 1218 1219 if e.isPortReserved { 1220 portRes := ports.Reservation{ 1221 Networks: e.effectiveNetProtos, 1222 Transport: ProtocolNumber, 1223 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1224 Port: e.TransportEndpointInfo.ID.LocalPort, 1225 Flags: e.boundPortFlags, 1226 BindToDevice: e.boundBindToDevice, 1227 Dest: e.boundDest, 1228 } 1229 e.stack.ReleasePort(portRes) 1230 e.isPortReserved = false 1231 } 1232 e.boundBindToDevice = 0 1233 e.boundPortFlags = ports.Flags{} 1234 e.boundDest = tcpip.FullAddress{} 1235 1236 if e.route != nil { 1237 e.route.Release() 1238 e.route = nil 1239 } 1240 1241 e.purgeWriteQueue() 1242 // Only purge the read queue here if the socket is fully closed by the 1243 // user. 1244 if e.closed { 1245 e.purgeReadQueue() 1246 } 1247 e.stack.CompleteTransportEndpointCleanup(e) 1248 tcpip.DeleteDanglingEndpoint(e) 1249 } 1250 1251 // wndFromSpace returns the window that we can advertise based on the available 1252 // receive buffer space. 1253 func wndFromSpace(space int) int { 1254 return space >> rcvAdvWndScale 1255 } 1256 1257 // initialReceiveWindow returns the initial receive window to advertise in the 1258 // SYN/SYN-ACK. 1259 func (e *Endpoint) initialReceiveWindow() int { 1260 rcvWnd := wndFromSpace(e.receiveBufferAvailable()) 1261 if rcvWnd > math.MaxUint16 { 1262 rcvWnd = math.MaxUint16 1263 } 1264 1265 // Use the user supplied MSS, if available. 1266 routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2 1267 if rcvWnd > routeWnd { 1268 rcvWnd = routeWnd 1269 } 1270 rcvWndScale := e.rcvWndScaleForHandshake() 1271 1272 // Round-down the rcvWnd to a multiple of wndScale. 
This ensures that the 1273 // window offered in SYN won't be reduced due to the loss of precision if 1274 // window scaling is enabled after the handshake. 1275 rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale) 1276 1277 // Ensure we can always accept at least 1 byte if the scale specified 1278 // was too high for the provided rcvWnd. 1279 if rcvWnd == 0 { 1280 rcvWnd = 1 1281 } 1282 1283 return rcvWnd 1284 } 1285 1286 // ModerateRecvBuf adjusts the receive buffer and the advertised window 1287 // based on the number of bytes copied to userspace. 1288 func (e *Endpoint) ModerateRecvBuf(copied int) { 1289 e.LockUser() 1290 defer e.UnlockUser() 1291 1292 sendNonZeroWindowUpdate := false 1293 1294 e.rcvQueueMu.Lock() 1295 if e.RcvAutoParams.Disabled { 1296 e.rcvQueueMu.Unlock() 1297 return 1298 } 1299 now := e.stack.Clock().NowMonotonic() 1300 if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt { 1301 e.RcvAutoParams.CopiedBytes += copied 1302 e.rcvQueueMu.Unlock() 1303 return 1304 } 1305 prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied 1306 prevCopied := e.RcvAutoParams.PrevCopiedBytes 1307 rcvWnd := 0 1308 if prevRTTCopied > prevCopied { 1309 // The minimal receive window based on what was copied by the app 1310 // in the immediate preceding RTT and some extra buffer for 16 1311 // segments to account for variations. 1312 // We multiply by 2 to account for packet losses. 1313 rcvWnd = prevRTTCopied*2 + 16*int(e.amss) 1314 1315 // Scale for slow start based on bytes copied in this RTT vs previous. 1316 grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied 1317 1318 // Multiply growth factor by 2 again to account for sender being 1319 // in slow-start where the sender grows it's congestion window 1320 // by 100% per RTT. 1321 rcvWnd += grow * 2 1322 1323 // Make sure auto tuned buffer size can always receive upto 2x 1324 // the initial window of 10 segments. 
1325 if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd { 1326 rcvWnd = minRcvWnd 1327 } 1328 1329 // Cap the auto tuned buffer size by the maximum permissible 1330 // receive buffer size. 1331 if max := e.maxReceiveBufferSize(); rcvWnd > max { 1332 rcvWnd = max 1333 } 1334 1335 // We do not adjust downwards as that can cause the receiver to 1336 // reject valid data that might already be in flight as the 1337 // acceptable window will shrink. 1338 rcvBufSize := int(e.ops.GetReceiveBufferSize()) 1339 if rcvWnd > rcvBufSize { 1340 availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize)) 1341 e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */) 1342 availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd)) 1343 if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above { 1344 sendNonZeroWindowUpdate = true 1345 } 1346 } 1347 1348 // We only update PrevCopiedBytes when we grow the buffer because in cases 1349 // where PrevCopiedBytes > prevRTTCopied the existing buffer is already big 1350 // enough to handle the current rate and we don't need to do any 1351 // adjustments. 1352 e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied 1353 } 1354 e.RcvAutoParams.MeasureTime = now 1355 e.RcvAutoParams.CopiedBytes = 0 1356 e.rcvQueueMu.Unlock() 1357 1358 // Send the update after unlocking rcvQueueMu as sending a segment acquires 1359 // the lock to calculate the window to be sent. 1360 if e.EndpointState().connected() && sendNonZeroWindowUpdate { 1361 e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu 1362 } 1363 } 1364 1365 // SetOwner implements tcpip.Endpoint.SetOwner. 
func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}

// hardErrorLocked returns the endpoint's hard error and clears it.
//
// +checklocks:e.mu
func (e *Endpoint) hardErrorLocked() tcpip.Error {
	err := e.hardError
	e.hardError = nil
	return err
}

// lastErrorLocked returns the endpoint's last error and clears it, holding
// lastErrorMu while doing so.
//
// +checklocks:e.mu
func (e *Endpoint) lastErrorLocked() tcpip.Error {
	e.lastErrorMu.Lock()
	defer e.lastErrorMu.Unlock()
	err := e.lastError
	e.lastError = nil
	return err
}

// LastError implements tcpip.Endpoint.LastError.
//
// A pending hard error takes precedence over the last error; whichever one is
// returned is cleared.
func (e *Endpoint) LastError() tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	if err := e.hardErrorLocked(); err != nil {
		return err
	}
	return e.lastErrorLocked()
}

// LastErrorLocked reads and clears lastError.
// Only to be used in tests.
// +checklocks:e.mu
func (e *Endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}

// UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
func (e *Endpoint) UpdateLastError(err tcpip.Error) {
	e.LockUser()
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()
	e.UnlockUser()
}

// Read implements tcpip.Endpoint.Read.
//
// It copies queued segment data into dst. With opts.Peek set the segments are
// left in the receive queue; otherwise fully-consumed segments are removed,
// RcvBufUsed is reduced, and a window update is sent if the freed memory
// moved the window above the ACK threshold. If the write to dst fails before
// anything was read, ErrBadBuffer is returned; if some bytes were read, the
// count is returned with a nil error.
func (e *Endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if err := e.checkReadLocked(); err != nil {
		if _, ok := err.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, err
	}

	var err error
	done := 0
	// N.B. Here we get the first segment to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold e.mu to ensure we only
	// remove segments from the list through Read() and that new segments
	// cannot be appended.
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Drop every leading segment whose payload has been fully
			// consumed.
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// checkReadLocked checks that endpoint is in a readable state. It returns nil
// when there is buffered data to read, and otherwise an error describing why
// a read cannot proceed right now (would block, not connected, closed for
// receive, or a pending hard error).
//
// +checklocks:e.mu
func (e *Endpoint) checkReadLocked() tcpip.Error {
	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()
	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	bufUsed := e.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			if err := e.hardErrorLocked(); err != nil {
				return err
			}
			return &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return &tcpip.ErrNotConnected{}
	}

	if e.RcvBufUsed == 0 {
		if e.RcvClosed || !e.EndpointState().connected() {
			return &tcpip.ErrClosedForReceive{}
		}
		return &tcpip.ErrWouldBlock{}
	}

	return nil
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader.
//
// At most avail bytes (or p.Len() if smaller) are copied. Unless opts.Atomic
// is set, both sndQueueMu and the endpoint lock are released for the duration
// of the copy and re-acquired before returning, and e.limRdr is set to nil
// while the copy is in progress and restored afterwards. A copy failure is
// reported as ErrBadBuffer.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	limRdr := e.limRdr
	if !opts.Atomic {
		// Deferred calls run in reverse order: the endpoint lock is
		// re-acquired first, then sndQueueMu, and finally e.limRdr is
		// restored.
		defer func() {
			e.limRdr = limRdr
		}()
		e.limRdr = nil

		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReaderAndLimitedReader(p, int64(avail), limRdr); err != nil {
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
// +checklocks:e.mu
func (e *Endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	buf, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}

	// Do not queue zero length segments.
	if buf.Size() == 0 {
		return nil, 0, nil
	}

	if !opts.Atomic {
		// Since we released locks in between it's possible that the
		// endpoint transitioned to a CLOSED/ERROR states so make
		// sure endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			buf.Release()
			return nil, 0, err
		}

		// A simultaneous call to write on the socket can reduce avail. Discard
		// excess data copied if this is the case.
		if int64(avail) < buf.Size() {
			buf.Truncate(int64(avail))
		}
	}

	// Add data to the send queue.
	size := int(buf.Size())
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf)
	e.sndQueueInfo.SndBufUsed += size
	e.snd.writeList.PushBack(s)

	return s, size, nil
}

// Write writes data to the endpoint's peer.
//
// It returns the number of bytes queued for sending. Zero is returned with a
// nil error when nothing was queued.
func (e *Endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.LockUser()
	defer e.UnlockUser()

	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *Endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out of socket buffer space before we can fill
	// the previous window we advertised. In cases where we receive MSS sized or
	// close to MSS sized segments we will probably run out of window space
	// before we exhaust the receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	// Never report a negative window.
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
// +checklocks:e.mu
func (e *Endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large. We also want to ensure a
// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
// receive buffer size. This is chosen arbitrarily.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// deltaBefore is subtracted from the currently selected window to obtain the
// previous window, which is clamped at zero.
//
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *Endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked(rcvBufSize))
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size that
	// is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}

	switch {
	case oldAvail < threshold && newAvail >= threshold:
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
//
// It records v in the endpoint's port flags as TupleOnly.
func (e *Endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
//
// It records v in the endpoint's port flags as LoadBalanced.
func (e *Endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
func (e *Endpoint) OnKeepAliveSet(bool) {
	// The new option value is not inspected here; the keepalive timer is
	// simply reset.
	e.LockUser()
	e.resetKeepaliveTimer(true /* receivedData */)
	e.UnlockUser()
}

// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
//
// Only turning the option off has an effect here: sendData is invoked if the
// endpoint is connected.
func (e *Endpoint) OnDelayOptionSet(v bool) {
	if !v {
		e.LockUser()
		defer e.UnlockUser()
		// Handle delayed data.
		if e.EndpointState().connected() {
			e.sendData(nil /* next */)
		}
	}
}

// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
//
// Only turning the option off has an effect here: the sender's cork timer is
// disabled (if a sender exists) and sendData is invoked if the endpoint is
// connected.
func (e *Endpoint) OnCorkOptionSet(v bool) {
	if !v {
		e.LockUser()
		defer e.UnlockUser()
		if e.snd != nil {
			e.snd.corkTimer.disable()
		}
		// Handle the corked data.
		if e.EndpointState().connected() {
			e.sendData(nil /* next */)
		}
	}
}

// getSendBufferSize returns the send buffer size from the socket options as
// an int.
func (e *Endpoint) getSendBufferSize() int {
	return int(e.ops.GetSendBufferSize())
}

// OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
//
// It raises rcvBufSz if needed so that it is non-zero after applying the
// receive window scale, disables receive buffer auto-tuning, and returns the
// (possibly adjusted) size together with a postSet callback. The callback
// sends a window update if the size change moved the window above the ACK
// threshold and the endpoint is connected when the callback runs.
func (e *Endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) {
	e.LockUser()

	sendNonZeroWindowUpdate := false
	e.rcvQueueMu.Lock()

	// Make sure the receive buffer size allows us to send a
	// non-zero window size.
	scale := uint8(0)
	if e.rcv != nil {
		scale = e.rcv.RcvWndScale
	}
	if rcvBufSz>>scale == 0 {
		rcvBufSz = 1 << scale
	}

	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
	e.RcvAutoParams.Disabled = true

	// Immediately send an ACK to uncork the sender silly window
	// syndrome prevention, when our available space grows above aMSS
	// or half receive buffer, whichever smaller.
	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
		sendNonZeroWindowUpdate = true
	}

	e.rcvQueueMu.Unlock()

	// postSet re-acquires the endpoint lock itself, so it must be invoked
	// without the lock held.
	postSet = func() {
		e.LockUser()
		defer e.UnlockUser()
		if e.EndpointState().connected() && sendNonZeroWindowUpdate {
			e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
		}

	}
	e.UnlockUser()
	return rcvBufSz, postSet
}

// OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize.
//
// It stores 1 in AutoTuneSndBufDisabled and returns sz unchanged.
func (e *Endpoint) OnSetSendBufferSize(sz int64) int64 {
	e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1)
	return sz
}

// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters.
//
// Writers are notified when the free send buffer space is at least half of
// the space currently in use.
func (e *Endpoint) WakeupWriters() {
	e.LockUser()
	defer e.UnlockUser()

	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// SetSockOptInt sets a socket option.
//
// Options not handled by the switch are silently accepted and nil is
// returned.
func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
	const inetECNMask = 3

	switch opt {
	case tcpip.KeepaliveCountOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.count = v
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case tcpip.IPv4TOSOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.userMSS = uint16(userMSS)
		e.UnlockUser()

	case tcpip.MTUDiscoverOption:
		switch v := tcpip.PMTUDStrategy(v); v {
		case tcpip.PMTUDiscoveryWant, tcpip.PMTUDiscoveryDont, tcpip.PMTUDiscoveryDo:
			e.LockUser()
			e.pmtud = v
			e.UnlockUser()
		case tcpip.PMTUDiscoveryProbe:
			// We don't support a way to ignore MTU updates; it's
			// either on or it's off.
			return &tcpip.ErrNotSupported{}
		default:
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.IPv4TTLOption:
		// NOTE(review): v is truncated to uint8 without a range check
		// here; presumably callers validate it — confirm.
		e.LockUser()
		e.ipv4TTL = uint8(v)
		e.UnlockUser()

	case tcpip.IPv6HopLimitOption:
		// NOTE(review): v is truncated to int16 without a range check
		// here; presumably callers validate it — confirm.
		e.LockUser()
		e.ipv6HopLimit = int16(v)
		e.UnlockUser()

	case tcpip.TCPSynCountOption:
		if v < 1 || v > 255 {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.maxSynRetries = uint8(v)
		e.UnlockUser()

	case tcpip.TCPWindowClampOption:
		// A zero clamp is only accepted while the endpoint is in
		// StateClose or StateInitial.
		if v == 0 {
			e.LockUser()
			switch e.EndpointState() {
			case StateClose, StateInitial:
				e.windowClamp = 0
				e.UnlockUser()
				return nil
			default:
				e.UnlockUser()
				return &tcpip.ErrInvalidOptionValue{}
			}
		}
		// Raise the clamp to at least half of the minimum receive buffer
		// size, when that limit can be read from the stack.
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if v < rs.Min/2 {
				v = rs.Min / 2
			}
		}
		e.LockUser()
		e.windowClamp = uint32(v)
		e.UnlockUser()
	}
	return nil
}

// HasNIC returns true if the NICID is defined in the stack or id is 0.
func (e *Endpoint) HasNIC(id int32) bool {
	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt sets a socket option.
//
// Option types not handled by the switch are silently accepted and nil is
// returned.
func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.KeepaliveIntervalOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				// The sender's congestion control is only re-initialized
				// for an established connection.
				switch state {
				case StateEstablished:
					if e.EndpointState() == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		// Note that the normalized value is written back through v, so the
		// caller's option value is modified.
		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to MaxTCPLingerTimeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		// Cap the value at MaxRTO; the capped value is written back
		// through v.
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
//
// It returns ErrInvalidEndpointState if the endpoint is in the listen state.
func (e *Endpoint) readyReceiveSize() (int, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The endpoint cannot be in listen state.
	if e.EndpointState() == StateListen {
		return 0, &tcpip.ErrInvalidEndpointState{}
	}

	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()

	return e.RcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
2019 *v = -1 2020 case *v == 0: 2021 // Same as the stack default. 2022 var stackLingerTimeout tcpip.TCPLingerTimeoutOption 2023 if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil { 2024 panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err)) 2025 } 2026 *v = stackLingerTimeout 2027 case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout): 2028 // Cap it to Stack's default TCP_LINGER2 timeout. 2029 *v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) 2030 default: 2031 } 2032 2033 e.tcpLingerTimeout = time.Duration(*v) 2034 e.UnlockUser() 2035 2036 case *tcpip.TCPDeferAcceptOption: 2037 e.LockUser() 2038 if time.Duration(*v) > MaxRTO { 2039 *v = tcpip.TCPDeferAcceptOption(MaxRTO) 2040 } 2041 e.deferAccept = time.Duration(*v) 2042 e.UnlockUser() 2043 2044 case *tcpip.SocketDetachFilterOption: 2045 return nil 2046 2047 default: 2048 return nil 2049 } 2050 return nil 2051 } 2052 2053 // readyReceiveSize returns the number of bytes ready to be received. 2054 func (e *Endpoint) readyReceiveSize() (int, tcpip.Error) { 2055 e.LockUser() 2056 defer e.UnlockUser() 2057 2058 // The endpoint cannot be in listen state. 2059 if e.EndpointState() == StateListen { 2060 return 0, &tcpip.ErrInvalidEndpointState{} 2061 } 2062 2063 e.rcvQueueMu.Lock() 2064 defer e.rcvQueueMu.Unlock() 2065 2066 return e.RcvBufUsed, nil 2067 } 2068 2069 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 
2070 func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { 2071 switch opt { 2072 case tcpip.KeepaliveCountOption: 2073 e.keepalive.Lock() 2074 v := e.keepalive.count 2075 e.keepalive.Unlock() 2076 return v, nil 2077 2078 case tcpip.IPv4TOSOption: 2079 e.LockUser() 2080 v := int(e.sendTOS) 2081 e.UnlockUser() 2082 return v, nil 2083 2084 case tcpip.IPv6TrafficClassOption: 2085 e.LockUser() 2086 v := int(e.sendTOS) 2087 e.UnlockUser() 2088 return v, nil 2089 2090 case tcpip.MaxSegOption: 2091 // Linux only returns user_mss value if user_mss is set and the socket is 2092 // unconnected. Otherwise Linux returns the actual current MSS. Netstack 2093 // mimics the user_mss behavior, but otherwise just returns the defaultMSS 2094 // for now. 2095 v := header.TCPDefaultMSS 2096 e.LockUser() 2097 if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) { 2098 v = int(e.userMSS) 2099 } 2100 e.UnlockUser() 2101 return v, nil 2102 2103 case tcpip.MTUDiscoverOption: 2104 e.LockUser() 2105 v := e.pmtud 2106 e.UnlockUser() 2107 return int(v), nil 2108 2109 case tcpip.ReceiveQueueSizeOption: 2110 return e.readyReceiveSize() 2111 2112 case tcpip.IPv4TTLOption: 2113 e.LockUser() 2114 v := int(e.ipv4TTL) 2115 e.UnlockUser() 2116 return v, nil 2117 2118 case tcpip.IPv6HopLimitOption: 2119 e.LockUser() 2120 v := int(e.ipv6HopLimit) 2121 e.UnlockUser() 2122 return v, nil 2123 2124 case tcpip.TCPSynCountOption: 2125 e.LockUser() 2126 v := int(e.maxSynRetries) 2127 e.UnlockUser() 2128 return v, nil 2129 2130 case tcpip.TCPWindowClampOption: 2131 e.LockUser() 2132 v := int(e.windowClamp) 2133 e.UnlockUser() 2134 return v, nil 2135 2136 case tcpip.MulticastTTLOption: 2137 return 1, nil 2138 2139 default: 2140 return -1, &tcpip.ErrUnknownProtocolOption{} 2141 } 2142 } 2143 2144 func (e *Endpoint) getTCPInfo() tcpip.TCPInfoOption { 2145 info := tcpip.TCPInfoOption{} 2146 e.LockUser() 2147 if state := 
e.EndpointState(); state.internal() { 2148 info.State = tcpip.EndpointState(StateClose) 2149 } else { 2150 info.State = tcpip.EndpointState(state) 2151 } 2152 snd := e.snd 2153 if snd != nil { 2154 // We do not calculate RTT before sending the data packets. If 2155 // the connection did not send and receive data, then RTT will 2156 // be zero. 2157 snd.rtt.Lock() 2158 info.RTT = snd.rtt.TCPRTTState.SRTT 2159 info.RTTVar = snd.rtt.TCPRTTState.RTTVar 2160 snd.rtt.Unlock() 2161 2162 info.RTO = snd.RTO 2163 info.CcState = snd.state 2164 info.SndSsthresh = uint32(snd.Ssthresh) 2165 info.SndCwnd = uint32(snd.SndCwnd) 2166 info.ReorderSeen = snd.rc.Reord 2167 } 2168 e.UnlockUser() 2169 return info 2170 } 2171 2172 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 2173 func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 2174 switch o := opt.(type) { 2175 case *tcpip.TCPInfoOption: 2176 *o = e.getTCPInfo() 2177 2178 case *tcpip.KeepaliveIdleOption: 2179 e.keepalive.Lock() 2180 *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) 2181 e.keepalive.Unlock() 2182 2183 case *tcpip.KeepaliveIntervalOption: 2184 e.keepalive.Lock() 2185 *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) 2186 e.keepalive.Unlock() 2187 2188 case *tcpip.TCPUserTimeoutOption: 2189 e.LockUser() 2190 *o = tcpip.TCPUserTimeoutOption(e.userTimeout) 2191 e.UnlockUser() 2192 2193 case *tcpip.CongestionControlOption: 2194 e.LockUser() 2195 *o = e.cc 2196 e.UnlockUser() 2197 2198 case *tcpip.TCPLingerTimeoutOption: 2199 e.LockUser() 2200 *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) 2201 e.UnlockUser() 2202 2203 case *tcpip.TCPDeferAcceptOption: 2204 e.LockUser() 2205 *o = tcpip.TCPDeferAcceptOption(e.deferAccept) 2206 e.UnlockUser() 2207 2208 case *tcpip.OriginalDestinationOption: 2209 e.LockUser() 2210 ipt := e.stack.IPTables() 2211 addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber) 2212 e.UnlockUser() 2213 if err != nil { 2214 
return err 2215 } 2216 *o = tcpip.OriginalDestinationOption{ 2217 Addr: addr, 2218 Port: port, 2219 } 2220 2221 default: 2222 return &tcpip.ErrUnknownProtocolOption{} 2223 } 2224 return nil 2225 } 2226 2227 // checkV4MappedLocked determines the effective network protocol and converts 2228 // addr to its canonical form. 2229 // +checklocks:e.mu 2230 func (e *Endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { 2231 unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) 2232 if err != nil { 2233 return tcpip.FullAddress{}, 0, err 2234 } 2235 return unwrapped, netProto, nil 2236 } 2237 2238 // Disconnect implements tcpip.Endpoint.Disconnect. 2239 func (*Endpoint) Disconnect() tcpip.Error { 2240 return &tcpip.ErrNotSupported{} 2241 } 2242 2243 // Connect connects the endpoint to its peer. 2244 func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { 2245 e.LockUser() 2246 defer e.UnlockUser() 2247 err := e.connect(addr, true) 2248 if err != nil { 2249 if !err.IgnoreStats() { 2250 // Connect failed. Let's wake up any waiters. 2251 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 2252 e.stack.Stats().TCP.FailedConnectionAttempts.Increment() 2253 e.stats.FailedConnectionAttempts.Increment() 2254 } 2255 } 2256 return err 2257 } 2258 2259 // registerEndpoint registers the endpoint with the provided address. 2260 // 2261 // +checklocks:e.mu 2262 func (e *Endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error { 2263 netProtos := []tcpip.NetworkProtocolNumber{netProto} 2264 if e.TransportEndpointInfo.ID.LocalPort != 0 { 2265 // The endpoint is bound to a port, attempt to register it. 
2266 err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 2267 if err != nil { 2268 return err 2269 } 2270 } else { 2271 // The endpoint doesn't have a local port yet, so try to get 2272 // one. Make sure that it isn't one that will result in the same 2273 // address/port for both local and remote (otherwise this 2274 // endpoint would be trying to connect to itself). 2275 sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress 2276 2277 var twReuse tcpip.TCPTimeWaitReuseOption 2278 if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil { 2279 panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err)) 2280 } 2281 2282 reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal 2283 if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly { 2284 switch netProto { 2285 case header.IPv4ProtocolNumber: 2286 reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress) 2287 case header.IPv6ProtocolNumber: 2288 reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback 2289 } 2290 } 2291 2292 bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) 2293 if _, err := e.stack.PickEphemeralPort(e.stack.SecureRNG(), func(p uint16) (bool, tcpip.Error) { 2294 if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort { 2295 return false, nil 2296 } 2297 portRes := ports.Reservation{ 2298 Networks: netProtos, 2299 Transport: ProtocolNumber, 2300 Addr: e.TransportEndpointInfo.ID.LocalAddress, 2301 Port: p, 2302 Flags: e.portFlags, 2303 BindToDevice: bindToDevice, 2304 Dest: addr, 2305 } 2306 if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil { 2307 if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse { 2308 
return false, nil 2309 } 2310 transEPID := e.TransportEndpointInfo.ID 2311 transEPID.LocalPort = p 2312 // Check if an endpoint is registered with demuxer in TIME-WAIT and if 2313 // we can reuse it. If we can't find a transport endpoint then we just 2314 // skip using this port as it's possible that either an endpoint has 2315 // bound the port but not registered with demuxer yet (no listen/connect 2316 // done yet) or the reservation was freed between the check above and 2317 // the FindTransportEndpoint below. But rather than retry the same port 2318 // we just skip it and move on. 2319 transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID) 2320 if transEP == nil { 2321 // ReservePort failed but there is no registered endpoint with 2322 // demuxer. Which indicates there is at least some endpoint that has 2323 // bound the port. 2324 return false, nil 2325 } 2326 2327 tcpEP := transEP.(*Endpoint) 2328 tcpEP.LockUser() 2329 // If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but 2330 // less than 1 second has elapsed since its recentTS was updated then 2331 // we cannot reuse the port. 2332 if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second { 2333 tcpEP.UnlockUser() 2334 return false, nil 2335 } 2336 // Since the endpoint is in TIME-WAIT it should be safe to acquire its 2337 // Lock while holding the lock for this endpoint as endpoints in 2338 // TIME-WAIT do not acquire locks on other endpoints. 2339 tcpEP.transitionToStateCloseLocked() 2340 tcpEP.drainClosingSegmentQueue() 2341 tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 2342 tcpEP.UnlockUser() 2343 // Now try and Reserve again if it fails then we skip. 
2344 portRes := ports.Reservation{ 2345 Networks: netProtos, 2346 Transport: ProtocolNumber, 2347 Addr: e.TransportEndpointInfo.ID.LocalAddress, 2348 Port: p, 2349 Flags: e.portFlags, 2350 BindToDevice: bindToDevice, 2351 Dest: addr, 2352 } 2353 if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil { 2354 return false, nil 2355 } 2356 } 2357 2358 id := e.TransportEndpointInfo.ID 2359 id.LocalPort = p 2360 if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil { 2361 portRes := ports.Reservation{ 2362 Networks: netProtos, 2363 Transport: ProtocolNumber, 2364 Addr: e.TransportEndpointInfo.ID.LocalAddress, 2365 Port: p, 2366 Flags: e.portFlags, 2367 BindToDevice: bindToDevice, 2368 Dest: addr, 2369 } 2370 e.stack.ReleasePort(portRes) 2371 if _, ok := err.(*tcpip.ErrPortInUse); ok { 2372 return false, nil 2373 } 2374 return false, err 2375 } 2376 2377 // Port picking successful. Save the details of 2378 // the selected port. 2379 e.TransportEndpointInfo.ID = id 2380 e.isPortReserved = true 2381 e.boundBindToDevice = bindToDevice 2382 e.boundPortFlags = e.portFlags 2383 e.boundDest = addr 2384 return true, nil 2385 }); err != nil { 2386 e.stack.Stats().TCP.FailedPortReservations.Increment() 2387 return err 2388 } 2389 } 2390 return nil 2391 } 2392 2393 // connect connects the endpoint to its peer. 2394 // +checklocks:e.mu 2395 func (e *Endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error { 2396 connectingAddr := addr.Addr 2397 2398 addr, netProto, err := e.checkV4MappedLocked(addr) 2399 if err != nil { 2400 return err 2401 } 2402 2403 if e.EndpointState().connected() { 2404 // The endpoint is already connected. If caller hasn't been 2405 // notified yet, return success. 2406 if !e.isConnectNotified { 2407 e.isConnectNotified = true 2408 return nil 2409 } 2410 // Otherwise return that it's already connected. 
2411 return &tcpip.ErrAlreadyConnected{} 2412 } 2413 2414 nicID := addr.NIC 2415 switch e.EndpointState() { 2416 case StateBound: 2417 // If we're already bound to a NIC but the caller is requesting 2418 // that we use a different one now, we cannot proceed. 2419 if e.boundNICID == 0 { 2420 break 2421 } 2422 2423 if nicID != 0 && nicID != e.boundNICID { 2424 return &tcpip.ErrHostUnreachable{} 2425 } 2426 2427 nicID = e.boundNICID 2428 2429 case StateInitial: 2430 // Nothing to do. We'll eventually fill-in the gaps in the ID (if any) 2431 // when we find a route. 2432 2433 case StateConnecting, StateSynSent, StateSynRecv: 2434 // A connection request has already been issued but hasn't completed 2435 // yet. 2436 return &tcpip.ErrAlreadyConnecting{} 2437 2438 case StateError: 2439 if err := e.hardErrorLocked(); err != nil { 2440 return err 2441 } 2442 return &tcpip.ErrConnectionAborted{} 2443 2444 default: 2445 return &tcpip.ErrInvalidEndpointState{} 2446 } 2447 2448 // Find a route to the desired destination. 
2449 r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */) 2450 if err != nil { 2451 return err 2452 } 2453 defer r.Release() 2454 2455 e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress() 2456 e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress() 2457 e.TransportEndpointInfo.ID.RemotePort = addr.Port 2458 2459 oldState := e.EndpointState() 2460 e.setEndpointState(StateConnecting) 2461 if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil { 2462 e.setEndpointState(oldState) 2463 if _, ok := err.(*tcpip.ErrPortInUse); ok { 2464 return &tcpip.ErrBadLocalAddress{} 2465 } 2466 return err 2467 } 2468 2469 e.isRegistered = true 2470 r.Acquire() 2471 e.route = r 2472 e.boundNICID = nicID 2473 e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto} 2474 e.connectingAddress = connectingAddr 2475 2476 e.initGSO() 2477 2478 // Connect in the restore phase does not perform handshake. Restore its 2479 // connection setting here. 2480 if !handshake { 2481 e.segmentQueue.mu.Lock() 2482 for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} { 2483 for s := l.Front(); s != nil; s = s.Next() { 2484 s.id = e.TransportEndpointInfo.ID 2485 e.sndQueueInfo.sndWaker.Assert() 2486 } 2487 } 2488 e.segmentQueue.mu.Unlock() 2489 e.snd.ep.AssertLockHeld(e) 2490 e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) 2491 e.setEndpointState(StateEstablished) 2492 // Set the new auto tuned send buffer size after entering 2493 // established state. 2494 e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */) 2495 return &tcpip.ErrConnectStarted{} 2496 } 2497 2498 // Start a new handshake. 2499 h := e.newHandshake() 2500 e.setEndpointState(StateSynSent) 2501 h.start() 2502 e.stack.Stats().TCP.ActiveConnectionOpenings.Increment() 2503 2504 return &tcpip.ErrConnectStarted{} 2505 } 2506 2507 // ConnectEndpoint is not supported. 
func (*Endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	// Always rejected with ErrInvalidEndpointState.
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
//
// On an endpoint that is still connecting, the handshake is failed with
// ErrConnectionReset, waiters are notified and nil is returned; flags are not
// recorded in that case. Otherwise the work is done by shutdownLocked.
func (e *Endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState().connecting() {
		// When calling shutdown(2) on a connecting socket, the endpoint must
		// enter the error state. But this logic cannot belong to the shutdownLocked
		// method because that method is called during a close(2) (and closing a
		// connecting socket is not an error).
		e.handshakeFailed(&tcpip.ErrConnectionReset{})
		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		return nil
	}

	return e.shutdownLocked(flags)
}

// shutdownLocked adds flags to e.shutdownFlags and then acts on the
// accumulated flags according to the endpoint state:
//
//   - connected: closes the read and/or write side, aborting the connection
//     when both sides are shut down while unread data remains;
//   - listening: on ShutdownRead, marks the receive side closed, closes the
//     pending acceptable connections and notifies waiters;
//   - any other state: returns ErrNotConnected.
//
// +checklocks:e.mu
func (e *Endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	// Flags accumulate across calls; the checks below use the union.
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that may be waiting for the stream to become
			// readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, write end won't close and
				// we end up with a half-closed connection
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that may be waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *Endpoint) Listen(backlog int) tcpip.Error {
	// A failure is counted as a failed connection attempt on both the stack
	// and the endpoint, unless the error asks for stats to be ignored.
	if err := e.listen(backlog); err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
		return err
	}
	return nil
}

// listen does the work of Listen while holding the endpoint lock.
//
// On an endpoint that is already listening and not closed it only adjusts the
// backlog and re-opens the receive side. Otherwise the endpoint is bound if it
// is still in StateInitial, moved to StateListen, registered with the stack
// and given a listen context.
func (e *Endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
		}

		// Clear the shutdown flags and re-open the receive side.
		e.shutdownFlags = 0
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
	}
	// An already non-zero capacity is kept as is.
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// peerAddr, if non-nil, is set to the peer address of the returned endpoint.
//
// It returns ErrInvalidEndpointState when the endpoint is not listening or
// its receive side has been closed, and ErrWouldBlock when the accept queue
// is empty.
func (e *Endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueMu.Lock()
	rcvClosed := e.RcvClosed
	e.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *Endpoint
	e.acceptMu.Lock()
	if element := e.acceptQueue.endpoints.Front(); element != nil {
		n = e.acceptQueue.endpoints.Remove(element).(*Endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
func (e *Endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

// bindLocked does the work of Bind: it reserves a port for addr, records the
// bound NIC, port flags, bind-to-device value and effective network protocols
// on the endpoint, and moves it to StateBound.
//
// It returns ErrAlreadyBound unless the endpoint is in StateInitial, and
// ErrBadLocalAddress when a non-empty addr.Addr is not a local address.
//
// +checklocks:e.mu
func (e *Endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	// Note that BindAddr is recorded before addr has been validated.
	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *Endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The error result is always nil.
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.LocalAddress,
		Port: e.TransportEndpointInfo.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
//
// It returns ErrNotConnected when the endpoint is not in a connected state.
func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

// getRemoteAddress builds the remote FullAddress from the endpoint ID and the
// bound NIC. It takes no lock and does not check the endpoint state itself.
func (e *Endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

// HandlePacket implements stack.TransportEndpoint.HandlePacket.
//
// It is intentionally a no-op.
func (*Endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher, which can then either deliver them using the
	// worker goroutine or directly invoke the TCP processing inline
	// based on the state of the endpoint.
}

// enqueueSegment adds s to the endpoint's segment queue. It reports whether
// the segment was queued; a segment that is not queued is counted as dropped
// in both the stack and the endpoint statistics.
func (e *Endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

// onICMPError records err as the endpoint's last error, queues a SockError
// when the receive-error socket option for the packet's network protocol is
// enabled and, if the endpoint is still connecting, fails the connection
// attempt by moving the endpoint to StateError with err as its hard error.
//
// It panics when pkt carries a network protocol other than IPv4 or IPv6.
func (e *Endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	var recvErr bool
	switch pkt.NetworkProtocolNumber {
	case header.IPv4ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv4RecvError()
	case header.IPv6ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv6RecvError()
	default:
		panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber))
	}

	if recvErr {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	// NOTE(review): the state is checked before e.mu is taken below -
	// confirm that a state change in between is harmless.
	if e.EndpointState().connecting() {
		e.mu.Lock()
		if lEP := e.h.listenEP; lEP != nil {
			// Remove from listening endpoints pending list.
			lEP.acceptMu.Lock()
			delete(lEP.acceptQueue.pendingEndpoints, e)
			lEP.acceptMu.Unlock()
			lEP.stats.FailedConnectionAttempts.Increment()
		}
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.cleanupLocked()
		e.hardError = err
		e.setEndpointState(StateError)
		e.mu.Unlock()
		// Draining and waking waiters happen after e.mu has been released.
		e.drainClosingSegmentQueue()
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
	}
}

// HandleError implements stack.TransportEndpoint.
2932 func (e *Endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) { 2933 handlePacketTooBig := func(mtu uint32) { 2934 e.sndQueueInfo.sndQueueMu.Lock() 2935 update := false 2936 if v := int(mtu); v < e.sndQueueInfo.SndMTU { 2937 e.sndQueueInfo.SndMTU = v 2938 update = true 2939 } 2940 newMTU := e.sndQueueInfo.SndMTU 2941 e.sndQueueInfo.sndQueueMu.Unlock() 2942 if update { 2943 e.mu.Lock() 2944 defer e.mu.Unlock() 2945 if e.snd != nil { 2946 e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu 2947 } 2948 } 2949 } 2950 2951 // TODO(gvisor.dev/issues/5270): Handle all transport errors. 2952 switch transErr.Kind() { 2953 case stack.PacketTooBigTransportError: 2954 handlePacketTooBig(transErr.Info()) 2955 case stack.DestinationHostUnreachableTransportError: 2956 e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt) 2957 case stack.DestinationNetworkUnreachableTransportError: 2958 e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) 2959 case stack.DestinationPortUnreachableTransportError: 2960 e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) 2961 case stack.DestinationProtoUnreachableTransportError: 2962 e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt) 2963 case stack.SourceRouteFailedTransportError: 2964 e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt) 2965 case stack.SourceHostIsolatedTransportError: 2966 e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt) 2967 case stack.DestinationHostDownTransportError: 2968 e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt) 2969 } 2970 } 2971 2972 // updateSndBufferUsage is called by the protocol goroutine when room opens up 2973 // in the send buffer. The number of newly available bytes is v. 
2974 func (e *Endpoint) updateSndBufferUsage(v int) { 2975 sendBufferSize := e.getSendBufferSize() 2976 e.sndQueueInfo.sndQueueMu.Lock() 2977 notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1 2978 e.sndQueueInfo.SndBufUsed -= v 2979 2980 // Get the new send buffer size with auto tuning, but do not set it 2981 // unless we decide to notify the writers. 2982 newSndBufSz := e.computeTCPSendBufferSize() 2983 2984 // We only notify when there is half the sendBufferSize available after 2985 // a full buffer event occurs. This ensures that we don't wake up 2986 // writers to queue just 1-2 segments and go back to sleep. 2987 notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1 2988 e.sndQueueInfo.sndQueueMu.Unlock() 2989 2990 if notify { 2991 // Set the new send buffer size calculated from auto tuning. 2992 e.ops.SetSendBufferSize(newSndBufSz, false /* notify */) 2993 e.waiterQueue.Notify(waiter.WritableEvents) 2994 } 2995 } 2996 2997 // readyToRead is called by the protocol goroutine when a new segment is ready 2998 // to be read, or when the connection is closed for receiving (in which case 2999 // s will be nil). 3000 // 3001 // +checklocks:e.mu 3002 func (e *Endpoint) readyToRead(s *segment) { 3003 e.rcvQueueMu.Lock() 3004 if s != nil { 3005 e.RcvBufUsed += s.payloadSize() 3006 s.IncRef() 3007 e.rcvQueue.PushBack(s) 3008 } else { 3009 e.RcvClosed = true 3010 } 3011 e.rcvQueueMu.Unlock() 3012 e.waiterQueue.Notify(waiter.ReadableEvents) 3013 } 3014 3015 // receiveBufferAvailableLocked calculates how many bytes are still available 3016 // in the receive buffer. 3017 // +checklocks:e.rcvQueueMu 3018 func (e *Endpoint) receiveBufferAvailableLocked(rcvBufSize int) int { 3019 // We may use more bytes than the buffer size when the receive buffer 3020 // shrinks. 
3021 memUsed := e.receiveMemUsed() 3022 if memUsed >= rcvBufSize { 3023 return 0 3024 } 3025 3026 return rcvBufSize - memUsed 3027 } 3028 3029 // receiveBufferAvailable calculates how many bytes are still available in the 3030 // receive buffer based on the actual memory used by all segments held in 3031 // receive buffer/pending and segment queue. 3032 func (e *Endpoint) receiveBufferAvailable() int { 3033 e.rcvQueueMu.Lock() 3034 available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize())) 3035 e.rcvQueueMu.Unlock() 3036 return available 3037 } 3038 3039 // receiveBufferUsed returns the amount of in-use receive buffer. 3040 func (e *Endpoint) receiveBufferUsed() int { 3041 e.rcvQueueMu.Lock() 3042 used := e.RcvBufUsed 3043 e.rcvQueueMu.Unlock() 3044 return used 3045 } 3046 3047 // receiveMemUsed returns the total memory in use by segments held by this 3048 // endpoint. 3049 func (e *Endpoint) receiveMemUsed() int { 3050 return int(e.rcvMemUsed.Load()) 3051 } 3052 3053 // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. 3054 func (e *Endpoint) updateReceiveMemUsed(delta int) { 3055 e.rcvMemUsed.Add(int32(delta)) 3056 } 3057 3058 // maxReceiveBufferSize returns the stack wide maximum receive buffer size for 3059 // an endpoint. 3060 func (e *Endpoint) maxReceiveBufferSize() int { 3061 var rs tcpip.TCPReceiveBufferSizeRangeOption 3062 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { 3063 // As a fallback return the hardcoded max buffer size. 
3064 return MaxBufferSize 3065 } 3066 return rs.Max 3067 } 3068 3069 // directionState returns the close state of send and receive part of the endpoint 3070 func (e *Endpoint) connDirectionState() connDirectionState { 3071 return connDirectionState(e.connectionDirectionState.Load()) 3072 } 3073 3074 // updateDirectionState updates the close state of send and receive part of the endpoint 3075 func (e *Endpoint) updateConnDirectionState(state connDirectionState) connDirectionState { 3076 return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state))) 3077 } 3078 3079 // rcvWndScaleForHandshake computes the receive window scale to offer to the 3080 // peer when window scaling is enabled (true by default). If auto-tuning is 3081 // disabled then the window scaling factor is based on the size of the 3082 // receiveBuffer otherwise we use the max permissible receive buffer size to 3083 // compute the scale. 3084 func (e *Endpoint) rcvWndScaleForHandshake() int { 3085 bufSizeForScale := e.ops.GetReceiveBufferSize() 3086 3087 e.rcvQueueMu.Lock() 3088 autoTuningDisabled := e.RcvAutoParams.Disabled 3089 e.rcvQueueMu.Unlock() 3090 if autoTuningDisabled { 3091 return FindWndScale(seqnum.Size(bufSizeForScale)) 3092 } 3093 3094 return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) 3095 } 3096 3097 // updateRecentTimestamp updates the recent timestamp using the algorithm 3098 // described in https://tools.ietf.org/html/rfc7323#section-4.3 3099 func (e *Endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { 3100 if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { 3101 e.setRecentTimestamp(tsVal) 3102 } 3103 } 3104 3105 // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if 3106 // the SYN options indicate that timestamp option was negotiated. 
It also 3107 // initializes the recentTS with the value provided in synOpts.TSval. 3108 func (e *Endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) { 3109 if synOpts.TS { 3110 e.SendTSOk = true 3111 e.setRecentTimestamp(synOpts.TSVal) 3112 } 3113 } 3114 3115 func (e *Endpoint) tsVal(now tcpip.MonotonicTime) uint32 { 3116 return e.TSOffset.TSVal(now) 3117 } 3118 3119 func (e *Endpoint) tsValNow() uint32 { 3120 return e.tsVal(e.stack.Clock().NowMonotonic()) 3121 } 3122 3123 func (e *Endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { 3124 return e.TSOffset.Elapsed(now, tsEcr) 3125 } 3126 3127 // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint 3128 // if the SYN options indicate that the SACK option was negotiated and the TCP 3129 // stack is configured to enable TCP SACK option. 3130 func (e *Endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) { 3131 var v tcpip.TCPSACKEnabled 3132 if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { 3133 // Stack doesn't support SACK. So just return. 3134 return 3135 } 3136 if bool(v) && synOpts.SACKPermitted { 3137 e.SACKPermitted = true 3138 e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) 3139 } 3140 } 3141 3142 // maxOptionSize return the maximum size of TCP options. 3143 func (e *Endpoint) maxOptionSize() (size int) { 3144 var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock 3145 options := e.makeOptions(maxSackBlocks[:]) 3146 size = len(options) 3147 putOptions(options) 3148 3149 return size 3150 } 3151 3152 // completeStateLocked makes a full copy of the endpoint and returns it. This is 3153 // used before invoking the probe. 
3154 // 3155 // +checklocks:e.mu 3156 func (e *Endpoint) completeStateLocked(s *stack.TCPEndpointState) { 3157 s.TCPEndpointStateInner = e.TCPEndpointStateInner 3158 s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID) 3159 s.SegTime = e.stack.Clock().NowMonotonic() 3160 s.Receiver = e.rcv.TCPReceiverState 3161 s.Sender = e.snd.TCPSenderState 3162 3163 sndBufSize := e.getSendBufferSize() 3164 // Copy the send buffer atomically. 3165 e.sndQueueInfo.sndQueueMu.Lock() 3166 e.sndQueueInfo.CloneState(&s.SndBufState) 3167 s.SndBufState.SndBufSize = sndBufSize 3168 e.sndQueueInfo.sndQueueMu.Unlock() 3169 3170 // Copy the receive buffer atomically. 3171 e.rcvQueueMu.Lock() 3172 s.RcvBufState = e.TCPRcvBufState 3173 e.rcvQueueMu.Unlock() 3174 3175 // Copy the endpoint TCP Option state. 3176 s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) 3177 copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) 3178 s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() 3179 3180 e.snd.rtt.Lock() 3181 s.Sender.RTTState = e.snd.rtt.TCPRTTState 3182 e.snd.rtt.Unlock() 3183 3184 if cubic, ok := e.snd.cc.(*cubicState); ok { 3185 s.Sender.Cubic = cubic.TCPCubicState 3186 s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T) 3187 } 3188 3189 s.Sender.RACKState = e.snd.rc.TCPRACKState 3190 s.Sender.RetransmitTS = e.snd.retransmitTS 3191 s.Sender.SpuriousRecovery = e.snd.spuriousRecovery 3192 } 3193 3194 func (e *Endpoint) initHostGSO() { 3195 switch e.route.NetProto() { 3196 case header.IPv4ProtocolNumber: 3197 e.gso.Type = stack.GSOTCPv4 3198 e.gso.L3HdrLen = header.IPv4MinimumSize 3199 case header.IPv6ProtocolNumber: 3200 e.gso.Type = stack.GSOTCPv6 3201 e.gso.L3HdrLen = header.IPv6MinimumSize 3202 default: 3203 panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto)) 3204 } 3205 e.gso.NeedsCsum = true 3206 e.gso.CsumOffset = header.TCPChecksumOffset 3207 e.gso.MaxSize = e.route.GSOMaxSize() 3208 } 3209 3210 func (e *Endpoint) initGSO() 
{ 3211 if e.route.HasHostGSOCapability() { 3212 e.initHostGSO() 3213 } else if e.route.HasGVisorGSOCapability() { 3214 e.gso = stack.GSO{ 3215 MaxSize: e.route.GSOMaxSize(), 3216 Type: stack.GSOGvisor, 3217 NeedsCsum: false, 3218 } 3219 } 3220 } 3221 3222 // State implements tcpip.Endpoint.State. It exports the endpoint's protocol 3223 // state for diagnostics. 3224 func (e *Endpoint) State() uint32 { 3225 return uint32(e.EndpointState()) 3226 } 3227 3228 // Info returns a copy of the endpoint info. 3229 func (e *Endpoint) Info() tcpip.EndpointInfo { 3230 e.LockUser() 3231 // Make a copy of the endpoint info. 3232 ret := e.TransportEndpointInfo 3233 e.UnlockUser() 3234 return &ret 3235 } 3236 3237 // Stats returns a pointer to the endpoint stats. 3238 func (e *Endpoint) Stats() tcpip.EndpointStats { 3239 return &e.stats 3240 } 3241 3242 // Wait implements stack.TransportEndpoint.Wait. 3243 func (e *Endpoint) Wait() { 3244 waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp) 3245 e.waiterQueue.EventRegister(&waitEntry) 3246 defer e.waiterQueue.EventUnregister(&waitEntry) 3247 switch e.EndpointState() { 3248 case StateClose, StateError: 3249 return 3250 } 3251 <-notifyCh 3252 } 3253 3254 // SocketOptions implements tcpip.Endpoint.SocketOptions. 3255 func (e *Endpoint) SocketOptions() *tcpip.SocketOptions { 3256 return &e.ops 3257 } 3258 3259 // GetTCPSendBufferLimits is used to get send buffer size limits for TCP. 3260 func GetTCPSendBufferLimits(sh tcpip.StackHandler) tcpip.SendBufferSizeOption { 3261 // This type assertion is safe because only the TCP stack calls this 3262 // function. 3263 ss := sh.(*stack.Stack).TCPSendBufferLimits() 3264 return tcpip.SendBufferSizeOption{ 3265 Min: ss.Min, 3266 Default: ss.Default, 3267 Max: ss.Max, 3268 } 3269 } 3270 3271 // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now. 
3272 func (e *Endpoint) allowOutOfWindowAck() bool { 3273 now := e.stack.Clock().NowMonotonic() 3274 3275 if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) { 3276 var limit stack.TCPInvalidRateLimitOption 3277 if err := e.stack.Option(&limit); err != nil { 3278 panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) 3279 } 3280 if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { 3281 return false 3282 } 3283 } 3284 3285 e.lastOutOfWindowAckTime = now 3286 return true 3287 } 3288 3289 // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP. 3290 func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { 3291 var ss tcpip.TCPReceiveBufferSizeRangeOption 3292 if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { 3293 panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) 3294 } 3295 3296 return tcpip.ReceiveBufferSizeOption{ 3297 Min: ss.Min, 3298 Default: ss.Default, 3299 Max: ss.Max, 3300 } 3301 } 3302 3303 // computeTCPSendBufferSize implements auto tuning of send buffer size and 3304 // returns the new send buffer size. 3305 func (e *Endpoint) computeTCPSendBufferSize() int64 { 3306 curSndBufSz := int64(e.getSendBufferSize()) 3307 3308 // Auto tuning is disabled when the user explicitly sets the send 3309 // buffer size with SO_SNDBUF option. 3310 if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 { 3311 return curSndBufSz 3312 } 3313 3314 const packetOverheadFactor = 2 3315 curMSS := e.snd.MaxPayloadSize 3316 numSeg := InitialCwnd 3317 if numSeg < e.snd.SndCwnd { 3318 numSeg = e.snd.SndCwnd 3319 } 3320 3321 // SndCwnd indicates the number of segments that can be sent. This means 3322 // that the sender can send upto #SndCwnd segments and the send buffer 3323 // size should be set to SndCwnd*MSS to accommodate sending of all the 3324 // segments. 
3325 newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor) 3326 if newSndBufSz < curSndBufSz { 3327 return curSndBufSz 3328 } 3329 if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz { 3330 newSndBufSz = int64(ss.Max) 3331 } 3332 3333 return newSndBufSz 3334 } 3335 3336 // GetAcceptConn implements tcpip.SocketOptionsHandler. 3337 func (e *Endpoint) GetAcceptConn() bool { 3338 return EndpointState(e.State()) == StateListen 3339 }