github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/transport/tcp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "container/heap" 19 "encoding/binary" 20 "fmt" 21 "io" 22 "math" 23 "runtime" 24 "strings" 25 "time" 26 27 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 28 "github.com/nicocha30/gvisor-ligolo/pkg/buffer" 29 "github.com/nicocha30/gvisor-ligolo/pkg/sleep" 30 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 31 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip" 32 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/hash/jenkins" 33 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header" 34 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/ports" 35 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/seqnum" 36 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack" 37 "github.com/nicocha30/gvisor-ligolo/pkg/waiter" 38 ) 39 40 // EndpointState represents the state of a TCP endpoint. 41 type EndpointState tcpip.EndpointState 42 43 // Endpoint states. Note that these are represented in a netstack-specific manner and 44 // may not be meaningful externally. Specifically, they need to be translated to 45 // Linux's representation for these states if presented to userspace. 
46 const ( 47 _ EndpointState = iota 48 // TCP protocol states in sync with the definitions in 49 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 50 StateEstablished 51 StateSynSent 52 StateSynRecv 53 StateFinWait1 54 StateFinWait2 55 StateTimeWait 56 StateClose 57 StateCloseWait 58 StateLastAck 59 StateListen 60 StateClosing 61 62 // Endpoint states internal to netstack. 63 StateInitial 64 StateBound 65 StateConnecting // Connect() called, but the initial SYN hasn't been sent. 66 StateError 67 ) 68 69 const ( 70 // rcvAdvWndScale is used to split the available socket buffer into 71 // application buffer and the window to be advertised to the peer. This is 72 // currently hard coded to split the available space equally. 73 rcvAdvWndScale = 1 74 75 // SegOverheadFactor is used to multiply the value provided by the 76 // user on a SetSockOpt for setting the socket send/receive buffer sizes. 77 SegOverheadFactor = 2 78 ) 79 80 type connDirectionState uint32 81 82 // Connection direction states used for directionState checks in endpoint struct 83 // to detect half-closed connection and deliver POLLRDHUP 84 const ( 85 connDirectionStateOpen connDirectionState = 0 86 connDirectionStateRcvClosed connDirectionState = 1 87 connDirectionStateSndClosed connDirectionState = 2 88 connDirectionStateAll connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed 89 ) 90 91 // connected returns true when s is one of the states representing an 92 // endpoint connected to a peer. 93 func (s EndpointState) connected() bool { 94 switch s { 95 case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: 96 return true 97 default: 98 return false 99 } 100 } 101 102 // connecting returns true when s is one of the states representing a 103 // connection in progress, but not yet fully established. 
104 func (s EndpointState) connecting() bool { 105 switch s { 106 case StateConnecting, StateSynSent, StateSynRecv: 107 return true 108 default: 109 return false 110 } 111 } 112 113 // internal returns true when the state is netstack internal. 114 func (s EndpointState) internal() bool { 115 switch s { 116 case StateInitial, StateBound, StateConnecting, StateError: 117 return true 118 default: 119 return false 120 } 121 } 122 123 // handshake returns true when s is one of the states representing an endpoint 124 // in the middle of a TCP handshake. 125 func (s EndpointState) handshake() bool { 126 switch s { 127 case StateSynSent, StateSynRecv: 128 return true 129 default: 130 return false 131 } 132 } 133 134 // closed returns true when s is one of the states an endpoint transitions to 135 // when closed or when it encounters an error. This is distinct from a newly 136 // initialized endpoint that was never connected. 137 func (s EndpointState) closed() bool { 138 switch s { 139 case StateClose, StateError: 140 return true 141 default: 142 return false 143 } 144 } 145 146 // String implements fmt.Stringer.String. 147 func (s EndpointState) String() string { 148 switch s { 149 case StateInitial: 150 return "INITIAL" 151 case StateBound: 152 return "BOUND" 153 case StateConnecting: 154 return "CONNECTING" 155 case StateError: 156 return "ERROR" 157 case StateEstablished: 158 return "ESTABLISHED" 159 case StateSynSent: 160 return "SYN-SENT" 161 case StateSynRecv: 162 return "SYN-RCVD" 163 case StateFinWait1: 164 return "FIN-WAIT1" 165 case StateFinWait2: 166 return "FIN-WAIT2" 167 case StateTimeWait: 168 return "TIME-WAIT" 169 case StateClose: 170 return "CLOSED" 171 case StateCloseWait: 172 return "CLOSE-WAIT" 173 case StateLastAck: 174 return "LAST-ACK" 175 case StateListen: 176 return "LISTEN" 177 case StateClosing: 178 return "CLOSING" 179 default: 180 panic("unreachable") 181 } 182 } 183 184 // SACKInfo holds TCP SACK related information for a given endpoint. 
//
// +stateify savable
type SACKInfo struct {
	// Blocks holds the SACK blocks tracked for this endpoint. At most
	// MaxSACKBlocks blocks are tracked per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}

// ReceiveErrors collect segment receive errors within transport layer.
//
// +stateify savable
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}

// SendErrors collect segment send errors within the transport layer.
//
// +stateify savable
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments failed to be sent
	// to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}

// Stats holds statistics about the endpoint.
//
// +stateify savable
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}

// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}

// sndQueueInfo implements a send queue.
//
// +stateify savable
type sndQueueInfo struct {
	// sndQueueMu is the lock taken around accesses to the embedded
	// stack.TCPSndBufState fields.
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}

// CloneState clones sq into other.
It is not thread safe 299 func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) { 300 other.SndBufSize = sq.SndBufSize 301 other.SndBufUsed = sq.SndBufUsed 302 other.SndClosed = sq.SndClosed 303 other.PacketTooBigCount = sq.PacketTooBigCount 304 other.SndMTU = sq.SndMTU 305 other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad()) 306 } 307 308 // endpoint represents a TCP endpoint. This struct serves as the interface 309 // between users of the endpoint and the protocol implementation; it is legal to 310 // have concurrent goroutines make calls into the endpoint, they are properly 311 // synchronized. The protocol implementation, however, runs in a single 312 // goroutine. 313 // 314 // Each endpoint has a few mutexes: 315 // 316 // e.mu -> Primary mutex for an endpoint must be held for all operations except 317 // in e.Readiness where acquiring it will result in a deadlock in epoll 318 // implementation. 319 // 320 // The following three mutexes can be acquired independent of e.mu but if 321 // acquired with e.mu then e.mu must be acquired first. 322 // 323 // e.acceptMu -> Protects e.acceptQueue. 324 // e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue 325 // itself. 326 // e.sndQueueMu -> Protects the e.sndQueue and associated fields. 327 // e.lastErrorMu -> Protects the lastError field. 328 // 329 // LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different 330 // based on the context in which the lock is acquired. In the syscall context 331 // e.LockUser/e.UnlockUser should be used and when doing background processing 332 // e.mu.Lock/e.mu.Unlock should be used. The distinction is described below 333 // in brief. 334 // 335 // The reason for this locking behaviour is to avoid wakeups to handle packets. 
336 // In cases where the endpoint is already locked the background processor can 337 // queue the packet up and go its merry way and the lock owner will eventually 338 // process the backlog when releasing the lock. Similarly when acquiring the 339 // lock from say a syscall goroutine we can implement a bit of spinning if we 340 // know that the lock is not held by another syscall goroutine. Background 341 // processors should never hold the lock for long and we can avoid an expensive 342 // sleep/wakeup by spinning for a shortwhile. 343 // 344 // For more details please see the detailed documentation on 345 // e.LockUser/e.UnlockUser methods. 346 // 347 // +stateify savable 348 type endpoint struct { 349 stack.TCPEndpointStateInner 350 stack.TransportEndpointInfo 351 tcpip.DefaultSocketOptionsHandler 352 353 // endpointEntry is used to queue endpoints for processing to the 354 // a given tcp processor goroutine. 355 // 356 // Precondition: epQueue.mu must be held to read/write this field.. 357 endpointEntry `state:"nosave"` 358 359 // pendingProcessingMu protects pendingProcessing. 360 pendingProcessingMu sync.Mutex `state:"nosave"` 361 362 // pendingProcessing is true if this endpoint is queued for processing 363 // to a TCP processor. 364 // +checklocks:pendingProcessingMu 365 pendingProcessing bool `state:"nosave"` 366 367 // The following fields are initialized at creation time and do not 368 // change throughout the lifetime of the endpoint. 369 stack *stack.Stack `state:"manual"` 370 protocol *protocol `state:"manual"` 371 waiterQueue *waiter.Queue `state:"wait"` 372 uniqueID uint64 373 374 // hardError is meaningful only when state is stateError. It stores the 375 // error to be returned when read/write syscalls are called and the 376 // endpoint is in this state. hardError is protected by endpoint mu. 
377 hardError tcpip.Error 378 379 // lastError represents the last error that the endpoint reported; 380 // access to it is protected by the following mutex. 381 lastErrorMu sync.Mutex `state:"nosave"` 382 lastError tcpip.Error 383 384 rcvQueueMu sync.Mutex `state:"nosave"` 385 386 // +checklocks:rcvQueueMu 387 stack.TCPRcvBufState 388 389 // rcvMemUsed tracks the total amount of memory in use by received segments 390 // held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to 391 // compute the window and the actual available buffer space. This is distinct 392 // from rcvBufUsed above which is the actual number of payload bytes held in 393 // the buffer not including any segment overheads. 394 rcvMemUsed atomicbitops.Int32 395 396 // mu protects all endpoint fields unless documented otherwise. mu must 397 // be acquired before interacting with the endpoint fields. 398 // 399 // During handshake, mu is locked by the protocol listen goroutine and 400 // released by the handshake completion goroutine. 401 mu sync.CrossGoroutineMutex `state:"nosave"` 402 ownedByUser atomicbitops.Uint32 403 404 // rcvQueue is the queue for ready-for-delivery segments. 405 // 406 // +checklocks:mu 407 rcvQueue segmentList `state:"wait"` 408 409 // state must be read/set using the EndpointState()/setEndpointState() 410 // methods. 411 state atomicbitops.Uint32 `state:".(EndpointState)"` 412 413 // connectionDirectionState holds current state of send and receive, 414 // accessed atomically 415 connectionDirectionState atomicbitops.Uint32 416 417 // origEndpointState is only used during a restore phase to save the 418 // endpoint state at restore time as the socket is moved to it's correct 419 // state. 
420 origEndpointState uint32 `state:"nosave"` 421 422 isPortReserved bool `state:"manual"` 423 isRegistered bool `state:"manual"` 424 boundNICID tcpip.NICID 425 route *stack.Route `state:"manual"` 426 ipv4TTL uint8 427 ipv6HopLimit int16 428 isConnectNotified bool 429 430 // h stores a reference to the current handshake state if the endpoint is in 431 // the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep. 432 // nil otherwise. 433 // +checklocks:mu 434 h *handshake 435 436 // portFlags stores the current values of port related flags. 437 portFlags ports.Flags 438 439 // Values used to reserve a port or register a transport endpoint 440 // (which ever happens first). 441 boundBindToDevice tcpip.NICID 442 boundPortFlags ports.Flags 443 boundDest tcpip.FullAddress 444 445 // effectiveNetProtos contains the network protocols actually in use. In 446 // most cases it will only contain "netProto", but in cases like IPv6 447 // endpoints with v6only set to false, this could include multiple 448 // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., 449 // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped 450 // address). 451 effectiveNetProtos []tcpip.NetworkProtocolNumber 452 453 // recentTSTime is the unix time when we last updated 454 // TCPEndpointStateInner.RecentTS. 455 recentTSTime tcpip.MonotonicTime 456 457 // shutdownFlags represent the current shutdown state of the endpoint. 458 shutdownFlags tcpip.ShutdownFlags 459 460 // tcpRecovery is the loss recovery algorithm used by TCP. 461 tcpRecovery tcpip.TCPRecovery 462 463 // sack holds TCP SACK related information for this endpoint. 464 sack SACKInfo 465 466 // delay enables Nagle's algorithm. 467 // 468 // delay is a boolean (0 is false) and must be accessed atomically. 469 delay uint32 470 471 // scoreboard holds TCP SACK Scoreboard information for this endpoint. 
472 scoreboard *SACKScoreboard 473 474 // segmentQueue is used to hand received segments to the protocol 475 // goroutine. Segments are queued as long as the queue is not full, 476 // and dropped when it is. 477 segmentQueue segmentQueue `state:"wait"` 478 479 // userMSS if non-zero is the MSS value explicitly set by the user 480 // for this endpoint using the TCP_MAXSEG setsockopt. 481 userMSS uint16 482 483 // maxSynRetries is the maximum number of SYN retransmits that TCP should 484 // send before aborting the attempt to connect. It cannot exceed 255. 485 // 486 // NOTE: This is currently a no-op and does not change the SYN 487 // retransmissions. 488 maxSynRetries uint8 489 490 // windowClamp is used to bound the size of the advertised window to 491 // this value. 492 windowClamp uint32 493 494 // sndQueueInfo contains the implementation of the endpoint's send queue. 495 sndQueueInfo sndQueueInfo 496 497 // cc stores the name of the Congestion Control algorithm to use for 498 // this endpoint. 499 cc tcpip.CongestionControlOption 500 501 // keepalive manages TCP keepalive state. When the connection is idle 502 // (no data sent or received) for keepaliveIdle, we start sending 503 // keepalives every keepalive.interval. If we send keepalive.count 504 // without hearing a response, the connection is closed. 505 keepalive keepalive 506 507 // userTimeout if non-zero specifies a user specified timeout for 508 // a connection w/ pending data to send. A connection that has pending 509 // unacked data will be forcibily aborted if the timeout is reached 510 // without any data being acked. 511 userTimeout time.Duration 512 513 // deferAccept if non-zero specifies a user specified time during 514 // which the final ACK of a handshake will be dropped provided the 515 // ACK is a bare ACK and carries no data. If the timeout is crossed then 516 // the bare ACK is accepted and the connection is delivered to the 517 // listener. 
518 deferAccept time.Duration 519 520 // acceptMu protects accepQueue 521 acceptMu sync.Mutex `state:"nosave"` 522 523 // acceptQueue is used by a listening endpoint to send newly accepted 524 // connections to the endpoint so that they can be read by Accept() 525 // calls. 526 // 527 // +checklocks:acceptMu 528 acceptQueue acceptQueue 529 530 // The following are only used from the protocol goroutine, and 531 // therefore don't need locks to protect them. 532 rcv *receiver `state:"wait"` 533 snd *sender `state:"wait"` 534 535 // The goroutine drain completion notification channel. 536 drainDone chan struct{} `state:"nosave"` 537 538 // The goroutine undrain notification channel. This is currently used as 539 // a way to block the worker goroutines. Today nothing closes/writes 540 // this channel and this causes any goroutines waiting on this to just 541 // block. This is used during save/restore to prevent worker goroutines 542 // from mutating state as it's being saved. 543 undrain chan struct{} `state:"nosave"` 544 545 // probe if not nil is invoked on every received segment. It is passed 546 // a copy of the current state of the endpoint. 547 probe stack.TCPProbeFunc `state:"nosave"` 548 549 // The following are only used to assist the restore run to re-connect. 550 connectingAddress tcpip.Address 551 552 // amss is the advertised MSS to the peer by this endpoint. 553 amss uint16 554 555 // sendTOS represents IPv4 TOS or IPv6 TrafficClass, 556 // applied while sending packets. Defaults to 0 as on Linux. 557 sendTOS uint8 558 559 gso stack.GSO 560 561 stats Stats 562 563 // tcpLingerTimeout is the maximum amount of a time a socket 564 // a socket stays in TIME_WAIT state before being marked 565 // closed. 566 tcpLingerTimeout time.Duration 567 568 // closed indicates that the user has called closed on the 569 // endpoint and at this point the endpoint is only around 570 // to complete the TCP shutdown. 
571 closed bool 572 573 // txHash is the transport layer hash to be set on outbound packets 574 // emitted by this endpoint. 575 txHash uint32 576 577 // owner is used to get uid and gid of the packet. 578 owner tcpip.PacketOwner 579 580 // ops is used to get socket level options. 581 ops tcpip.SocketOptions 582 583 // lastOutOfWindowAckTime is the time at which the an ACK was sent in response 584 // to an out of window segment being received by this endpoint. 585 lastOutOfWindowAckTime tcpip.MonotonicTime 586 587 // finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer 588 // is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close() 589 // has been called on the socket. This timer is not started for sockets that 590 // are waiting for a peer FIN but are not closed. 591 finWait2Timer tcpip.Timer `state:"nosave"` 592 593 // timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state 594 // for tcp.DefaultTCPTimeWaitTimeout seconds. 595 timeWaitTimer tcpip.Timer `state:"nosave"` 596 597 // listenCtx is used by listening endpoints to store state used while listening for 598 // connections. Nil otherwise. 599 listenCtx *listenContext `state:"nosave"` 600 } 601 602 // UniqueID implements stack.TransportEndpoint.UniqueID. 603 func (e *endpoint) UniqueID() uint64 { 604 return e.uniqueID 605 } 606 607 // calculateAdvertisedMSS calculates the MSS to advertise. 608 // 609 // If userMSS is non-zero and is not greater than the maximum possible MSS for 610 // r, it will be used; otherwise, the maximum possible MSS will be used. 611 func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { 612 // The maximum possible MSS is dependent on the route. 613 // TODO(b/143359391): Respect TCP Min and Max size. 
614 maxMSS := uint16(r.MTU() - header.TCPMinimumSize) 615 616 if userMSS != 0 && userMSS < maxMSS { 617 return userMSS 618 } 619 620 return maxMSS 621 } 622 623 // isOwnedByUser() returns true if the endpoint lock is currently 624 // held by a user(syscall) goroutine. 625 func (e *endpoint) isOwnedByUser() bool { 626 return e.ownedByUser.Load() == 1 627 } 628 629 // LockUser tries to lock e.mu and if it fails it will check if the lock is held 630 // by another syscall goroutine. If yes, then it will goto sleep waiting for the 631 // lock to be released, if not then it will spin till it acquires the lock or 632 // another syscall goroutine acquires it in which case it will goto sleep as 633 // described above. 634 // 635 // The assumption behind spinning here being that background packet processing 636 // should not be holding the lock for long and spinning reduces latency as we 637 // avoid an expensive sleep/wakeup of the syscall goroutine). 638 // +checklocksacquire:e.mu 639 func (e *endpoint) LockUser() { 640 const iterations = 5 641 for i := 0; i < iterations; i++ { 642 // Try first if the sock is locked then check if it's owned 643 // by another user goroutine if not then we spin, otherwise 644 // we just go to sleep on the Lock() and wait. 645 if !e.TryLock() { 646 // If socket is owned by the user then just go to sleep 647 // as the lock could be held for a reasonably long time. 648 if e.ownedByUser.Load() == 1 { 649 e.mu.Lock() 650 e.ownedByUser.Store(1) 651 return 652 } 653 // Spin but don't yield the processor since the lower half 654 // should yield the lock soon. 655 continue 656 } 657 e.ownedByUser.Store(1) 658 return 659 } 660 661 for i := 0; i < iterations; i++ { 662 // Try first if the sock is locked then check if it's owned 663 // by another user goroutine if not then we spin, otherwise 664 // we just go to sleep on the Lock() and wait. 
665 if !e.TryLock() { 666 // If socket is owned by the user then just go to sleep 667 // as the lock could be held for a reasonably long time. 668 if e.ownedByUser.Load() == 1 { 669 e.mu.Lock() 670 e.ownedByUser.Store(1) 671 return 672 } 673 // Spin but yield the processor since the lower half 674 // should yield the lock soon. 675 runtime.Gosched() 676 continue 677 } 678 e.ownedByUser.Store(1) 679 return 680 } 681 682 // Finally just give up and wait for the Lock. 683 e.mu.Lock() 684 e.ownedByUser.Store(1) 685 } 686 687 // UnlockUser will check if there are any segments already queued for processing 688 // and wake up a processor goroutine to process them before unlocking e.mu. 689 // This is required because we when packets arrive and endpoint lock is already 690 // held then such packets are queued up to be processed. 691 // 692 // Precondition: e.LockUser() must have been called before calling e.UnlockUser() 693 // +checklocksrelease:e.mu 694 func (e *endpoint) UnlockUser() { 695 // Lock segment queue before checking so that we avoid a race where 696 // segments can be queued between the time we check if queue is empty 697 // and actually unlock the endpoint mutex. 698 e.segmentQueue.mu.Lock() 699 if e.segmentQueue.emptyLocked() { 700 if e.ownedByUser.Swap(0) != 1 { 701 panic("e.UnlockUser() called without calling e.LockUser()") 702 } 703 e.mu.Unlock() 704 e.segmentQueue.mu.Unlock() 705 return 706 } 707 e.segmentQueue.mu.Unlock() 708 709 // Since we are waking the processor goroutine here just unlock 710 // and let it process the queued segments. 
711 if e.ownedByUser.Swap(0) != 1 { 712 panic("e.UnlockUser() called without calling e.LockUser()") 713 } 714 processor := e.protocol.dispatcher.selectProcessor(e.ID) 715 e.mu.Unlock() 716 717 // Wake up the processor for this endpoint to process any queued 718 // segments after releasing the lock to avoid the case where if the 719 // processor goroutine starts running before we release the lock here 720 // then it will fail to process as TryLock() will fail. 721 processor.queueEndpoint(e) 722 return 723 } 724 725 // StopWork halts packet processing. Only to be used in tests. 726 // +checklocksacquire:e.mu 727 func (e *endpoint) StopWork() { 728 e.mu.Lock() 729 } 730 731 // ResumeWork resumes packet processing. Only to be used in tests. 732 // +checklocksrelease:e.mu 733 func (e *endpoint) ResumeWork() { 734 e.mu.Unlock() 735 } 736 737 // AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is 738 // used in places where we know that e.mu is held, but checklocks does not, 739 // which can happen when creating new locked objects. You must pass the known 740 // locked endpoint to this function and it must be the same as the caller 741 // endpoint. 742 // TODO(b/226403629): Remove this function once checklocks understands local 743 // variable locks. 744 // +checklocks:locked.mu 745 // +checklocksacquire:e.mu 746 func (e *endpoint) AssertLockHeld(locked *endpoint) { 747 if e != locked { 748 panic("AssertLockHeld failed: locked endpoint != asserting endpoint") 749 } 750 } 751 752 // TryLock is a helper that calls TryLock on the endpoint's mutex and 753 // adds the necessary checklocks annotations. 754 // TODO(b/226403629): Remove this once checklocks understands TryLock. 755 // +checklocksacquire:e.mu 756 func (e *endpoint) TryLock() bool { 757 if e.mu.TryLock() { 758 return true // +checklocksforce 759 } 760 return false // +checklocksignore 761 } 762 763 // setEndpointState updates the state of the endpoint to state atomically. 
This 764 // method is unexported as the only place we should update the state is in this 765 // package but we allow the state to be read freely without holding e.mu. 766 // 767 // +checklocks:e.mu 768 func (e *endpoint) setEndpointState(state EndpointState) { 769 oldstate := EndpointState(e.state.Swap(uint32(state))) 770 switch state { 771 case StateEstablished: 772 e.stack.Stats().TCP.CurrentEstablished.Increment() 773 e.stack.Stats().TCP.CurrentConnected.Increment() 774 case StateError: 775 fallthrough 776 case StateClose: 777 if oldstate == StateCloseWait || oldstate == StateEstablished { 778 e.stack.Stats().TCP.EstablishedResets.Increment() 779 } 780 if oldstate.connected() { 781 e.stack.Stats().TCP.CurrentConnected.Decrement() 782 } 783 fallthrough 784 default: 785 if oldstate == StateEstablished { 786 e.stack.Stats().TCP.CurrentEstablished.Decrement() 787 } 788 } 789 } 790 791 // EndpointState returns the current state of the endpoint. 792 func (e *endpoint) EndpointState() EndpointState { 793 return EndpointState(e.state.Load()) 794 } 795 796 // setRecentTimestamp sets the recentTS field to the provided value. 797 func (e *endpoint) setRecentTimestamp(recentTS uint32) { 798 e.RecentTS = recentTS 799 e.recentTSTime = e.stack.Clock().NowMonotonic() 800 } 801 802 // recentTimestamp returns the value of the recentTS field. 803 func (e *endpoint) recentTimestamp() uint32 { 804 return e.RecentTS 805 } 806 807 // TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a 808 // network.Endpoint, which also defines this function. 
809 func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 { 810 switch netProto := route.NetProto(); netProto { 811 case header.IPv4ProtocolNumber: 812 if ipv4TTL == tcpip.UseDefaultIPv4TTL { 813 return route.DefaultTTL() 814 } 815 return ipv4TTL 816 case header.IPv6ProtocolNumber: 817 if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit { 818 return route.DefaultTTL() 819 } 820 return uint8(ipv6HopLimit) 821 default: 822 panic(fmt.Sprintf("invalid protocol number = %d", netProto)) 823 } 824 } 825 826 // keepalive is a synchronization wrapper used to appease stateify. See the 827 // comment in endpoint, where it is used. 828 // 829 // +stateify savable 830 type keepalive struct { 831 sync.Mutex `state:"nosave"` 832 idle time.Duration 833 interval time.Duration 834 count int 835 unacked int 836 // should never be a zero timer if the endpoint is not closed. 837 timer timer `state:"nosave"` 838 waker sleep.Waker `state:"nosave"` 839 } 840 841 func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { 842 e := &endpoint{ 843 stack: s, 844 protocol: protocol, 845 TransportEndpointInfo: stack.TransportEndpointInfo{ 846 NetProto: netProto, 847 TransProto: header.TCPProtocolNumber, 848 }, 849 sndQueueInfo: sndQueueInfo{ 850 TCPSndBufState: stack.TCPSndBufState{ 851 SndMTU: math.MaxInt32, 852 }, 853 }, 854 waiterQueue: waiterQueue, 855 state: atomicbitops.FromUint32(uint32(StateInitial)), 856 keepalive: keepalive{ 857 idle: DefaultKeepaliveIdle, 858 interval: DefaultKeepaliveInterval, 859 count: DefaultKeepaliveCount, 860 }, 861 uniqueID: s.UniqueID(), 862 ipv4TTL: tcpip.UseDefaultIPv4TTL, 863 ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit, 864 txHash: s.Rand().Uint32(), 865 windowClamp: DefaultReceiveBufferSize, 866 maxSynRetries: DefaultSynRetries, 867 } 868 e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits) 869 e.ops.SetMulticastLoop(true) 870 
e.ops.SetQuickAck(true) 871 e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */) 872 e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */) 873 874 var ss tcpip.TCPSendBufferSizeRangeOption 875 if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { 876 e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) 877 } 878 879 var rs tcpip.TCPReceiveBufferSizeRangeOption 880 if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 881 e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) 882 } 883 884 var cs tcpip.CongestionControlOption 885 if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil { 886 e.cc = cs 887 } 888 889 var mrb tcpip.TCPModerateReceiveBufferOption 890 if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { 891 e.RcvAutoParams.Disabled = !bool(mrb) 892 } 893 894 var de tcpip.TCPDelayEnabled 895 if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { 896 e.ops.SetDelayOption(true) 897 } 898 899 var tcpLT tcpip.TCPLingerTimeoutOption 900 if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil { 901 e.tcpLingerTimeout = time.Duration(tcpLT) 902 } 903 904 var synRetries tcpip.TCPSynRetriesOption 905 if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { 906 e.maxSynRetries = uint8(synRetries) 907 } 908 909 if p := s.GetTCPProbe(); p != nil { 910 e.probe = p 911 } 912 913 e.segmentQueue.ep = e 914 915 // TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes 916 // established. 917 e.keepalive.timer.init(e.stack.Clock(), maybeFailTimerHandler(e, e.keepaliveTimerExpired)) 918 919 return e 920 } 921 922 // Readiness returns the current readiness of the endpoint. For example, if 923 // waiter.EventIn is set, the endpoint is immediately readable. 
924 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { 925 result := waiter.EventMask(0) 926 927 switch e.EndpointState() { 928 case StateInitial, StateBound: 929 // This prevents blocking of new sockets which are not 930 // connected when SO_LINGER is set. 931 result |= waiter.EventHUp 932 933 case StateConnecting, StateSynSent, StateSynRecv: 934 // Ready for nothing. 935 936 case StateClose, StateError, StateTimeWait: 937 // Ready for anything. 938 result = mask 939 940 case StateListen: 941 // Check if there's anything in the accepted queue. 942 if (mask & waiter.ReadableEvents) != 0 { 943 e.acceptMu.Lock() 944 if e.acceptQueue.endpoints.Len() != 0 { 945 result |= waiter.ReadableEvents 946 } 947 e.acceptMu.Unlock() 948 } 949 } 950 if e.EndpointState().connected() { 951 // Determine if the endpoint is writable if requested. 952 if (mask & waiter.WritableEvents) != 0 { 953 e.sndQueueInfo.sndQueueMu.Lock() 954 sndBufSize := e.getSendBufferSize() 955 if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize { 956 result |= waiter.WritableEvents 957 } 958 if e.sndQueueInfo.SndClosed { 959 e.updateConnDirectionState(connDirectionStateSndClosed) 960 } 961 e.sndQueueInfo.sndQueueMu.Unlock() 962 } 963 964 // Determine if the endpoint is readable if requested. 965 if (mask & waiter.ReadableEvents) != 0 { 966 e.rcvQueueMu.Lock() 967 if e.RcvBufUsed > 0 || e.RcvClosed { 968 result |= waiter.ReadableEvents 969 } 970 if e.RcvClosed { 971 e.updateConnDirectionState(connDirectionStateRcvClosed) 972 } 973 e.rcvQueueMu.Unlock() 974 } 975 } 976 977 // Determine whether endpoint is half-closed with rcv shutdown 978 if e.connDirectionState() == connDirectionStateRcvClosed { 979 result |= waiter.EventRdHUp 980 } 981 982 return result 983 } 984 985 // Purging pending rcv segments is only necessary on RST. 
986 func (e *endpoint) purgePendingRcvQueue() { 987 if e.rcv != nil { 988 for e.rcv.pendingRcvdSegments.Len() > 0 { 989 s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment) 990 s.DecRef() 991 } 992 } 993 } 994 995 // +checklocks:e.mu 996 func (e *endpoint) purgeReadQueue() { 997 if e.rcv != nil { 998 e.rcvQueueMu.Lock() 999 defer e.rcvQueueMu.Unlock() 1000 for { 1001 s := e.rcvQueue.Front() 1002 if s == nil { 1003 break 1004 } 1005 e.rcvQueue.Remove(s) 1006 s.DecRef() 1007 } 1008 e.RcvBufUsed = 0 1009 } 1010 } 1011 1012 // +checklocks:e.mu 1013 func (e *endpoint) purgeWriteQueue() { 1014 if e.snd != nil { 1015 e.sndQueueInfo.sndQueueMu.Lock() 1016 defer e.sndQueueInfo.sndQueueMu.Unlock() 1017 e.snd.updateWriteNext(nil) 1018 for { 1019 s := e.snd.writeList.Front() 1020 if s == nil { 1021 break 1022 } 1023 e.snd.writeList.Remove(s) 1024 s.DecRef() 1025 } 1026 e.sndQueueInfo.SndBufUsed = 0 1027 e.sndQueueInfo.SndClosed = true 1028 } 1029 } 1030 1031 // Abort implements stack.TransportEndpoint.Abort. 1032 func (e *endpoint) Abort() { 1033 defer e.drainClosingSegmentQueue() 1034 e.LockUser() 1035 defer e.UnlockUser() 1036 defer e.purgeReadQueue() 1037 // Reset all connected endpoints. 1038 switch state := e.EndpointState(); { 1039 case state.connected(): 1040 e.resetConnectionLocked(&tcpip.ErrAborted{}) 1041 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1042 return 1043 } 1044 e.closeLocked() 1045 } 1046 1047 // Close puts the endpoint in a closed state and frees all resources associated 1048 // with it. It must be called only once and with no other concurrent calls to 1049 // the endpoint. 1050 func (e *endpoint) Close() { 1051 e.LockUser() 1052 if e.closed { 1053 e.UnlockUser() 1054 return 1055 } 1056 1057 // We always want to purge the read queue, but do so after the checks in 1058 // shutdownLocked. 
1059 e.closeLocked() 1060 e.purgeReadQueue() 1061 if e.EndpointState() == StateClose || e.EndpointState() == StateError { 1062 // It should be safe to purge the read queue now as the endpoint 1063 // is now closed or in an error state and further reads are not 1064 // permitted. 1065 e.UnlockUser() 1066 e.drainClosingSegmentQueue() 1067 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1068 return 1069 } 1070 e.UnlockUser() 1071 } 1072 1073 // +checklocks:e.mu 1074 func (e *endpoint) closeLocked() { 1075 linger := e.SocketOptions().GetLinger() 1076 if linger.Enabled && linger.Timeout == 0 { 1077 s := e.EndpointState() 1078 isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv 1079 if isResetState { 1080 // Close the endpoint without doing full shutdown and 1081 // send a RST. 1082 e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) 1083 return 1084 } 1085 } 1086 1087 // Issue a shutdown so that the peer knows we won't send any more data 1088 // if we're connected, or stop accepting if we're listening. 1089 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1090 e.closeNoShutdownLocked() 1091 } 1092 1093 // closeNoShutdown closes the endpoint without doing a full shutdown. 1094 // +checklocks:e.mu 1095 func (e *endpoint) closeNoShutdownLocked() { 1096 // For listening sockets, we always release ports inline so that they 1097 // are immediately available for reuse after Close() is called. If also 1098 // registered, we unregister as well otherwise the next user would fail 1099 // in Listen() when trying to register. 
1100 if e.EndpointState() == StateListen && e.isPortReserved { 1101 if e.isRegistered { 1102 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1103 e.isRegistered = false 1104 } 1105 1106 portRes := ports.Reservation{ 1107 Networks: e.effectiveNetProtos, 1108 Transport: ProtocolNumber, 1109 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1110 Port: e.TransportEndpointInfo.ID.LocalPort, 1111 Flags: e.boundPortFlags, 1112 BindToDevice: e.boundBindToDevice, 1113 Dest: e.boundDest, 1114 } 1115 e.stack.ReleasePort(portRes) 1116 e.isPortReserved = false 1117 e.boundBindToDevice = 0 1118 e.boundPortFlags = ports.Flags{} 1119 e.boundDest = tcpip.FullAddress{} 1120 } 1121 1122 // Mark endpoint as closed. 1123 e.closed = true 1124 tcpip.AddDanglingEndpoint(e) 1125 1126 eventMask := waiter.ReadableEvents | waiter.WritableEvents 1127 1128 switch e.EndpointState() { 1129 case StateInitial, StateBound, StateListen: 1130 e.setEndpointState(StateClose) 1131 fallthrough 1132 case StateClose, StateError: 1133 eventMask |= waiter.EventHUp 1134 e.cleanupLocked() 1135 case StateConnecting, StateSynSent, StateSynRecv: 1136 // Abort the handshake and set the error. 1137 // Notify that the endpoint is closed. 1138 eventMask |= waiter.EventHUp 1139 e.handshakeFailed(&tcpip.ErrAborted{}) 1140 // Notify that the endpoint is closed. 1141 eventMask |= waiter.EventHUp 1142 case StateFinWait2: 1143 // The socket has been closed and we are in FIN-WAIT-2 so start 1144 // the FIN-WAIT-2 timer. 1145 if e.finWait2Timer == nil { 1146 e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) 1147 } 1148 } 1149 1150 e.waiterQueue.Notify(eventMask) 1151 } 1152 1153 // closePendingAcceptableConnections closes all connections that have completed 1154 // handshake but not yet been delivered to the application. 
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
	// Detach both queues while holding acceptMu; the endpoints themselves are
	// aborted only after the lock has been released.
	e.acceptMu.Lock()

	pendingEndpoints := e.acceptQueue.pendingEndpoints
	e.acceptQueue.pendingEndpoints = nil

	completedEndpoints := make([]*endpoint, 0, e.acceptQueue.endpoints.Len())
	for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() {
		completedEndpoints = append(completedEndpoints, n.Value.(*endpoint))
	}
	e.acceptQueue.endpoints.Init()
	e.acceptQueue.capacity = 0
	e.acceptMu.Unlock()

	// Close any endpoints in SYN-RCVD state.
	for n := range pendingEndpoints {
		n.Abort()
	}

	// Reset all connections that are waiting to be accepted.
	for _, n := range completedEndpoints {
		n.Abort()
	}
}

// cleanupLocked frees all resources associated with the endpoint: it stops
// the sender, FIN-WAIT-2, TIME-WAIT and keepalive timers, aborts connections
// that were never accepted, unregisters the endpoint, releases its reserved
// port and route, and purges the write queue. The read queue is purged only
// if the endpoint has been closed by the user.
// +checklocks:e.mu
func (e *endpoint) cleanupLocked() {
	if e.snd != nil {
		e.snd.resendTimer.cleanup()
		e.snd.probeTimer.cleanup()
		e.snd.reorderTimer.cleanup()
	}

	if e.finWait2Timer != nil {
		e.finWait2Timer.Stop()
	}

	if e.timeWaitTimer != nil {
		e.timeWaitTimer.Stop()
	}

	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	// The bound-* fields are cleared whether or not a port was reserved.
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.purgeWriteQueue()
	// Only purge the read queue here if the socket is fully closed by the
	// user.
	if e.closed {
		e.purgeReadQueue()
	}
	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}

// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space. The space is right-shifted by rcvAdvWndScale.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}

// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK. The result is never zero.
func (e *endpoint) initialReceiveWindow() int {
	// Without window scaling the advertised window cannot exceed 16 bits.
	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	rcvWndScale := e.rcvWndScaleForHandshake()

	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
	// window offered in SYN won't be reduced due to the loss of precision if
	// window scaling is enabled after the handshake.
	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)

	// Ensure we can always accept at least 1 byte if the scale specified
	// was too high for the provided rcvWnd.
	if rcvWnd == 0 {
		rcvWnd = 1
	}

	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
//
// copied is accumulated until one RTT has elapsed since the last
// measurement; the buffer size is re-evaluated only after that. The buffer is
// only ever grown, never shrunk. The function does nothing when receive
// auto-tuning is disabled.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	sendNonZeroWindowUpdate := false

	e.rcvQueueMu.Lock()
	if e.RcvAutoParams.Disabled {
		e.rcvQueueMu.Unlock()
		return
	}
	now := e.stack.Clock().NowMonotonic()
	// No RTT estimate yet, or still inside the current RTT: just accumulate.
	if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt {
		e.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		// NOTE(review): this divides by prevCopied; it presumably relies on
		// PrevCopiedBytes being non-zero once RTT is set — confirm against
		// where RcvAutoParams is initialized.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			// NOTE(review): the threshold check is given the old rcvBufSize
			// rather than the new rcvWnd — confirm this is intended.
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				sendNonZeroWindowUpdate = true
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	// Start a new measurement interval.
	e.RcvAutoParams.MeasureTime = now
	e.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueMu.Unlock()

	// Send the update after unlocking rcvQueueMu as sending a segment acquires
	// the lock to calculate the window to be sent.
	if e.EndpointState().connected() && sendNonZeroWindowUpdate {
		e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
	}
}

// SetOwner implements tcpip.Endpoint.SetOwner.
func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}

// hardErrorLocked returns the endpoint's hard error, if any, and clears it.
//
// +checklocks:e.mu
func (e *endpoint) hardErrorLocked() tcpip.Error {
	err := e.hardError
	e.hardError = nil
	return err
}

// lastErrorLocked returns the endpoint's last error, if any, and clears it.
// lastErrorMu is held for the read-and-clear.
//
// +checklocks:e.mu
func (e *endpoint) lastErrorLocked() tcpip.Error {
	e.lastErrorMu.Lock()
	defer e.lastErrorMu.Unlock()
	err := e.lastError
	e.lastError = nil
	return err
}

// LastError implements tcpip.Endpoint.LastError.
//
// A pending hard error takes precedence over the last error; whichever one is
// returned is cleared.
func (e *endpoint) LastError() tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	if err := e.hardErrorLocked(); err != nil {
		return err
	}
	return e.lastErrorLocked()
}

// LastErrorLocked reads and clears lastError.
// Only to be used in tests.
// +checklocks:e.mu
func (e *endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}

// UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
//
// It overwrites lastError with err while holding both the endpoint lock and
// lastErrorMu.
func (e *endpoint) UpdateLastError(err tcpip.Error) {
	e.LockUser()
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()
	e.UnlockUser()
}

// Read implements tcpip.Endpoint.Read.
//
// Queued segments are written to dst in order until the queue is exhausted or
// a write to dst fails. With opts.Peek set the queue is only walked; otherwise
// fully consumed segments are removed and a window update may be sent. If a
// write fails after some bytes were delivered, the bytes are reported and the
// error is dropped; if nothing was delivered, ErrBadBuffer is returned.
func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if err := e.checkReadLocked(); err != nil {
		if _, ok := err.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, err
	}

	var err error
	done := 0
	// N.B. Here we get the first segment to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold e.mu to ensure we only
	// remove segments from the list through Read() and that new segments
	// cannot be appended.
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// Peeking leaves the queue untouched; just advance.
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Drop every leading segment whose payload is now empty.
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			// The update is sent only after rcvQueueMu has been released.
			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// checkReadLocked checks that endpoint is in a readable state. It returns nil
// when there is buffered data to read, and otherwise an error describing why
// a read cannot proceed right now.
//
// +checklocks:e.mu
func (e *endpoint) checkReadLocked() tcpip.Error {
	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()
	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	bufUsed := e.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			// A pending hard error is consumed and returned in preference to
			// the generic closed-for-receive error.
			if err := e.hardErrorLocked(); err != nil {
				return err
			}
			return &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return &tcpip.ErrNotConnected{}
	}

	// Nothing buffered: either the stream has ended or the caller must wait.
	if e.RcvBufUsed == 0 {
		if e.RcvClosed || !e.EndpointState().connected() {
			return &tcpip.ErrClosedForReceive{}
		}
		return &tcpip.ErrWouldBlock{}
	}

	return nil
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		// A pending hard error is consumed and returned in preference to the
		// generic closed-for-send error.
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	// Available space is the send buffer limit minus what is already queued.
	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader. At most avail bytes, and
// no more than p.Len(), are copied. Unless opts.Atomic is set, both
// sndQueueMu and the endpoint lock are released for the duration of the copy
// and re-acquired before returning. A failed copy is reported as
// ErrBadBuffer.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReader(p, int64(avail)); err != nil {
		// Release whatever was partially copied before reporting failure.
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
// It also returns the number of bytes queued. A nil segment with a zero count
// and nil error means there was nothing to queue.
// +checklocks:e.mu
func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	buf, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}

	// Do not queue zero length segments.
	if buf.Size() == 0 {
		return nil, 0, nil
	}

	if !opts.Atomic {
		// Since we released locks in between it's possible that the
		// endpoint transitioned to a CLOSED/ERROR states so make
		// sure endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			buf.Release()
			return nil, 0, err
		}

		// A simultaneous call to write on the socket can reduce avail. Discard
		// excess data copied if this is the case.
		if int64(avail) < buf.Size() {
			buf.Truncate(int64(avail))
		}
	}

	// Add data to the send queue.
	size := int(buf.Size())
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf)
	e.sndQueueInfo.SndBufUsed += size
	e.snd.writeList.PushBack(s)

	return s, size, nil
}

// Write writes data to the endpoint's peer. It returns the number of bytes
// queued for sending, which may be fewer than the payloader offers.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.LockUser()
	defer e.UnlockUser()

	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied. The result is never negative.
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out socket buffer space before we can fill the
	// previous window we advertised. In cases where we receive MSS sized or close
	// MSS sized segments we will probably run out of window space before we
	// exhaust receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu, using
// the currently configured receive buffer size.
// +checklocks:e.mu
func (e *endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large. We also want to ensure a
// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
// receive buffer size. This is chosen arbitrarily.
//
// deltaBefore is subtracted from the current window to reconstruct the
// previous window, which is floored at zero.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked(rcvBufSize))
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size that
	// is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}

	switch {
	case oldAvail < threshold && newAvail >= threshold:
		// Grew from below the threshold to at or above it.
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		// Shrank from at or above the threshold to below it.
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
// It records v in the endpoint's TupleOnly port flag.
func (e *endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
// It records v in the endpoint's LoadBalanced port flag.
func (e *endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
1742 func (e *endpoint) OnKeepAliveSet(bool) { 1743 e.LockUser() 1744 e.resetKeepaliveTimer(true /* receivedData */) 1745 e.UnlockUser() 1746 } 1747 1748 // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. 1749 func (e *endpoint) OnDelayOptionSet(v bool) { 1750 if !v { 1751 e.LockUser() 1752 defer e.UnlockUser() 1753 // Handle delayed data. 1754 if e.EndpointState().connected() { 1755 e.sendData(nil /* next */) 1756 } 1757 } 1758 } 1759 1760 // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. 1761 func (e *endpoint) OnCorkOptionSet(v bool) { 1762 if !v { 1763 e.LockUser() 1764 defer e.UnlockUser() 1765 // Handle the corked data. 1766 if e.EndpointState().connected() { 1767 e.sendData(nil /* next */) 1768 } 1769 } 1770 } 1771 1772 func (e *endpoint) getSendBufferSize() int { 1773 return int(e.ops.GetSendBufferSize()) 1774 } 1775 1776 // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize. 1777 func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) { 1778 e.LockUser() 1779 1780 sendNonZeroWindowUpdate := false 1781 e.rcvQueueMu.Lock() 1782 1783 // Make sure the receive buffer size allows us to send a 1784 // non-zero window size. 1785 scale := uint8(0) 1786 if e.rcv != nil { 1787 scale = e.rcv.RcvWndScale 1788 } 1789 if rcvBufSz>>scale == 0 { 1790 rcvBufSz = 1 << scale 1791 } 1792 1793 availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz))) 1794 availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz))) 1795 e.RcvAutoParams.Disabled = true 1796 1797 // Immediately send an ACK to uncork the sender silly window 1798 // syndrome prevetion, when our available space grows above aMSS 1799 // or half receive buffer, whichever smaller. 
1800 if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above { 1801 sendNonZeroWindowUpdate = true 1802 } 1803 1804 e.rcvQueueMu.Unlock() 1805 1806 postSet = func() { 1807 e.LockUser() 1808 defer e.UnlockUser() 1809 if e.EndpointState().connected() && sendNonZeroWindowUpdate { 1810 e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu 1811 } 1812 1813 } 1814 e.UnlockUser() 1815 return rcvBufSz, postSet 1816 } 1817 1818 // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. 1819 func (e *endpoint) OnSetSendBufferSize(sz int64) int64 { 1820 e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1) 1821 return sz 1822 } 1823 1824 // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. 1825 func (e *endpoint) WakeupWriters() { 1826 e.LockUser() 1827 defer e.UnlockUser() 1828 1829 sendBufferSize := e.getSendBufferSize() 1830 e.sndQueueInfo.sndQueueMu.Lock() 1831 notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1 1832 e.sndQueueInfo.sndQueueMu.Unlock() 1833 1834 if notify { 1835 e.waiterQueue.Notify(waiter.WritableEvents) 1836 } 1837 } 1838 1839 // SetSockOptInt sets a socket option. 1840 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { 1841 // Lower 2 bits represents ECN bits. RFC 3168, section 23.1 1842 const inetECNMask = 3 1843 1844 switch opt { 1845 case tcpip.KeepaliveCountOption: 1846 e.LockUser() 1847 e.keepalive.Lock() 1848 e.keepalive.count = v 1849 e.keepalive.Unlock() 1850 e.resetKeepaliveTimer(true /* receivedData */) 1851 e.UnlockUser() 1852 1853 case tcpip.IPv4TOSOption: 1854 e.LockUser() 1855 // TODO(gvisor.dev/issue/995): ECN is not currently supported, 1856 // ignore the bits for now. 
1857 e.sendTOS = uint8(v) & ^uint8(inetECNMask) 1858 e.UnlockUser() 1859 1860 case tcpip.IPv6TrafficClassOption: 1861 e.LockUser() 1862 // TODO(gvisor.dev/issue/995): ECN is not currently supported, 1863 // ignore the bits for now. 1864 e.sendTOS = uint8(v) & ^uint8(inetECNMask) 1865 e.UnlockUser() 1866 1867 case tcpip.MaxSegOption: 1868 userMSS := v 1869 if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { 1870 return &tcpip.ErrInvalidOptionValue{} 1871 } 1872 e.LockUser() 1873 e.userMSS = uint16(userMSS) 1874 e.UnlockUser() 1875 1876 case tcpip.MTUDiscoverOption: 1877 // Return not supported if attempting to set this option to 1878 // anything other than path MTU discovery disabled. 1879 if v != tcpip.PMTUDiscoveryDont { 1880 return &tcpip.ErrNotSupported{} 1881 } 1882 1883 case tcpip.IPv4TTLOption: 1884 e.LockUser() 1885 e.ipv4TTL = uint8(v) 1886 e.UnlockUser() 1887 1888 case tcpip.IPv6HopLimitOption: 1889 e.LockUser() 1890 e.ipv6HopLimit = int16(v) 1891 e.UnlockUser() 1892 1893 case tcpip.TCPSynCountOption: 1894 if v < 1 || v > 255 { 1895 return &tcpip.ErrInvalidOptionValue{} 1896 } 1897 e.LockUser() 1898 e.maxSynRetries = uint8(v) 1899 e.UnlockUser() 1900 1901 case tcpip.TCPWindowClampOption: 1902 if v == 0 { 1903 e.LockUser() 1904 switch e.EndpointState() { 1905 case StateClose, StateInitial: 1906 e.windowClamp = 0 1907 e.UnlockUser() 1908 return nil 1909 default: 1910 e.UnlockUser() 1911 return &tcpip.ErrInvalidOptionValue{} 1912 } 1913 } 1914 var rs tcpip.TCPReceiveBufferSizeRangeOption 1915 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 1916 if v < rs.Min/2 { 1917 v = rs.Min / 2 1918 } 1919 } 1920 e.LockUser() 1921 e.windowClamp = uint32(v) 1922 e.UnlockUser() 1923 } 1924 return nil 1925 } 1926 1927 func (e *endpoint) HasNIC(id int32) bool { 1928 return id == 0 || e.stack.HasNIC(tcpip.NICID(id)) 1929 } 1930 1931 // SetSockOpt sets a socket option. 
1932 func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { 1933 switch v := opt.(type) { 1934 case *tcpip.KeepaliveIdleOption: 1935 e.LockUser() 1936 e.keepalive.Lock() 1937 e.keepalive.idle = time.Duration(*v) 1938 e.keepalive.Unlock() 1939 e.resetKeepaliveTimer(true /* receivedData */) 1940 e.UnlockUser() 1941 1942 case *tcpip.KeepaliveIntervalOption: 1943 e.LockUser() 1944 e.keepalive.Lock() 1945 e.keepalive.interval = time.Duration(*v) 1946 e.keepalive.Unlock() 1947 e.resetKeepaliveTimer(true /* receivedData */) 1948 e.UnlockUser() 1949 1950 case *tcpip.TCPUserTimeoutOption: 1951 e.LockUser() 1952 e.userTimeout = time.Duration(*v) 1953 e.UnlockUser() 1954 1955 case *tcpip.CongestionControlOption: 1956 // Query the available cc algorithms in the stack and 1957 // validate that the specified algorithm is actually 1958 // supported in the stack. 1959 var avail tcpip.TCPAvailableCongestionControlOption 1960 if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil { 1961 return err 1962 } 1963 availCC := strings.Split(string(avail), " ") 1964 for _, cc := range availCC { 1965 if *v == tcpip.CongestionControlOption(cc) { 1966 e.LockUser() 1967 state := e.EndpointState() 1968 e.cc = *v 1969 switch state { 1970 case StateEstablished: 1971 if e.EndpointState() == state { 1972 e.snd.cc = e.snd.initCongestionControl(e.cc) 1973 } 1974 } 1975 e.UnlockUser() 1976 return nil 1977 } 1978 } 1979 1980 // Linux returns ENOENT when an invalid congestion 1981 // control algorithm is specified. 1982 return &tcpip.ErrNoSuchFile{} 1983 1984 case *tcpip.TCPLingerTimeoutOption: 1985 e.LockUser() 1986 1987 switch { 1988 case *v < 0: 1989 // Same as effectively disabling TCPLinger timeout. 1990 *v = -1 1991 case *v == 0: 1992 // Same as the stack default. 
1993 var stackLingerTimeout tcpip.TCPLingerTimeoutOption 1994 if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil { 1995 panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err)) 1996 } 1997 *v = stackLingerTimeout 1998 case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout): 1999 // Cap it to Stack's default TCP_LINGER2 timeout. 2000 *v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) 2001 default: 2002 } 2003 2004 e.tcpLingerTimeout = time.Duration(*v) 2005 e.UnlockUser() 2006 2007 case *tcpip.TCPDeferAcceptOption: 2008 e.LockUser() 2009 if time.Duration(*v) > MaxRTO { 2010 *v = tcpip.TCPDeferAcceptOption(MaxRTO) 2011 } 2012 e.deferAccept = time.Duration(*v) 2013 e.UnlockUser() 2014 2015 case *tcpip.SocketDetachFilterOption: 2016 return nil 2017 2018 default: 2019 return nil 2020 } 2021 return nil 2022 } 2023 2024 // readyReceiveSize returns the number of bytes ready to be received. 2025 func (e *endpoint) readyReceiveSize() (int, tcpip.Error) { 2026 e.LockUser() 2027 defer e.UnlockUser() 2028 2029 // The endpoint cannot be in listen state. 2030 if e.EndpointState() == StateListen { 2031 return 0, &tcpip.ErrInvalidEndpointState{} 2032 } 2033 2034 e.rcvQueueMu.Lock() 2035 defer e.rcvQueueMu.Unlock() 2036 2037 return e.RcvBufUsed, nil 2038 } 2039 2040 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 2041 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { 2042 switch opt { 2043 case tcpip.KeepaliveCountOption: 2044 e.keepalive.Lock() 2045 v := e.keepalive.count 2046 e.keepalive.Unlock() 2047 return v, nil 2048 2049 case tcpip.IPv4TOSOption: 2050 e.LockUser() 2051 v := int(e.sendTOS) 2052 e.UnlockUser() 2053 return v, nil 2054 2055 case tcpip.IPv6TrafficClassOption: 2056 e.LockUser() 2057 v := int(e.sendTOS) 2058 e.UnlockUser() 2059 return v, nil 2060 2061 case tcpip.MaxSegOption: 2062 // This is just stubbed out. 
Linux never returns the user_mss 2063 // value as it either returns the defaultMSS or returns the 2064 // actual current MSS. Netstack just returns the defaultMSS 2065 // always for now. 2066 v := header.TCPDefaultMSS 2067 return v, nil 2068 2069 case tcpip.MTUDiscoverOption: 2070 // Always return the path MTU discovery disabled setting since 2071 // it's the only one supported. 2072 return tcpip.PMTUDiscoveryDont, nil 2073 2074 case tcpip.ReceiveQueueSizeOption: 2075 return e.readyReceiveSize() 2076 2077 case tcpip.IPv4TTLOption: 2078 e.LockUser() 2079 v := int(e.ipv4TTL) 2080 e.UnlockUser() 2081 return v, nil 2082 2083 case tcpip.IPv6HopLimitOption: 2084 e.LockUser() 2085 v := int(e.ipv6HopLimit) 2086 e.UnlockUser() 2087 return v, nil 2088 2089 case tcpip.TCPSynCountOption: 2090 e.LockUser() 2091 v := int(e.maxSynRetries) 2092 e.UnlockUser() 2093 return v, nil 2094 2095 case tcpip.TCPWindowClampOption: 2096 e.LockUser() 2097 v := int(e.windowClamp) 2098 e.UnlockUser() 2099 return v, nil 2100 2101 case tcpip.MulticastTTLOption: 2102 return 1, nil 2103 2104 default: 2105 return -1, &tcpip.ErrUnknownProtocolOption{} 2106 } 2107 } 2108 2109 func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption { 2110 info := tcpip.TCPInfoOption{} 2111 e.LockUser() 2112 if state := e.EndpointState(); state.internal() { 2113 info.State = tcpip.EndpointState(StateClose) 2114 } else { 2115 info.State = tcpip.EndpointState(state) 2116 } 2117 snd := e.snd 2118 if snd != nil { 2119 // We do not calculate RTT before sending the data packets. If 2120 // the connection did not send and receive data, then RTT will 2121 // be zero. 
2122 snd.rtt.Lock() 2123 info.RTT = snd.rtt.TCPRTTState.SRTT 2124 info.RTTVar = snd.rtt.TCPRTTState.RTTVar 2125 snd.rtt.Unlock() 2126 2127 info.RTO = snd.RTO 2128 info.CcState = snd.state 2129 info.SndSsthresh = uint32(snd.Ssthresh) 2130 info.SndCwnd = uint32(snd.SndCwnd) 2131 info.ReorderSeen = snd.rc.Reord 2132 } 2133 e.UnlockUser() 2134 return info 2135 } 2136 2137 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 2138 func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 2139 switch o := opt.(type) { 2140 case *tcpip.TCPInfoOption: 2141 *o = e.getTCPInfo() 2142 2143 case *tcpip.KeepaliveIdleOption: 2144 e.keepalive.Lock() 2145 *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) 2146 e.keepalive.Unlock() 2147 2148 case *tcpip.KeepaliveIntervalOption: 2149 e.keepalive.Lock() 2150 *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) 2151 e.keepalive.Unlock() 2152 2153 case *tcpip.TCPUserTimeoutOption: 2154 e.LockUser() 2155 *o = tcpip.TCPUserTimeoutOption(e.userTimeout) 2156 e.UnlockUser() 2157 2158 case *tcpip.CongestionControlOption: 2159 e.LockUser() 2160 *o = e.cc 2161 e.UnlockUser() 2162 2163 case *tcpip.TCPLingerTimeoutOption: 2164 e.LockUser() 2165 *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) 2166 e.UnlockUser() 2167 2168 case *tcpip.TCPDeferAcceptOption: 2169 e.LockUser() 2170 *o = tcpip.TCPDeferAcceptOption(e.deferAccept) 2171 e.UnlockUser() 2172 2173 case *tcpip.OriginalDestinationOption: 2174 e.LockUser() 2175 ipt := e.stack.IPTables() 2176 addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber) 2177 e.UnlockUser() 2178 if err != nil { 2179 return err 2180 } 2181 *o = tcpip.OriginalDestinationOption{ 2182 Addr: addr, 2183 Port: port, 2184 } 2185 2186 default: 2187 return &tcpip.ErrUnknownProtocolOption{} 2188 } 2189 return nil 2190 } 2191 2192 // checkV4MappedLocked determines the effective network protocol and converts 2193 // addr to its canonical form. 
// +checklocks:e.mu
func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
	// The endpoint's v6-only setting is passed along with addr to
	// AddrNetProtoLocked, which produces the unwrapped address and protocol.
	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
	if err != nil {
		return tcpip.FullAddress{}, 0, err
	}
	return unwrapped, netProto, nil
}

// Disconnect implements tcpip.Endpoint.Disconnect.
//
// It always returns ErrNotSupported.
func (*endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
//
// It takes the endpoint lock and calls connect with the handshake enabled.
// When connect returns an error whose IgnoreStats method reports false, all
// waiters are woken and the failed-connection counters of both the stack and
// the endpoint are incremented. The error from connect is returned unchanged.
func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	err := e.connect(addr, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// registerEndpoint registers the endpoint with the provided address.
//
// If the endpoint already has a local port, it is registered under its
// current ID using the bound port flags and bound device. Otherwise an
// ephemeral port is picked, reserved and registered, and the endpoint's ID
// and bound* fields are updated to describe the selection.
//
// addr is used as the destination of the port reservation, netProto is the
// network protocol to register for, and nicID is used when looking up an
// existing endpoint that may be reused from TIME-WAIT.
//
// +checklocks:e.mu
func (e *endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error {
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		// Calculate a port offset based on the destination IP/port and
		// src IP to ensure that for a given tuple (srcIP, destIP,
		// destPort) the offset used as a starting point is the same to
		// ensure that we can cycle through the port space effectively.
		portBuf := make([]byte, 2)
		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)

		// The hash is seeded with the protocol's port offset secret and then
		// fed the local address, remote address and remote port.
		h := jenkins.Sum32(e.protocol.portOffsetSecret)
		for _, s := range [][]byte{
			e.ID.LocalAddress.AsSlice(),
			e.ID.RemoteAddress.AsSlice(),
			portBuf,
		} {
			// Per io.Writer.Write:
			//
			// Write must return a non-nil error if it returns n < len(p).
			if _, err := h.Write(s); err != nil {
				panic(err)
			}
		}
		portOffset := h.Sum32()

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// TIME-WAIT reuse is allowed when enabled globally, or, in
		// loopback-only mode, when both the local and the remote address are
		// loopback addresses.
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		// The callback is invoked with candidate ports. It returns
		// (true, nil) once a port has been reserved and registered,
		// (false, nil) to skip the candidate, and (false, err) to report a
		// registration failure other than ErrPortInUse.
		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
				// Only an in-use port is worth a TIME-WAIT reuse attempt,
				// and only when reuse is permitted.
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID)
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				// NOTE(review): this assertion assumes every endpoint found
				// for ProtocolNumber is a *endpoint — confirm.
				tcpEP := transEP.(*endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.transitionToStateCloseLocked()
				tcpEP.drainClosingSegmentQueue()
				tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
				tcpEP.UnlockUser()
				// Now try to reserve again; if it fails then we skip this port.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed, so give back the reservation made
				// above before deciding whether to try another port.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}
	return nil
}

// connect connects the endpoint to its peer.
// +checklocks:e.mu
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error {
	// Remember the address exactly as the caller supplied it, before it is
	// replaced by its unwrapped form below.
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrHostUnreachable{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		// Report the recorded hard error if there is one; otherwise the
		// connection is reported as aborted.
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	// The state is switched to StateConnecting before registration and put
	// back to its previous value if registration fails.
	oldState := e.EndpointState()
	e.setEndpointState(StateConnecting)
	if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil {
		e.setEndpointState(oldState)
		// A port conflict is reported to the caller as a bad local address.
		if _, ok := err.(*tcpip.ErrPortInUse); ok {
			return &tcpip.ErrBadLocalAddress{}
		}
		return err
	}

	e.isRegistered = true
	// NOTE(review): Acquire presumably takes the reference that e.route
	// keeps once the deferred Release above has run — confirm.
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto}
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		// Stamp every segment in the segment queue and in the sender's
		// write list with the endpoint's current ID, asserting the send
		// waker once per segment.
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.ep.AssertLockHeld(e)
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
		return &tcpip.ErrConnectStarted{}
	}

	// Start a new handshake.
	h := e.newHandshake()
	e.setEndpointState(StateSynSent)
	h.start()
	e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()

	// ErrConnectStarted is the result on both the restore and the handshake
	// paths.
	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
//
// On an endpoint that is still connecting, the handshake is failed with
// ErrConnectionReset, waiters are notified and nil is returned. Otherwise the
// work is delegated to shutdownLocked.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState().connecting() {
		// When calling shutdown(2) on a connecting socket, the endpoint must
		// enter the error state. But this logic cannot belong to the shutdownLocked
		// method because that method is called during a close(2) (and closing a
		// connecting socket is not an error).
		e.handshakeFailed(&tcpip.ErrConnectionReset{})
		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		return nil
	}

	return e.shutdownLocked(flags)
}

// shutdownLocked adds flags to the endpoint's accumulated shutdown flags and
// acts on the accumulated set according to the endpoint state.
//
// For connected endpoints the read and/or write side is closed. For listening
// endpoints a read shutdown closes the pending acceptable connections. In any
// other state ErrNotConnected is returned.
//
// +checklocks:e.mu
func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	// Flags accumulate across calls; the checks below use the accumulated
	// value rather than just the flags passed to this call.
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that may be waiting for the stream to become
			// readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, write end won't close and
				// we end up with a half-closed connection
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that may be waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *endpoint) Listen(backlog int) tcpip.Error {
	err := e.listen(backlog)
	if err != nil {
		// Failures are counted on both the stack and the endpoint unless the
		// error asks to be left out of the statistics.
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// listen does the work of Listen while holding the endpoint lock.
//
// On an endpoint that is already listening and not closed, only the backlog
// is adjusted and the shutdown state is cleared. An endpoint in the initial
// state is first bound to an unspecified address. Any state other than bound
// at that point yields ErrInvalidEndpointState.
func (e *endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
		}

		// Clear the shutdown flags and reopen the receive side.
		e.shutdownFlags = 0
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
	}
	// An already non-zero capacity is left as it is.
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// peerAddr, if non-nil, will contain the peer address of the returned
// endpoint.
//
// It returns ErrInvalidEndpointState if the endpoint is not listening or its
// receive side has been closed, and ErrWouldBlock if no connection is ready
// to be accepted.
func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueMu.Lock()
	rcvClosed := e.RcvClosed
	e.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *endpoint
	e.acceptMu.Lock()
	if element := e.acceptQueue.endpoints.Front(); element != nil {
		n = e.acceptQueue.endpoints.Remove(element).(*endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
//
// It takes the endpoint lock and delegates to bindLocked.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

// bindLocked reserves a local port for the endpoint and moves it to
// StateBound.
//
// It returns ErrAlreadyBound if the endpoint is not in the initial state and
// ErrBadLocalAddress if addr names an address that CheckLocalAddress does not
// resolve to a NIC.
//
// +checklocks:e.mu
func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	// BindAddr records the address as supplied by the caller, before it is
	// replaced by its unwrapped form.
	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	// The callback vets a candidate port: it is acceptable only if an
	// endpoint with the resulting ID could be registered.
	port, err := e.stack.ReservePort(e.stack.Rand(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	// Record what was bound so that later calls can refer to it.
	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
2812 func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { 2813 e.LockUser() 2814 defer e.UnlockUser() 2815 2816 return tcpip.FullAddress{ 2817 Addr: e.TransportEndpointInfo.ID.LocalAddress, 2818 Port: e.TransportEndpointInfo.ID.LocalPort, 2819 NIC: e.boundNICID, 2820 }, nil 2821 } 2822 2823 // GetRemoteAddress returns the address to which the endpoint is connected. 2824 func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { 2825 e.LockUser() 2826 defer e.UnlockUser() 2827 2828 if !e.EndpointState().connected() { 2829 return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} 2830 } 2831 2832 return e.getRemoteAddress(), nil 2833 } 2834 2835 func (e *endpoint) getRemoteAddress() tcpip.FullAddress { 2836 return tcpip.FullAddress{ 2837 Addr: e.TransportEndpointInfo.ID.RemoteAddress, 2838 Port: e.TransportEndpointInfo.ID.RemotePort, 2839 NIC: e.boundNICID, 2840 } 2841 } 2842 2843 func (*endpoint) HandlePacket(stack.TransportEndpointID, stack.PacketBufferPtr) { 2844 // TCP HandlePacket is not required anymore as inbound packets first 2845 // land at the Dispatcher which then can either deliver using the 2846 // worker go routine or directly do the invoke the tcp processing inline 2847 // based on the state of the endpoint. 2848 } 2849 2850 func (e *endpoint) enqueueSegment(s *segment) bool { 2851 // Send packet to worker goroutine. 2852 if !e.segmentQueue.enqueue(s) { 2853 // The queue is full, so we drop the segment. 2854 e.stack.Stats().DroppedPackets.Increment() 2855 e.stats.ReceiveErrors.SegmentQueueDropped.Increment() 2856 return false 2857 } 2858 return true 2859 } 2860 2861 func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt stack.PacketBufferPtr) { 2862 // Update last error first. 
2863 e.lastErrorMu.Lock() 2864 e.lastError = err 2865 e.lastErrorMu.Unlock() 2866 2867 var recvErr bool 2868 switch pkt.NetworkProtocolNumber { 2869 case header.IPv4ProtocolNumber: 2870 recvErr = e.SocketOptions().GetIPv4RecvError() 2871 case header.IPv6ProtocolNumber: 2872 recvErr = e.SocketOptions().GetIPv6RecvError() 2873 default: 2874 panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber)) 2875 } 2876 2877 if recvErr { 2878 e.SocketOptions().QueueErr(&tcpip.SockError{ 2879 Err: err, 2880 Cause: transErr, 2881 // Linux passes the payload with the TCP header. We don't know if the TCP 2882 // header even exists, it may not for fragmented packets. 2883 Payload: pkt.Data().AsRange().ToView(), 2884 Dst: tcpip.FullAddress{ 2885 NIC: pkt.NICID, 2886 Addr: e.TransportEndpointInfo.ID.RemoteAddress, 2887 Port: e.TransportEndpointInfo.ID.RemotePort, 2888 }, 2889 Offender: tcpip.FullAddress{ 2890 NIC: pkt.NICID, 2891 Addr: e.TransportEndpointInfo.ID.LocalAddress, 2892 Port: e.TransportEndpointInfo.ID.LocalPort, 2893 }, 2894 NetProto: pkt.NetworkProtocolNumber, 2895 }) 2896 } 2897 2898 if e.EndpointState().connecting() { 2899 e.mu.Lock() 2900 if lEP := e.h.listenEP; lEP != nil { 2901 // Remove from listening endpoints pending list. 2902 lEP.acceptMu.Lock() 2903 delete(lEP.acceptQueue.pendingEndpoints, e) 2904 lEP.acceptMu.Unlock() 2905 lEP.stats.FailedConnectionAttempts.Increment() 2906 } 2907 e.stack.Stats().TCP.FailedConnectionAttempts.Increment() 2908 e.cleanupLocked() 2909 e.hardError = err 2910 e.setEndpointState(StateError) 2911 e.mu.Unlock() 2912 e.drainClosingSegmentQueue() 2913 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 2914 } 2915 } 2916 2917 // HandleError implements stack.TransportEndpoint. 
2918 func (e *endpoint) HandleError(transErr stack.TransportError, pkt stack.PacketBufferPtr) { 2919 handlePacketTooBig := func(mtu uint32) { 2920 e.sndQueueInfo.sndQueueMu.Lock() 2921 update := false 2922 if v := int(mtu); v < e.sndQueueInfo.SndMTU { 2923 e.sndQueueInfo.SndMTU = v 2924 update = true 2925 } 2926 newMTU := e.sndQueueInfo.SndMTU 2927 e.sndQueueInfo.sndQueueMu.Unlock() 2928 if update { 2929 e.mu.Lock() 2930 defer e.mu.Unlock() 2931 if e.snd != nil { 2932 e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu 2933 } 2934 } 2935 } 2936 2937 // TODO(gvisor.dev/issues/5270): Handle all transport errors. 2938 switch transErr.Kind() { 2939 case stack.PacketTooBigTransportError: 2940 handlePacketTooBig(transErr.Info()) 2941 case stack.DestinationHostUnreachableTransportError: 2942 e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt) 2943 case stack.DestinationNetworkUnreachableTransportError: 2944 e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) 2945 case stack.DestinationPortUnreachableTransportError: 2946 e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) 2947 case stack.DestinationProtoUnreachableTransportError: 2948 e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt) 2949 case stack.SourceRouteFailedTransportError: 2950 e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt) 2951 case stack.SourceHostIsolatedTransportError: 2952 e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt) 2953 case stack.DestinationHostDownTransportError: 2954 e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt) 2955 } 2956 } 2957 2958 // updateSndBufferUsage is called by the protocol goroutine when room opens up 2959 // in the send buffer. The number of newly available bytes is v. 
2960 func (e *endpoint) updateSndBufferUsage(v int) { 2961 sendBufferSize := e.getSendBufferSize() 2962 e.sndQueueInfo.sndQueueMu.Lock() 2963 notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1 2964 e.sndQueueInfo.SndBufUsed -= v 2965 2966 // Get the new send buffer size with auto tuning, but do not set it 2967 // unless we decide to notify the writers. 2968 newSndBufSz := e.computeTCPSendBufferSize() 2969 2970 // We only notify when there is half the sendBufferSize available after 2971 // a full buffer event occurs. This ensures that we don't wake up 2972 // writers to queue just 1-2 segments and go back to sleep. 2973 notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1 2974 e.sndQueueInfo.sndQueueMu.Unlock() 2975 2976 if notify { 2977 // Set the new send buffer size calculated from auto tuning. 2978 e.ops.SetSendBufferSize(newSndBufSz, false /* notify */) 2979 e.waiterQueue.Notify(waiter.WritableEvents) 2980 } 2981 } 2982 2983 // readyToRead is called by the protocol goroutine when a new segment is ready 2984 // to be read, or when the connection is closed for receiving (in which case 2985 // s will be nil). 2986 // 2987 // +checklocks:e.mu 2988 func (e *endpoint) readyToRead(s *segment) { 2989 e.rcvQueueMu.Lock() 2990 if s != nil { 2991 e.RcvBufUsed += s.payloadSize() 2992 s.IncRef() 2993 e.rcvQueue.PushBack(s) 2994 } else { 2995 e.RcvClosed = true 2996 } 2997 e.rcvQueueMu.Unlock() 2998 e.waiterQueue.Notify(waiter.ReadableEvents) 2999 } 3000 3001 // receiveBufferAvailableLocked calculates how many bytes are still available 3002 // in the receive buffer. 3003 // +checklocks:e.rcvQueueMu 3004 func (e *endpoint) receiveBufferAvailableLocked(rcvBufSize int) int { 3005 // We may use more bytes than the buffer size when the receive buffer 3006 // shrinks. 
3007 memUsed := e.receiveMemUsed() 3008 if memUsed >= rcvBufSize { 3009 return 0 3010 } 3011 3012 return rcvBufSize - memUsed 3013 } 3014 3015 // receiveBufferAvailable calculates how many bytes are still available in the 3016 // receive buffer based on the actual memory used by all segments held in 3017 // receive buffer/pending and segment queue. 3018 func (e *endpoint) receiveBufferAvailable() int { 3019 e.rcvQueueMu.Lock() 3020 available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize())) 3021 e.rcvQueueMu.Unlock() 3022 return available 3023 } 3024 3025 // receiveBufferUsed returns the amount of in-use receive buffer. 3026 func (e *endpoint) receiveBufferUsed() int { 3027 e.rcvQueueMu.Lock() 3028 used := e.RcvBufUsed 3029 e.rcvQueueMu.Unlock() 3030 return used 3031 } 3032 3033 // receiveMemUsed returns the total memory in use by segments held by this 3034 // endpoint. 3035 func (e *endpoint) receiveMemUsed() int { 3036 return int(e.rcvMemUsed.Load()) 3037 } 3038 3039 // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. 3040 func (e *endpoint) updateReceiveMemUsed(delta int) { 3041 e.rcvMemUsed.Add(int32(delta)) 3042 } 3043 3044 // maxReceiveBufferSize returns the stack wide maximum receive buffer size for 3045 // an endpoint. 3046 func (e *endpoint) maxReceiveBufferSize() int { 3047 var rs tcpip.TCPReceiveBufferSizeRangeOption 3048 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { 3049 // As a fallback return the hardcoded max buffer size. 
3050 return MaxBufferSize 3051 } 3052 return rs.Max 3053 } 3054 3055 // directionState returns the close state of send and receive part of the endpoint 3056 func (e *endpoint) connDirectionState() connDirectionState { 3057 return connDirectionState(e.connectionDirectionState.Load()) 3058 } 3059 3060 // updateDirectionState updates the close state of send and receive part of the endpoint 3061 func (e *endpoint) updateConnDirectionState(state connDirectionState) connDirectionState { 3062 return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state))) 3063 } 3064 3065 // rcvWndScaleForHandshake computes the receive window scale to offer to the 3066 // peer when window scaling is enabled (true by default). If auto-tuning is 3067 // disabled then the window scaling factor is based on the size of the 3068 // receiveBuffer otherwise we use the max permissible receive buffer size to 3069 // compute the scale. 3070 func (e *endpoint) rcvWndScaleForHandshake() int { 3071 bufSizeForScale := e.ops.GetReceiveBufferSize() 3072 3073 e.rcvQueueMu.Lock() 3074 autoTuningDisabled := e.RcvAutoParams.Disabled 3075 e.rcvQueueMu.Unlock() 3076 if autoTuningDisabled { 3077 return FindWndScale(seqnum.Size(bufSizeForScale)) 3078 } 3079 3080 return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) 3081 } 3082 3083 // updateRecentTimestamp updates the recent timestamp using the algorithm 3084 // described in https://tools.ietf.org/html/rfc7323#section-4.3 3085 func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { 3086 if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { 3087 e.setRecentTimestamp(tsVal) 3088 } 3089 } 3090 3091 // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if 3092 // the SYN options indicate that timestamp option was negotiated. 
It also 3093 // initializes the recentTS with the value provided in synOpts.TSval. 3094 func (e *endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) { 3095 if synOpts.TS { 3096 e.SendTSOk = true 3097 e.setRecentTimestamp(synOpts.TSVal) 3098 } 3099 } 3100 3101 func (e *endpoint) tsVal(now tcpip.MonotonicTime) uint32 { 3102 return e.TSOffset.TSVal(now) 3103 } 3104 3105 func (e *endpoint) tsValNow() uint32 { 3106 return e.tsVal(e.stack.Clock().NowMonotonic()) 3107 } 3108 3109 func (e *endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { 3110 return e.TSOffset.Elapsed(now, tsEcr) 3111 } 3112 3113 // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint 3114 // if the SYN options indicate that the SACK option was negotiated and the TCP 3115 // stack is configured to enable TCP SACK option. 3116 func (e *endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) { 3117 var v tcpip.TCPSACKEnabled 3118 if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { 3119 // Stack doesn't support SACK. So just return. 3120 return 3121 } 3122 if bool(v) && synOpts.SACKPermitted { 3123 e.SACKPermitted = true 3124 e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) 3125 } 3126 } 3127 3128 // maxOptionSize return the maximum size of TCP options. 3129 func (e *endpoint) maxOptionSize() (size int) { 3130 var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock 3131 options := e.makeOptions(maxSackBlocks[:]) 3132 size = len(options) 3133 putOptions(options) 3134 3135 return size 3136 } 3137 3138 // completeStateLocked makes a full copy of the endpoint and returns it. This is 3139 // used before invoking the probe. 
3140 // 3141 // +checklocks:e.mu 3142 func (e *endpoint) completeStateLocked(s *stack.TCPEndpointState) { 3143 s.TCPEndpointStateInner = e.TCPEndpointStateInner 3144 s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID) 3145 s.SegTime = e.stack.Clock().NowMonotonic() 3146 s.Receiver = e.rcv.TCPReceiverState 3147 s.Sender = e.snd.TCPSenderState 3148 3149 sndBufSize := e.getSendBufferSize() 3150 // Copy the send buffer atomically. 3151 e.sndQueueInfo.sndQueueMu.Lock() 3152 e.sndQueueInfo.CloneState(&s.SndBufState) 3153 s.SndBufState.SndBufSize = sndBufSize 3154 e.sndQueueInfo.sndQueueMu.Unlock() 3155 3156 // Copy the receive buffer atomically. 3157 e.rcvQueueMu.Lock() 3158 s.RcvBufState = e.TCPRcvBufState 3159 e.rcvQueueMu.Unlock() 3160 3161 // Copy the endpoint TCP Option state. 3162 s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) 3163 copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) 3164 s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() 3165 3166 e.snd.rtt.Lock() 3167 s.Sender.RTTState = e.snd.rtt.TCPRTTState 3168 e.snd.rtt.Unlock() 3169 3170 if cubic, ok := e.snd.cc.(*cubicState); ok { 3171 s.Sender.Cubic = cubic.TCPCubicState 3172 s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T) 3173 } 3174 3175 s.Sender.RACKState = e.snd.rc.TCPRACKState 3176 s.Sender.RetransmitTS = e.snd.retransmitTS 3177 s.Sender.SpuriousRecovery = e.snd.spuriousRecovery 3178 } 3179 3180 func (e *endpoint) initHostGSO() { 3181 switch e.route.NetProto() { 3182 case header.IPv4ProtocolNumber: 3183 e.gso.Type = stack.GSOTCPv4 3184 e.gso.L3HdrLen = header.IPv4MinimumSize 3185 case header.IPv6ProtocolNumber: 3186 e.gso.Type = stack.GSOTCPv6 3187 e.gso.L3HdrLen = header.IPv6MinimumSize 3188 default: 3189 panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto)) 3190 } 3191 e.gso.NeedsCsum = true 3192 e.gso.CsumOffset = header.TCPChecksumOffset 3193 e.gso.MaxSize = e.route.GSOMaxSize() 3194 } 3195 3196 func (e *endpoint) initGSO() 
{ 3197 if e.route.HasHostGSOCapability() { 3198 e.initHostGSO() 3199 } else if e.route.HasGvisorGSOCapability() { 3200 e.gso = stack.GSO{ 3201 MaxSize: e.route.GSOMaxSize(), 3202 Type: stack.GSOGvisor, 3203 NeedsCsum: false, 3204 } 3205 } 3206 } 3207 3208 // State implements tcpip.Endpoint.State. It exports the endpoint's protocol 3209 // state for diagnostics. 3210 func (e *endpoint) State() uint32 { 3211 return uint32(e.EndpointState()) 3212 } 3213 3214 // Info returns a copy of the endpoint info. 3215 func (e *endpoint) Info() tcpip.EndpointInfo { 3216 e.LockUser() 3217 // Make a copy of the endpoint info. 3218 ret := e.TransportEndpointInfo 3219 e.UnlockUser() 3220 return &ret 3221 } 3222 3223 // Stats returns a pointer to the endpoint stats. 3224 func (e *endpoint) Stats() tcpip.EndpointStats { 3225 return &e.stats 3226 } 3227 3228 // Wait implements stack.TransportEndpoint.Wait. 3229 func (e *endpoint) Wait() { 3230 waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp) 3231 e.waiterQueue.EventRegister(&waitEntry) 3232 defer e.waiterQueue.EventUnregister(&waitEntry) 3233 switch e.EndpointState() { 3234 case StateClose, StateError: 3235 return 3236 } 3237 <-notifyCh 3238 } 3239 3240 // SocketOptions implements tcpip.Endpoint.SocketOptions. 3241 func (e *endpoint) SocketOptions() *tcpip.SocketOptions { 3242 return &e.ops 3243 } 3244 3245 // GetTCPSendBufferLimits is used to get send buffer size limits for TCP. 3246 func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption { 3247 var ss tcpip.TCPSendBufferSizeRangeOption 3248 if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { 3249 panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) 3250 } 3251 3252 return tcpip.SendBufferSizeOption{ 3253 Min: ss.Min, 3254 Default: ss.Default, 3255 Max: ss.Max, 3256 } 3257 } 3258 3259 // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now. 
3260 func (e *endpoint) allowOutOfWindowAck() bool { 3261 now := e.stack.Clock().NowMonotonic() 3262 3263 if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) { 3264 var limit stack.TCPInvalidRateLimitOption 3265 if err := e.stack.Option(&limit); err != nil { 3266 panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) 3267 } 3268 if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { 3269 return false 3270 } 3271 } 3272 3273 e.lastOutOfWindowAckTime = now 3274 return true 3275 } 3276 3277 // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP. 3278 func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { 3279 var ss tcpip.TCPReceiveBufferSizeRangeOption 3280 if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { 3281 panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) 3282 } 3283 3284 return tcpip.ReceiveBufferSizeOption{ 3285 Min: ss.Min, 3286 Default: ss.Default, 3287 Max: ss.Max, 3288 } 3289 } 3290 3291 // computeTCPSendBufferSize implements auto tuning of send buffer size and 3292 // returns the new send buffer size. 3293 func (e *endpoint) computeTCPSendBufferSize() int64 { 3294 curSndBufSz := int64(e.getSendBufferSize()) 3295 3296 // Auto tuning is disabled when the user explicitly sets the send 3297 // buffer size with SO_SNDBUF option. 3298 if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 { 3299 return curSndBufSz 3300 } 3301 3302 const packetOverheadFactor = 2 3303 curMSS := e.snd.MaxPayloadSize 3304 numSeg := InitialCwnd 3305 if numSeg < e.snd.SndCwnd { 3306 numSeg = e.snd.SndCwnd 3307 } 3308 3309 // SndCwnd indicates the number of segments that can be sent. This means 3310 // that the sender can send upto #SndCwnd segments and the send buffer 3311 // size should be set to SndCwnd*MSS to accommodate sending of all the 3312 // segments. 
3313 newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor) 3314 if newSndBufSz < curSndBufSz { 3315 return curSndBufSz 3316 } 3317 if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz { 3318 newSndBufSz = int64(ss.Max) 3319 } 3320 3321 return newSndBufSz 3322 }