github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/transport/tcp/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"container/heap"
    19  	"fmt"
    20  	"io"
    21  	"math"
    22  	"runtime"
    23  	"strings"
    24  	"time"
    25  
    26  	"github.com/sagernet/gvisor/pkg/atomicbitops"
    27  	"github.com/sagernet/gvisor/pkg/buffer"
    28  	"github.com/sagernet/gvisor/pkg/sleep"
    29  	"github.com/sagernet/gvisor/pkg/sync"
    30  	"github.com/sagernet/gvisor/pkg/tcpip"
    31  	"github.com/sagernet/gvisor/pkg/tcpip/header"
    32  	"github.com/sagernet/gvisor/pkg/tcpip/ports"
    33  	"github.com/sagernet/gvisor/pkg/tcpip/seqnum"
    34  	"github.com/sagernet/gvisor/pkg/tcpip/stack"
    35  	"github.com/sagernet/gvisor/pkg/waiter"
    36  )
    37  
// EndpointState represents the state of a TCP endpoint.
type EndpointState tcpip.EndpointState

// Endpoint states. Note that these are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
const (
	// The iota value 0 is discarded, so the first named state
	// (StateEstablished) has the value 1.
	_ EndpointState = iota
	// TCP protocol states in sync with the definitions in
	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing

	// Endpoint states internal to netstack.
	StateInitial
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError
)
    66  
// Receive/send buffer sizing constants.
const (
	// rcvAdvWndScale is used to split the available socket buffer into
	// application buffer and the window to be advertised to the peer. This is
	// currently hard coded to split the available space equally.
	rcvAdvWndScale = 1

	// SegOverheadFactor is used to multiply the value provided by the
	// user on a SetSockOpt for setting the socket send/receive buffer sizes.
	SegOverheadFactor = 2
)
    77  
// connDirectionState is a bit set describing which directions of a
// connection have been closed.
type connDirectionState uint32

// Connection direction states used for directionState checks in the endpoint
// struct to detect a half-closed connection and deliver POLLRDHUP.
const (
	// connDirectionStateOpen has no bits set: neither direction is closed.
	connDirectionStateOpen      connDirectionState = 0
	connDirectionStateRcvClosed connDirectionState = 1
	connDirectionStateSndClosed connDirectionState = 2
	// connDirectionStateAll is the bitwise OR of all of the states above.
	connDirectionStateAll       connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed
)
    88  
    89  // connected returns true when s is one of the states representing an
    90  // endpoint connected to a peer.
    91  func (s EndpointState) connected() bool {
    92  	switch s {
    93  	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
    94  		return true
    95  	default:
    96  		return false
    97  	}
    98  }
    99  
   100  // connecting returns true when s is one of the states representing a
   101  // connection in progress, but not yet fully established.
   102  func (s EndpointState) connecting() bool {
   103  	switch s {
   104  	case StateConnecting, StateSynSent, StateSynRecv:
   105  		return true
   106  	default:
   107  		return false
   108  	}
   109  }
   110  
   111  // internal returns true when the state is netstack internal.
   112  func (s EndpointState) internal() bool {
   113  	switch s {
   114  	case StateInitial, StateBound, StateConnecting, StateError:
   115  		return true
   116  	default:
   117  		return false
   118  	}
   119  }
   120  
   121  // handshake returns true when s is one of the states representing an endpoint
   122  // in the middle of a TCP handshake.
   123  func (s EndpointState) handshake() bool {
   124  	switch s {
   125  	case StateSynSent, StateSynRecv:
   126  		return true
   127  	default:
   128  		return false
   129  	}
   130  }
   131  
   132  // closed returns true when s is one of the states an endpoint transitions to
   133  // when closed or when it encounters an error. This is distinct from a newly
   134  // initialized endpoint that was never connected.
   135  func (s EndpointState) closed() bool {
   136  	switch s {
   137  	case StateClose, StateError:
   138  		return true
   139  	default:
   140  		return false
   141  	}
   142  }
   143  
   144  // String implements fmt.Stringer.String.
   145  func (s EndpointState) String() string {
   146  	switch s {
   147  	case StateInitial:
   148  		return "INITIAL"
   149  	case StateBound:
   150  		return "BOUND"
   151  	case StateConnecting:
   152  		return "CONNECTING"
   153  	case StateError:
   154  		return "ERROR"
   155  	case StateEstablished:
   156  		return "ESTABLISHED"
   157  	case StateSynSent:
   158  		return "SYN-SENT"
   159  	case StateSynRecv:
   160  		return "SYN-RCVD"
   161  	case StateFinWait1:
   162  		return "FIN-WAIT1"
   163  	case StateFinWait2:
   164  		return "FIN-WAIT2"
   165  	case StateTimeWait:
   166  		return "TIME-WAIT"
   167  	case StateClose:
   168  		return "CLOSED"
   169  	case StateCloseWait:
   170  		return "CLOSE-WAIT"
   171  	case StateLastAck:
   172  		return "LAST-ACK"
   173  	case StateListen:
   174  		return "LISTEN"
   175  	case StateClosing:
   176  		return "CLOSING"
   177  	default:
   178  		panic("unreachable")
   179  	}
   180  }
   181  
// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks holds the SACK blocks tracked for the endpoint. At most
	// MaxSACKBlocks blocks are tracked per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}
   194  
// ReceiveErrors collects segment receive errors within the transport layer.
//
// +stateify savable
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}
   225  
// SendErrors collects segment send errors within the transport layer.
//
// +stateify savable
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments that failed
	// to be sent to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs that failed to be
	// sent to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}
   250  
// Stats holds statistics about the endpoint.
//
// +stateify savable
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}
   279  
// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface. It has no behavior of its own.
func (*Stats) IsEndpointStats() {}
   283  
// sndQueueInfo implements a send queue.
//
// +stateify savable
type sndQueueInfo struct {
	// sndQueueMu is the mutex associated with the send queue state
	// embedded below; it is not saved.
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}
   295  
   296  // CloneState clones sq into other. It is not thread safe
   297  func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) {
   298  	other.SndBufSize = sq.SndBufSize
   299  	other.SndBufUsed = sq.SndBufUsed
   300  	other.SndClosed = sq.SndClosed
   301  	other.PacketTooBigCount = sq.PacketTooBigCount
   302  	other.SndMTU = sq.SndMTU
   303  	other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad())
   304  }
   305  
// Endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint; they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint. It must be held for all operations
// except in e.Readiness, where acquiring it will result in a deadlock in the
// epoll implementation.
//
// The following four mutexes can be acquired independently of e.mu, but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> Protects e.acceptQueue.
// e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue
// itself.
// e.sndQueueMu -> Protects the e.sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint.  The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a short while.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type Endpoint struct {
	stack.TCPEndpointStateInner
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue the endpoint for processing by a
	// given TCP processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	endpointEntry `state:"nosave"`

	// pendingProcessingMu protects pendingProcessing.
	pendingProcessingMu sync.Mutex `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	// +checklocks:pendingProcessingMu
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	protocol    *protocol     `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is StateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	// rcvQueueMu protects the receive buffer state embedded below.
	rcvQueueMu sync.Mutex `state:"nosave"`

	// +checklocks:rcvQueueMu
	stack.TCPRcvBufState

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	rcvMemUsed atomicbitops.Int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	//
	// ownedByUser is set to 1 by LockUser once mu has been acquired and
	// reset to 0 by UnlockUser before mu is released.
	mu          sync.CrossGoroutineMutex `state:"nosave"`
	ownedByUser atomicbitops.Uint32

	// rcvQueue is the queue for ready-for-delivery segments.
	//
	// +checklocks:mu
	rcvQueue segmentList `state:"wait"`

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state atomicbitops.Uint32 `state:".(EndpointState)"`

	// connectionDirectionState holds the current state of send and receive;
	// it is accessed atomically.
	connectionDirectionState atomicbitops.Uint32

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to its correct
	// state.
	origEndpointState uint32 `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ipv4TTL           uint8
	ipv6HopLimit      int16
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	// +checklocks:mu
	h *handshake

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (whichever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// recentTSTime is the monotonic time at which we last updated
	// TCPEndpointStateInner.RecentTS.
	recentTSTime tcpip.MonotonicTime

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss recovery algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// sndQueueInfo contains the implementation of the endpoint's send queue.
	sndQueueInfo sndQueueInfo

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibly aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// acceptMu protects acceptQueue.
	acceptMu sync.Mutex `state:"nosave"`

	// acceptQueue is used by a listening endpoint to send newly accepted
	// connections to the endpoint so that they can be read by Accept()
	// calls.
	//
	// +checklocks:acceptMu
	acceptQueue acceptQueue

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso stack.GSO

	stats Stats

	// tcpLingerTimeout is the maximum amount of time a socket stays in
	// TIME_WAIT state before being marked closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called Close on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime tcpip.MonotonicTime

	// finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer
	// is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close()
	// has been called on the socket. This timer is not started for sockets that
	// are waiting for a peer FIN but are not closed.
	finWait2Timer tcpip.Timer `state:"nosave"`

	// timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state
	// for tcp.DefaultTCPTimeWaitTimeout seconds.
	timeWaitTimer tcpip.Timer `state:"nosave"`

	// listenCtx is used by listening endpoints to store state used while listening for
	// connections. Nil otherwise.
	listenCtx *listenContext `state:"nosave"`

	// limRdr is reused to avoid allocations.
	//
	// +checklocks:mu
	limRdr *io.LimitedReader `state:"nosave"`
}
   604  
// UniqueID implements stack.TransportEndpoint.UniqueID.
//
// It returns the uniqueID field of the endpoint.
func (e *Endpoint) UniqueID() uint64 {
	return e.uniqueID
}
   609  
   610  // calculateAdvertisedMSS calculates the MSS to advertise.
   611  //
   612  // If userMSS is non-zero and is not greater than the maximum possible MSS for
   613  // r, it will be used; otherwise, the maximum possible MSS will be used.
   614  func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
   615  	// The maximum possible MSS is dependent on the route.
   616  	// TODO(b/143359391): Respect TCP Min and Max size.
   617  	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)
   618  
   619  	if userMSS != 0 && userMSS < maxMSS {
   620  		return userMSS
   621  	}
   622  
   623  	return maxMSS
   624  }
   625  
// isOwnedByUser returns true if the endpoint lock is currently
// held by a user (syscall) goroutine, i.e. if ownedByUser is 1.
func (e *Endpoint) isOwnedByUser() bool {
	return e.ownedByUser.Load() == 1
}
   631  
   632  // LockUser tries to lock e.mu and if it fails it will check if the lock is held
   633  // by another syscall goroutine. If yes, then it will goto sleep waiting for the
   634  // lock to be released, if not then it will spin till it acquires the lock or
   635  // another syscall goroutine acquires it in which case it will goto sleep as
   636  // described above.
   637  //
   638  // The assumption behind spinning here being that background packet processing
   639  // should not be holding the lock for long and spinning reduces latency as we
   640  // avoid an expensive sleep/wakeup of the syscall goroutine).
   641  // +checklocksacquire:e.mu
   642  func (e *Endpoint) LockUser() {
   643  	const iterations = 5
   644  	for i := 0; i < iterations; i++ {
   645  		// Try first if the sock is locked then check if it's owned
   646  		// by another user goroutine if not then we spin, otherwise
   647  		// we just go to sleep on the Lock() and wait.
   648  		if !e.TryLock() {
   649  			// If socket is owned by the user then just go to sleep
   650  			// as the lock could be held for a reasonably long time.
   651  			if e.ownedByUser.Load() == 1 {
   652  				e.mu.Lock()
   653  				e.ownedByUser.Store(1)
   654  				return
   655  			}
   656  			// Spin but don't yield the processor since the lower half
   657  			// should yield the lock soon.
   658  			continue
   659  		}
   660  		e.ownedByUser.Store(1)
   661  		return
   662  	}
   663  
   664  	for i := 0; i < iterations; i++ {
   665  		// Try first if the sock is locked then check if it's owned
   666  		// by another user goroutine if not then we spin, otherwise
   667  		// we just go to sleep on the Lock() and wait.
   668  		if !e.TryLock() {
   669  			// If socket is owned by the user then just go to sleep
   670  			// as the lock could be held for a reasonably long time.
   671  			if e.ownedByUser.Load() == 1 {
   672  				e.mu.Lock()
   673  				e.ownedByUser.Store(1)
   674  				return
   675  			}
   676  			// Spin but yield the processor since the lower half
   677  			// should yield the lock soon.
   678  			runtime.Gosched()
   679  			continue
   680  		}
   681  		e.ownedByUser.Store(1)
   682  		return
   683  	}
   684  
   685  	// Finally just give up and wait for the Lock.
   686  	e.mu.Lock()
   687  	e.ownedByUser.Store(1)
   688  }
   689  
   690  // UnlockUser will check if there are any segments already queued for processing
   691  // and wake up a processor goroutine to process them before unlocking e.mu.
   692  // This is required because we when packets arrive and endpoint lock is already
   693  // held then such packets are queued up to be processed.
   694  //
   695  // Precondition: e.LockUser() must have been called before calling e.UnlockUser()
   696  // +checklocksrelease:e.mu
   697  func (e *Endpoint) UnlockUser() {
   698  	// Lock segment queue before checking so that we avoid a race where
   699  	// segments can be queued between the time we check if queue is empty
   700  	// and actually unlock the endpoint mutex.
   701  	e.segmentQueue.mu.Lock()
   702  	if e.segmentQueue.emptyLocked() {
   703  		if e.ownedByUser.Swap(0) != 1 {
   704  			panic("e.UnlockUser() called without calling e.LockUser()")
   705  		}
   706  		e.mu.Unlock()
   707  		e.segmentQueue.mu.Unlock()
   708  		return
   709  	}
   710  	e.segmentQueue.mu.Unlock()
   711  
   712  	// Since we are waking the processor goroutine here just unlock
   713  	// and let it process the queued segments.
   714  	if e.ownedByUser.Swap(0) != 1 {
   715  		panic("e.UnlockUser() called without calling e.LockUser()")
   716  	}
   717  	processor := e.protocol.dispatcher.selectProcessor(e.ID)
   718  	e.mu.Unlock()
   719  
   720  	// Wake up the processor for this endpoint to process any queued
   721  	// segments after releasing the lock to avoid the case where if the
   722  	// processor goroutine starts running before we release the lock here
   723  	// then it will fail to process as TryLock() will fail.
   724  	processor.queueEndpoint(e)
   725  	return
   726  }
   727  
// StopWork halts packet processing by acquiring e.mu. Only to be used in
// tests.
// +checklocksacquire:e.mu
func (e *Endpoint) StopWork() {
	e.mu.Lock()
}
   733  
// ResumeWork resumes packet processing by releasing e.mu. Only to be used in
// tests.
// +checklocksrelease:e.mu
func (e *Endpoint) ResumeWork() {
	e.mu.Unlock()
}
   739  
   740  // AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is
   741  // used in places where we know that e.mu is held, but checklocks does not,
   742  // which can happen when creating new locked objects. You must pass the known
   743  // locked endpoint to this function and it must be the same as the caller
   744  // endpoint.
   745  // TODO(b/226403629): Remove this function once checklocks understands local
   746  // variable locks.
   747  // +checklocks:locked.mu
   748  // +checklocksacquire:e.mu
   749  func (e *Endpoint) AssertLockHeld(locked *Endpoint) {
   750  	if e != locked {
   751  		panic("AssertLockHeld failed: locked endpoint != asserting endpoint")
   752  	}
   753  }
   754  
// TryLock is a helper that calls TryLock on the endpoint's mutex and
// adds the necessary checklocks annotations. It returns true if e.mu was
// acquired and false otherwise.
//
// The two return statements are kept separate because each carries its own
// checklocks annotation.
// TODO(b/226403629): Remove this once checklocks understands TryLock.
// +checklocksacquire:e.mu
func (e *Endpoint) TryLock() bool {
	if e.mu.TryLock() {
		return true // +checklocksforce
	}
	return false // +checklocksignore
}
   765  
   766  // setEndpointState updates the state of the endpoint to state atomically. This
   767  // method is unexported as the only place we should update the state is in this
   768  // package but we allow the state to be read freely without holding e.mu.
   769  //
   770  // +checklocks:e.mu
   771  func (e *Endpoint) setEndpointState(state EndpointState) {
   772  	oldstate := EndpointState(e.state.Swap(uint32(state)))
   773  	switch state {
   774  	case StateEstablished:
   775  		e.stack.Stats().TCP.CurrentEstablished.Increment()
   776  		e.stack.Stats().TCP.CurrentConnected.Increment()
   777  	case StateError:
   778  		fallthrough
   779  	case StateClose:
   780  		if oldstate == StateCloseWait || oldstate == StateEstablished {
   781  			e.stack.Stats().TCP.EstablishedResets.Increment()
   782  		}
   783  		if oldstate.connected() {
   784  			e.stack.Stats().TCP.CurrentConnected.Decrement()
   785  		}
   786  		fallthrough
   787  	default:
   788  		if oldstate == StateEstablished {
   789  			e.stack.Stats().TCP.CurrentEstablished.Decrement()
   790  		}
   791  	}
   792  }
   793  
   794  // EndpointState returns the current state of the endpoint.
   795  func (e *Endpoint) EndpointState() EndpointState {
   796  	return EndpointState(e.state.Load())
   797  }
   798  
   799  // setRecentTimestamp sets the recentTS field to the provided value.
   800  func (e *Endpoint) setRecentTimestamp(recentTS uint32) {
   801  	e.RecentTS = recentTS
   802  	e.recentTSTime = e.stack.Clock().NowMonotonic()
   803  }
   804  
   805  // recentTimestamp returns the value of the recentTS field.
   806  func (e *Endpoint) recentTimestamp() uint32 {
   807  	return e.RecentTS
   808  }
   809  
   810  // TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a
   811  // network.Endpoint, which also defines this function.
   812  func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 {
   813  	switch netProto := route.NetProto(); netProto {
   814  	case header.IPv4ProtocolNumber:
   815  		if ipv4TTL == tcpip.UseDefaultIPv4TTL {
   816  			return route.DefaultTTL()
   817  		}
   818  		return ipv4TTL
   819  	case header.IPv6ProtocolNumber:
   820  		if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit {
   821  			return route.DefaultTTL()
   822  		}
   823  		return uint8(ipv6HopLimit)
   824  	default:
   825  		panic(fmt.Sprintf("invalid protocol number = %d", netProto))
   826  	}
   827  }
   828  
// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// The embedded mutex, the timer and the waker carry the `state:"nosave"` tag.
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	// idle, interval and count are initialized by newEndpoint to
	// DefaultKeepaliveIdle, DefaultKeepaliveInterval and DefaultKeepaliveCount
	// respectively.
	idle       time.Duration
	interval   time.Duration
	count      int
	// NOTE(review): presumably the number of keepalive probes sent without a
	// response; confirm against the keepalive timer handler.
	unacked    int
	// should never be a zero timer if the endpoint is not closed.
	timer timer       `state:"nosave"`
	waker sleep.Waker `state:"nosave"`
}
   843  
// newEndpoint allocates a TCP endpoint in StateInitial for the given stack,
// protocol and network protocol, and returns it.
//
// Initialization happens in three steps whose order matters:
//  1. package-level defaults are applied (keepalive parameters, TTL/hop-limit
//     sentinels, buffer sizes, window clamp, SYN retries);
//  2. each stack-wide TCP option that can be fetched without error overrides
//     the corresponding default;
//  3. the segment queue is linked back to the endpoint and the keepalive timer
//     is initialized.
func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *Endpoint {
	e := &Endpoint{
		stack:    s,
		protocol: protocol,
		TransportEndpointInfo: stack.TransportEndpointInfo{
			NetProto:   netProto,
			TransProto: header.TCPProtocolNumber,
		},
		sndQueueInfo: sndQueueInfo{
			TCPSndBufState: stack.TCPSndBufState{
				SndMTU: math.MaxInt32,
			},
		},
		waiterQueue: waiterQueue,
		state:       atomicbitops.FromUint32(uint32(StateInitial)),
		keepalive: keepalive{
			idle:     DefaultKeepaliveIdle,
			interval: DefaultKeepaliveInterval,
			count:    DefaultKeepaliveCount,
		},
		uniqueID:     s.UniqueID(),
		ipv4TTL:      tcpip.UseDefaultIPv4TTL,
		ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit,
		// txHash only determines which outgoing queue to use, so
		// InsecureRNG is fine.
		txHash:        s.InsecureRNG().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
		limRdr:        &io.LimitedReader{},
	}
	// Socket-option defaults. The buffer sizes set here may be replaced below
	// by the stack-configured defaults.
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)

	// Stack-wide overrides. An option that cannot be fetched is skipped and
	// the default set above stays in effect.
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		// The option enables moderation; the field stores the inverse.
		e.RcvAutoParams.Disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	// Attach the stack's TCP probe, if one is installed.
	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e

	// TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes
	// established.
	e.keepalive.timer.init(e.stack.Clock(), timerHandler(e, e.keepaliveTimerExpired))

	return e
}
   927  
   928  // Readiness returns the current readiness of the endpoint. For example, if
   929  // waiter.EventIn is set, the endpoint is immediately readable.
   930  func (e *Endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
   931  	result := waiter.EventMask(0)
   932  
   933  	switch e.EndpointState() {
   934  	case StateInitial, StateBound:
   935  		// This prevents blocking of new sockets which are not
   936  		// connected when SO_LINGER is set.
   937  		result |= waiter.EventHUp
   938  
   939  	case StateConnecting, StateSynSent, StateSynRecv:
   940  		// Ready for nothing.
   941  
   942  	case StateClose, StateError, StateTimeWait:
   943  		// Ready for anything.
   944  		result = mask
   945  
   946  	case StateListen:
   947  		// Check if there's anything in the accepted queue.
   948  		if (mask & waiter.ReadableEvents) != 0 {
   949  			e.acceptMu.Lock()
   950  			if e.acceptQueue.endpoints.Len() != 0 {
   951  				result |= waiter.ReadableEvents
   952  			}
   953  			e.acceptMu.Unlock()
   954  		}
   955  	}
   956  	if e.EndpointState().connected() {
   957  		// Determine if the endpoint is writable if requested.
   958  		if (mask & waiter.WritableEvents) != 0 {
   959  			e.sndQueueInfo.sndQueueMu.Lock()
   960  			sndBufSize := e.getSendBufferSize()
   961  			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
   962  				result |= waiter.WritableEvents
   963  			}
   964  			if e.sndQueueInfo.SndClosed {
   965  				e.updateConnDirectionState(connDirectionStateSndClosed)
   966  			}
   967  			e.sndQueueInfo.sndQueueMu.Unlock()
   968  		}
   969  
   970  		// Determine if the endpoint is readable if requested.
   971  		if (mask & waiter.ReadableEvents) != 0 {
   972  			e.rcvQueueMu.Lock()
   973  			if e.RcvBufUsed > 0 || e.RcvClosed {
   974  				result |= waiter.ReadableEvents
   975  			}
   976  			if e.RcvClosed {
   977  				e.updateConnDirectionState(connDirectionStateRcvClosed)
   978  			}
   979  			e.rcvQueueMu.Unlock()
   980  		}
   981  	}
   982  
   983  	// Determine whether endpoint is half-closed with rcv shutdown
   984  	if e.connDirectionState() == connDirectionStateRcvClosed {
   985  		result |= waiter.EventRdHUp
   986  	}
   987  
   988  	return result
   989  }
   990  
   991  // Purging pending rcv segments is only necessary on RST.
   992  func (e *Endpoint) purgePendingRcvQueue() {
   993  	if e.rcv != nil {
   994  		for e.rcv.pendingRcvdSegments.Len() > 0 {
   995  			s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment)
   996  			s.DecRef()
   997  		}
   998  	}
   999  }
  1000  
  1001  // +checklocks:e.mu
  1002  func (e *Endpoint) purgeReadQueue() {
  1003  	if e.rcv != nil {
  1004  		e.rcvQueueMu.Lock()
  1005  		defer e.rcvQueueMu.Unlock()
  1006  		for {
  1007  			s := e.rcvQueue.Front()
  1008  			if s == nil {
  1009  				break
  1010  			}
  1011  			e.rcvQueue.Remove(s)
  1012  			s.DecRef()
  1013  		}
  1014  		e.RcvBufUsed = 0
  1015  	}
  1016  }
  1017  
  1018  // +checklocks:e.mu
  1019  func (e *Endpoint) purgeWriteQueue() {
  1020  	if e.snd != nil {
  1021  		e.sndQueueInfo.sndQueueMu.Lock()
  1022  		defer e.sndQueueInfo.sndQueueMu.Unlock()
  1023  		e.snd.updateWriteNext(nil)
  1024  		for {
  1025  			s := e.snd.writeList.Front()
  1026  			if s == nil {
  1027  				break
  1028  			}
  1029  			e.snd.writeList.Remove(s)
  1030  			s.DecRef()
  1031  		}
  1032  		e.sndQueueInfo.SndBufUsed = 0
  1033  		e.sndQueueInfo.SndClosed = true
  1034  	}
  1035  }
  1036  
  1037  // Abort implements stack.TransportEndpoint.Abort.
  1038  func (e *Endpoint) Abort() {
  1039  	defer e.drainClosingSegmentQueue()
  1040  	e.LockUser()
  1041  	defer e.UnlockUser()
  1042  	defer e.purgeReadQueue()
  1043  	// Reset all connected endpoints.
  1044  	switch state := e.EndpointState(); {
  1045  	case state.connected():
  1046  		e.resetConnectionLocked(&tcpip.ErrAborted{})
  1047  		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  1048  		return
  1049  	}
  1050  	e.closeLocked()
  1051  }
  1052  
  1053  // Close puts the endpoint in a closed state and frees all resources associated
  1054  // with it. It must be called only once and with no other concurrent calls to
  1055  // the endpoint.
  1056  func (e *Endpoint) Close() {
  1057  	e.LockUser()
  1058  	if e.closed {
  1059  		e.UnlockUser()
  1060  		return
  1061  	}
  1062  
  1063  	// We always want to purge the read queue, but do so after the checks in
  1064  	// shutdownLocked.
  1065  	e.closeLocked()
  1066  	e.purgeReadQueue()
  1067  	if e.EndpointState() == StateClose || e.EndpointState() == StateError {
  1068  		// It should be safe to purge the read queue now as the endpoint
  1069  		// is now closed or in an error state and further reads are not
  1070  		// permitted.
  1071  		e.UnlockUser()
  1072  		e.drainClosingSegmentQueue()
  1073  		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  1074  		return
  1075  	}
  1076  	e.UnlockUser()
  1077  }
  1078  
  1079  // +checklocks:e.mu
  1080  func (e *Endpoint) closeLocked() {
  1081  	linger := e.SocketOptions().GetLinger()
  1082  	if linger.Enabled && linger.Timeout == 0 {
  1083  		s := e.EndpointState()
  1084  		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
  1085  		if isResetState {
  1086  			// Close the endpoint without doing full shutdown and
  1087  			// send a RST.
  1088  			e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
  1089  			return
  1090  		}
  1091  	}
  1092  
  1093  	// Issue a shutdown so that the peer knows we won't send any more data
  1094  	// if we're connected, or stop accepting if we're listening.
  1095  	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
  1096  	e.closeNoShutdownLocked()
  1097  }
  1098  
  1099  // closeNoShutdown closes the endpoint without doing a full shutdown.
  1100  // +checklocks:e.mu
  1101  func (e *Endpoint) closeNoShutdownLocked() {
  1102  	// For listening sockets, we always release ports inline so that they
  1103  	// are immediately available for reuse after Close() is called. If also
  1104  	// registered, we unregister as well otherwise the next user would fail
  1105  	// in Listen() when trying to register.
  1106  	if e.EndpointState() == StateListen && e.isPortReserved {
  1107  		if e.isRegistered {
  1108  			e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
  1109  			e.isRegistered = false
  1110  		}
  1111  
  1112  		portRes := ports.Reservation{
  1113  			Networks:     e.effectiveNetProtos,
  1114  			Transport:    ProtocolNumber,
  1115  			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  1116  			Port:         e.TransportEndpointInfo.ID.LocalPort,
  1117  			Flags:        e.boundPortFlags,
  1118  			BindToDevice: e.boundBindToDevice,
  1119  			Dest:         e.boundDest,
  1120  		}
  1121  		e.stack.ReleasePort(portRes)
  1122  		e.isPortReserved = false
  1123  		e.boundBindToDevice = 0
  1124  		e.boundPortFlags = ports.Flags{}
  1125  		e.boundDest = tcpip.FullAddress{}
  1126  	}
  1127  
  1128  	// Mark endpoint as closed.
  1129  	e.closed = true
  1130  	tcpip.AddDanglingEndpoint(e)
  1131  
  1132  	eventMask := waiter.ReadableEvents | waiter.WritableEvents
  1133  
  1134  	switch e.EndpointState() {
  1135  	case StateInitial, StateBound, StateListen:
  1136  		e.setEndpointState(StateClose)
  1137  		fallthrough
  1138  	case StateClose, StateError:
  1139  		eventMask |= waiter.EventHUp
  1140  		e.cleanupLocked()
  1141  	case StateConnecting, StateSynSent, StateSynRecv:
  1142  		// Abort the handshake and set the error.
  1143  		// Notify that the endpoint is closed.
  1144  		eventMask |= waiter.EventHUp
  1145  		e.handshakeFailed(&tcpip.ErrAborted{})
  1146  		// Notify that the endpoint is closed.
  1147  		eventMask |= waiter.EventHUp
  1148  	case StateFinWait2:
  1149  		// The socket has been closed and we are in FIN-WAIT-2 so start
  1150  		// the FIN-WAIT-2 timer.
  1151  		if e.finWait2Timer == nil {
  1152  			e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired)
  1153  		}
  1154  	}
  1155  
  1156  	e.waiterQueue.Notify(eventMask)
  1157  }
  1158  
  1159  // closePendingAcceptableConnections closes all connections that have completed
  1160  // handshake but not yet been delivered to the application.
  1161  func (e *Endpoint) closePendingAcceptableConnectionsLocked() {
  1162  	e.acceptMu.Lock()
  1163  
  1164  	pendingEndpoints := e.acceptQueue.pendingEndpoints
  1165  	e.acceptQueue.pendingEndpoints = nil
  1166  
  1167  	completedEndpoints := make([]*Endpoint, 0, e.acceptQueue.endpoints.Len())
  1168  	for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() {
  1169  		completedEndpoints = append(completedEndpoints, n.Value.(*Endpoint))
  1170  	}
  1171  	e.acceptQueue.endpoints.Init()
  1172  	e.acceptQueue.capacity = 0
  1173  	e.acceptMu.Unlock()
  1174  
  1175  	// Close any endpoints in SYN-RCVD state.
  1176  	for n := range pendingEndpoints {
  1177  		n.Abort()
  1178  	}
  1179  
  1180  	// Reset all connections that are waiting to be accepted.
  1181  	for _, n := range completedEndpoints {
  1182  		n.Abort()
  1183  	}
  1184  }
  1185  
// cleanupLocked frees all resources associated with the endpoint.
//
// In order, it: cleans up the sender's timers; stops the FIN-WAIT-2 and
// TIME-WAIT timers if they exist; aborts connections still held by the accept
// queue; cleans up the keepalive timer; unregisters the endpoint and releases
// its reserved port if applicable; clears the bound-port bookkeeping; releases
// the route; purges the write queue (and the read queue if the endpoint has
// been closed by the user); and finally completes the transport endpoint
// cleanup and removes the endpoint from the dangling set.
// +checklocks:e.mu
func (e *Endpoint) cleanupLocked() {
	// The sender only exists once a connection has been set up.
	if e.snd != nil {
		e.snd.resendTimer.cleanup()
		e.snd.probeTimer.cleanup()
		e.snd.reorderTimer.cleanup()
		e.snd.corkTimer.cleanup()
	}

	if e.finWait2Timer != nil {
		e.finWait2Timer.Stop()
	}

	if e.timeWaitTimer != nil {
		e.timeWaitTimer.Stop()
	}

	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	// The bound-port bookkeeping is reset whether or not a port was reserved.
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.purgeWriteQueue()
	// Only purge the read queue here if the socket is fully closed by the
	// user.
	if e.closed {
		e.purgeReadQueue()
	}
	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}
  1245  
  1246  // wndFromSpace returns the window that we can advertise based on the available
  1247  // receive buffer space.
  1248  func wndFromSpace(space int) int {
  1249  	return space >> rcvAdvWndScale
  1250  }
  1251  
  1252  // initialReceiveWindow returns the initial receive window to advertise in the
  1253  // SYN/SYN-ACK.
  1254  func (e *Endpoint) initialReceiveWindow() int {
  1255  	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
  1256  	if rcvWnd > math.MaxUint16 {
  1257  		rcvWnd = math.MaxUint16
  1258  	}
  1259  
  1260  	// Use the user supplied MSS, if available.
  1261  	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
  1262  	if rcvWnd > routeWnd {
  1263  		rcvWnd = routeWnd
  1264  	}
  1265  	rcvWndScale := e.rcvWndScaleForHandshake()
  1266  
  1267  	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
  1268  	// window offered in SYN won't be reduced due to the loss of precision if
  1269  	// window scaling is enabled after the handshake.
  1270  	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
  1271  
  1272  	// Ensure we can always accept at least 1 byte if the scale specified
  1273  	// was too high for the provided rcvWnd.
  1274  	if rcvWnd == 0 {
  1275  		rcvWnd = 1
  1276  	}
  1277  
  1278  	return rcvWnd
  1279  }
  1280  
// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
//
// copied is the number of bytes just delivered to the application. The method
// returns early when auto-tuning is disabled, and merely accumulates copied
// while no RTT has been recorded or less than one RTT has elapsed since the
// last measurement. Otherwise it may grow (never shrink) the receive buffer
// and, if the window thereby crosses the ACK threshold, sends a window update
// on a connected endpoint.
func (e *Endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	sendNonZeroWindowUpdate := false

	e.rcvQueueMu.Lock()
	if e.RcvAutoParams.Disabled {
		e.rcvQueueMu.Unlock()
		return
	}
	now := e.stack.Clock().NowMonotonic()
	if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt {
		// Still inside the current measurement interval: just accumulate.
		e.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		// NOTE(review): this divides by prevCopied; it relies on
		// PrevCopiedBytes being non-zero by the time RTT is set. Confirm
		// against the code that initializes RcvAutoParams.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				sendNonZeroWindowUpdate = true
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	// Start a new measurement interval.
	e.RcvAutoParams.MeasureTime = now
	e.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueMu.Unlock()

	// Send the update after unlocking rcvQueueMu as sending a segment acquires
	// the lock to calculate the window to be sent.
	if e.EndpointState().connected() && sendNonZeroWindowUpdate {
		e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
	}
}
  1359  
// SetOwner implements tcpip.Endpoint.SetOwner.
//
// It stores owner in the endpoint's owner field; no lock is taken here.
func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}
  1364  
  1365  // +checklocks:e.mu
  1366  func (e *Endpoint) hardErrorLocked() tcpip.Error {
  1367  	err := e.hardError
  1368  	e.hardError = nil
  1369  	return err
  1370  }
  1371  
  1372  // +checklocks:e.mu
  1373  func (e *Endpoint) lastErrorLocked() tcpip.Error {
  1374  	e.lastErrorMu.Lock()
  1375  	defer e.lastErrorMu.Unlock()
  1376  	err := e.lastError
  1377  	e.lastError = nil
  1378  	return err
  1379  }
  1380  
  1381  // LastError implements tcpip.Endpoint.LastError.
  1382  func (e *Endpoint) LastError() tcpip.Error {
  1383  	e.LockUser()
  1384  	defer e.UnlockUser()
  1385  	if err := e.hardErrorLocked(); err != nil {
  1386  		return err
  1387  	}
  1388  	return e.lastErrorLocked()
  1389  }
  1390  
// LastErrorLocked reads and clears lastError.
// Only to be used in tests.
//
// It is the exported counterpart of lastErrorLocked and does not consult the
// hard error.
// +checklocks:e.mu
func (e *Endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}
  1397  
  1398  // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
  1399  func (e *Endpoint) UpdateLastError(err tcpip.Error) {
  1400  	e.LockUser()
  1401  	e.lastErrorMu.Lock()
  1402  	e.lastError = err
  1403  	e.lastErrorMu.Unlock()
  1404  	e.UnlockUser()
  1405  }
  1406  
// Read implements tcpip.Endpoint.Read.
//
// It copies queued receive data into dst, walking the receive queue from the
// front until the queue is exhausted or writing to dst fails. With opts.Peek
// set, segments are left in the queue; otherwise fully consumed segments are
// removed, RcvBufUsed is reduced by the bytes read, and a window update is
// sent on a connected endpoint when the freed memory crosses the ACK
// threshold.
//
// If the endpoint is not readable the error from checkReadLocked is returned.
// If writing to dst fails before any byte was read, ErrBadBuffer is returned;
// if some bytes were read, they are reported and the write error is dropped.
// On success both Count and Total of the result hold the number of bytes read.
func (e *Endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if err := e.checkReadLocked(); err != nil {
		if _, ok := err.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, err
	}

	var err error
	done := 0
	// N.B. Here we get the first segment to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold e.mu to ensure we only
	// remove segments from the list through Read() and that new segments
	// cannot be appended.
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// Peeking leaves the queue untouched; just move on to the next
			// segment.
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Drop every segment at the head of the queue that has no
			// payload left.
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			// The update is sent only after rcvQueueMu has been released.
			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}
  1482  
  1483  // checkRead checks that endpoint is in a readable state.
  1484  //
  1485  // +checklocks:e.mu
  1486  func (e *Endpoint) checkReadLocked() tcpip.Error {
  1487  	e.rcvQueueMu.Lock()
  1488  	defer e.rcvQueueMu.Unlock()
  1489  	// When in SYN-SENT state, let the caller block on the receive.
  1490  	// An application can initiate a non-blocking connect and then block
  1491  	// on a receive. It can expect to read any data after the handshake
  1492  	// is complete. RFC793, section 3.9, p58.
  1493  	if e.EndpointState() == StateSynSent {
  1494  		return &tcpip.ErrWouldBlock{}
  1495  	}
  1496  
  1497  	// The endpoint can be read if it's connected, or if it's already closed
  1498  	// but has some pending unread data. Also note that a RST being received
  1499  	// would cause the state to become StateError so we should allow the
  1500  	// reads to proceed before returning a ECONNRESET.
  1501  	bufUsed := e.RcvBufUsed
  1502  	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
  1503  		if s == StateError {
  1504  			if err := e.hardErrorLocked(); err != nil {
  1505  				return err
  1506  			}
  1507  			return &tcpip.ErrClosedForReceive{}
  1508  		}
  1509  		e.stats.ReadErrors.NotConnected.Increment()
  1510  		return &tcpip.ErrNotConnected{}
  1511  	}
  1512  
  1513  	if e.RcvBufUsed == 0 {
  1514  		if e.RcvClosed || !e.EndpointState().connected() {
  1515  			return &tcpip.ErrClosedForReceive{}
  1516  		}
  1517  		return &tcpip.ErrWouldBlock{}
  1518  	}
  1519  
  1520  	return nil
  1521  }
  1522  
  1523  // isEndpointWritableLocked checks if a given endpoint is writable
  1524  // and also returns the number of bytes that can be written at this
  1525  // moment. If the endpoint is not writable then it returns an error
  1526  // indicating the reason why it's not writable.
  1527  // +checklocks:e.mu
  1528  // +checklocks:e.sndQueueInfo.sndQueueMu
  1529  func (e *Endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
  1530  	// The endpoint cannot be written to if it's not connected.
  1531  	switch s := e.EndpointState(); {
  1532  	case s == StateError:
  1533  		if err := e.hardErrorLocked(); err != nil {
  1534  			return 0, err
  1535  		}
  1536  		return 0, &tcpip.ErrClosedForSend{}
  1537  	case !s.connecting() && !s.connected():
  1538  		return 0, &tcpip.ErrClosedForSend{}
  1539  	case s.connecting():
  1540  		// As per RFC793, page 56, a send request arriving when in connecting
  1541  		// state, can be queued to be completed after the state becomes
  1542  		// connected. Return an error code for the caller of endpoint Write to
  1543  		// try again, until the connection handshake is complete.
  1544  		return 0, &tcpip.ErrWouldBlock{}
  1545  	}
  1546  
  1547  	// Check if the connection has already been closed for sends.
  1548  	if e.sndQueueInfo.SndClosed {
  1549  		return 0, &tcpip.ErrClosedForSend{}
  1550  	}
  1551  
  1552  	sndBufSize := e.getSendBufferSize()
  1553  	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
  1554  	if avail <= 0 {
  1555  		return 0, &tcpip.ErrWouldBlock{}
  1556  	}
  1557  	return avail, nil
  1558  }
  1559  
// readFromPayloader reads a slice from the Payloader.
//
// At most avail bytes (or p.Len() bytes if that is smaller) are copied from p
// into a new buffer, which is returned. An empty buffer is returned when there
// is nothing to copy. If the copy fails the partial buffer is released and
// ErrBadBuffer is returned.
//
// Unless opts.Atomic is set, both sndQueueMu and the endpoint lock are dropped
// while copying and re-acquired before returning, so endpoint state may have
// changed by the time this method returns.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	limRdr := e.limRdr
	if !opts.Atomic {
		// Take the endpoint's limited reader out of the endpoint for the
		// duration of the unlocked copy and put it back afterwards.
		defer func() {
			e.limRdr = limRdr
		}()
		e.limRdr = nil

		// Deferred calls run in reverse order: the endpoint lock is
		// re-acquired first, then sndQueueMu, then limRdr is restored.
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReaderAndLimitedReader(p, int64(avail), limRdr); err != nil {
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}
  1597  
  1598  // queueSegment reads data from the payloader and returns a segment to be sent.
  1599  // +checklocks:e.mu
  1600  func (e *Endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
  1601  	e.sndQueueInfo.sndQueueMu.Lock()
  1602  	defer e.sndQueueInfo.sndQueueMu.Unlock()
  1603  
  1604  	avail, err := e.isEndpointWritableLocked()
  1605  	if err != nil {
  1606  		e.stats.WriteErrors.WriteClosed.Increment()
  1607  		return nil, 0, err
  1608  	}
  1609  
  1610  	buf, err := e.readFromPayloader(p, opts, avail)
  1611  	if err != nil {
  1612  		return nil, 0, err
  1613  	}
  1614  
  1615  	// Do not queue zero length segments.
  1616  	if buf.Size() == 0 {
  1617  		return nil, 0, nil
  1618  	}
  1619  
  1620  	if !opts.Atomic {
  1621  		// Since we released locks in between it's possible that the
  1622  		// endpoint transitioned to a CLOSED/ERROR states so make
  1623  		// sure endpoint is still writable before trying to write.
  1624  		avail, err := e.isEndpointWritableLocked()
  1625  		if err != nil {
  1626  			e.stats.WriteErrors.WriteClosed.Increment()
  1627  			buf.Release()
  1628  			return nil, 0, err
  1629  		}
  1630  
  1631  		// A simultaneous call to write on the socket can reduce avail. Discard
  1632  		// excess data copied if this is the case.
  1633  		if int64(avail) < buf.Size() {
  1634  			buf.Truncate(int64(avail))
  1635  		}
  1636  	}
  1637  
  1638  	// Add data to the send queue.
  1639  	size := int(buf.Size())
  1640  	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf)
  1641  	e.sndQueueInfo.SndBufUsed += size
  1642  	e.snd.writeList.PushBack(s)
  1643  
  1644  	return s, size, nil
  1645  }
  1646  
  1647  // Write writes data to the endpoint's peer.
  1648  func (e *Endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
  1649  	// Linux completely ignores any address passed to sendto(2) for TCP sockets
  1650  	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
  1651  	// and opts.EndOfRecord are also ignored.
  1652  
  1653  	e.LockUser()
  1654  	defer e.UnlockUser()
  1655  
  1656  	// Return if either we didn't queue anything or if an error occurred while
  1657  	// attempting to queue data.
  1658  	nextSeg, n, err := e.queueSegment(p, opts)
  1659  	if n == 0 || err != nil {
  1660  		return 0, err
  1661  	}
  1662  
  1663  	e.sendData(nextSeg)
  1664  	return int64(n), nil
  1665  }
  1666  
  1667  // selectWindowLocked returns the new window without checking for shrinking or scaling
  1668  // applied.
  1669  // +checklocks:e.mu
  1670  // +checklocks:e.rcvQueueMu
  1671  func (e *Endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
  1672  	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
  1673  	maxWindow := wndFromSpace(rcvBufSize)
  1674  	wndFromUsedBytes := maxWindow - e.RcvBufUsed
  1675  
  1676  	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
  1677  	// cases where we receive a lot of small segments the segment overhead is a
  1678  	// lot higher and we can run out socket buffer space before we can fill the
  1679  	// previous window we advertised. In cases where we receive MSS sized or close
  1680  	// MSS sized segments we will probably run out of window space before we
  1681  	// exhaust receive buffer.
  1682  	newWnd := wndFromAvailable
  1683  	if newWnd > wndFromUsedBytes {
  1684  		newWnd = wndFromUsedBytes
  1685  	}
  1686  	if newWnd < 0 {
  1687  		newWnd = 0
  1688  	}
  1689  	return seqnum.Size(newWnd)
  1690  }
  1691  
  1692  // selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
  1693  // +checklocks:e.mu
  1694  func (e *Endpoint) selectWindow() (wnd seqnum.Size) {
  1695  	e.rcvQueueMu.Lock()
  1696  	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
  1697  	e.rcvQueueMu.Unlock()
  1698  	return wnd
  1699  }
  1700  
  1701  // windowCrossedACKThresholdLocked checks if the receive window to be announced
  1702  // would be under aMSS or under the window derived from half receive buffer,
  1703  // whichever smaller. This is useful as a receive side silly window syndrome
  1704  // prevention mechanism. If window grows to reasonable value, we should send ACK
  1705  // to the sender to inform the rx space is now large. We also want ensure a
  1706  // series of small read()'s won't trigger a flood of spurious tiny ACK's.
  1707  //
  1708  // For large receive buffers, the threshold is aMSS - once reader reads more
  1709  // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
  1710  // receive buffer size. This is chosen arbitrarily.
  1711  // crossed will be true if the window size crossed the ACK threshold.
  1712  // above will be true if the new window is >= ACK threshold and false
  1713  // otherwise.
  1714  //
  1715  // +checklocks:e.mu
  1716  // +checklocks:e.rcvQueueMu
  1717  func (e *Endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
  1718  	newAvail := int(e.selectWindowLocked(rcvBufSize))
  1719  	oldAvail := newAvail - deltaBefore
  1720  	if oldAvail < 0 {
  1721  		oldAvail = 0
  1722  	}
  1723  	threshold := int(e.amss)
  1724  	// rcvBufFraction is the inverse of the fraction of receive buffer size that
  1725  	// is used to decide if the available buffer space is now above it.
  1726  	const rcvBufFraction = 2
  1727  	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
  1728  		threshold = wndThreshold
  1729  	}
  1730  
  1731  	switch {
  1732  	case oldAvail < threshold && newAvail >= threshold:
  1733  		return true, true
  1734  	case oldAvail >= threshold && newAvail < threshold:
  1735  		return true, false
  1736  	}
  1737  	return false, false
  1738  }
  1739  
  1740  // OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
  1741  func (e *Endpoint) OnReuseAddressSet(v bool) {
  1742  	e.LockUser()
  1743  	e.portFlags.TupleOnly = v
  1744  	e.UnlockUser()
  1745  }
  1746  
  1747  // OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
  1748  func (e *Endpoint) OnReusePortSet(v bool) {
  1749  	e.LockUser()
  1750  	e.portFlags.LoadBalanced = v
  1751  	e.UnlockUser()
  1752  }
  1753  
  1754  // OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
  1755  func (e *Endpoint) OnKeepAliveSet(bool) {
  1756  	e.LockUser()
  1757  	e.resetKeepaliveTimer(true /* receivedData */)
  1758  	e.UnlockUser()
  1759  }
  1760  
  1761  // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
  1762  func (e *Endpoint) OnDelayOptionSet(v bool) {
  1763  	if !v {
  1764  		e.LockUser()
  1765  		defer e.UnlockUser()
  1766  		// Handle delayed data.
  1767  		if e.EndpointState().connected() {
  1768  			e.sendData(nil /* next */)
  1769  		}
  1770  	}
  1771  }
  1772  
  1773  // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
  1774  func (e *Endpoint) OnCorkOptionSet(v bool) {
  1775  	if !v {
  1776  		e.LockUser()
  1777  		defer e.UnlockUser()
  1778  		if e.snd != nil {
  1779  			e.snd.corkTimer.disable()
  1780  		}
  1781  		// Handle the corked data.
  1782  		if e.EndpointState().connected() {
  1783  			e.sendData(nil /* next */)
  1784  		}
  1785  	}
  1786  }
  1787  
  1788  func (e *Endpoint) getSendBufferSize() int {
  1789  	return int(e.ops.GetSendBufferSize())
  1790  }
  1791  
  1792  // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
  1793  func (e *Endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) {
  1794  	e.LockUser()
  1795  
  1796  	sendNonZeroWindowUpdate := false
  1797  	e.rcvQueueMu.Lock()
  1798  
  1799  	// Make sure the receive buffer size allows us to send a
  1800  	// non-zero window size.
  1801  	scale := uint8(0)
  1802  	if e.rcv != nil {
  1803  		scale = e.rcv.RcvWndScale
  1804  	}
  1805  	if rcvBufSz>>scale == 0 {
  1806  		rcvBufSz = 1 << scale
  1807  	}
  1808  
  1809  	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
  1810  	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
  1811  	e.RcvAutoParams.Disabled = true
  1812  
  1813  	// Immediately send an ACK to uncork the sender silly window
  1814  	// syndrome prevetion, when our available space grows above aMSS
  1815  	// or half receive buffer, whichever smaller.
  1816  	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
  1817  		sendNonZeroWindowUpdate = true
  1818  	}
  1819  
  1820  	e.rcvQueueMu.Unlock()
  1821  
  1822  	postSet = func() {
  1823  		e.LockUser()
  1824  		defer e.UnlockUser()
  1825  		if e.EndpointState().connected() && sendNonZeroWindowUpdate {
  1826  			e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
  1827  		}
  1828  
  1829  	}
  1830  	e.UnlockUser()
  1831  	return rcvBufSz, postSet
  1832  }
  1833  
  1834  // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize.
  1835  func (e *Endpoint) OnSetSendBufferSize(sz int64) int64 {
  1836  	e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1)
  1837  	return sz
  1838  }
  1839  
  1840  // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters.
  1841  func (e *Endpoint) WakeupWriters() {
  1842  	e.LockUser()
  1843  	defer e.UnlockUser()
  1844  
  1845  	sendBufferSize := e.getSendBufferSize()
  1846  	e.sndQueueInfo.sndQueueMu.Lock()
  1847  	notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1
  1848  	e.sndQueueInfo.sndQueueMu.Unlock()
  1849  
  1850  	if notify {
  1851  		e.waiterQueue.Notify(waiter.WritableEvents)
  1852  	}
  1853  }
  1854  
  1855  // SetSockOptInt sets a socket option.
  1856  func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
  1857  	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
  1858  	const inetECNMask = 3
  1859  
  1860  	switch opt {
  1861  	case tcpip.KeepaliveCountOption:
  1862  		e.LockUser()
  1863  		e.keepalive.Lock()
  1864  		e.keepalive.count = v
  1865  		e.keepalive.Unlock()
  1866  		e.resetKeepaliveTimer(true /* receivedData */)
  1867  		e.UnlockUser()
  1868  
  1869  	case tcpip.IPv4TOSOption:
  1870  		e.LockUser()
  1871  		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
  1872  		// ignore the bits for now.
  1873  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1874  		e.UnlockUser()
  1875  
  1876  	case tcpip.IPv6TrafficClassOption:
  1877  		e.LockUser()
  1878  		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
  1879  		// ignore the bits for now.
  1880  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1881  		e.UnlockUser()
  1882  
  1883  	case tcpip.MaxSegOption:
  1884  		userMSS := v
  1885  		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
  1886  			return &tcpip.ErrInvalidOptionValue{}
  1887  		}
  1888  		e.LockUser()
  1889  		e.userMSS = uint16(userMSS)
  1890  		e.UnlockUser()
  1891  
  1892  	case tcpip.MTUDiscoverOption:
  1893  		// Return not supported if attempting to set this option to
  1894  		// anything other than path MTU discovery disabled.
  1895  		if v != tcpip.PMTUDiscoveryDont {
  1896  			return &tcpip.ErrNotSupported{}
  1897  		}
  1898  
  1899  	case tcpip.IPv4TTLOption:
  1900  		e.LockUser()
  1901  		e.ipv4TTL = uint8(v)
  1902  		e.UnlockUser()
  1903  
  1904  	case tcpip.IPv6HopLimitOption:
  1905  		e.LockUser()
  1906  		e.ipv6HopLimit = int16(v)
  1907  		e.UnlockUser()
  1908  
  1909  	case tcpip.TCPSynCountOption:
  1910  		if v < 1 || v > 255 {
  1911  			return &tcpip.ErrInvalidOptionValue{}
  1912  		}
  1913  		e.LockUser()
  1914  		e.maxSynRetries = uint8(v)
  1915  		e.UnlockUser()
  1916  
  1917  	case tcpip.TCPWindowClampOption:
  1918  		if v == 0 {
  1919  			e.LockUser()
  1920  			switch e.EndpointState() {
  1921  			case StateClose, StateInitial:
  1922  				e.windowClamp = 0
  1923  				e.UnlockUser()
  1924  				return nil
  1925  			default:
  1926  				e.UnlockUser()
  1927  				return &tcpip.ErrInvalidOptionValue{}
  1928  			}
  1929  		}
  1930  		var rs tcpip.TCPReceiveBufferSizeRangeOption
  1931  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
  1932  			if v < rs.Min/2 {
  1933  				v = rs.Min / 2
  1934  			}
  1935  		}
  1936  		e.LockUser()
  1937  		e.windowClamp = uint32(v)
  1938  		e.UnlockUser()
  1939  	}
  1940  	return nil
  1941  }
  1942  
  1943  // HasNIC returns true if the NICID is defined in the stack or id is 0.
  1944  func (e *Endpoint) HasNIC(id int32) bool {
  1945  	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
  1946  }
  1947  
  1948  // SetSockOpt sets a socket option.
  1949  func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
  1950  	switch v := opt.(type) {
  1951  	case *tcpip.KeepaliveIdleOption:
  1952  		e.LockUser()
  1953  		e.keepalive.Lock()
  1954  		e.keepalive.idle = time.Duration(*v)
  1955  		e.keepalive.Unlock()
  1956  		e.resetKeepaliveTimer(true /* receivedData */)
  1957  		e.UnlockUser()
  1958  
  1959  	case *tcpip.KeepaliveIntervalOption:
  1960  		e.LockUser()
  1961  		e.keepalive.Lock()
  1962  		e.keepalive.interval = time.Duration(*v)
  1963  		e.keepalive.Unlock()
  1964  		e.resetKeepaliveTimer(true /* receivedData */)
  1965  		e.UnlockUser()
  1966  
  1967  	case *tcpip.TCPUserTimeoutOption:
  1968  		e.LockUser()
  1969  		e.userTimeout = time.Duration(*v)
  1970  		e.UnlockUser()
  1971  
  1972  	case *tcpip.CongestionControlOption:
  1973  		// Query the available cc algorithms in the stack and
  1974  		// validate that the specified algorithm is actually
  1975  		// supported in the stack.
  1976  		var avail tcpip.TCPAvailableCongestionControlOption
  1977  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
  1978  			return err
  1979  		}
  1980  		availCC := strings.Split(string(avail), " ")
  1981  		for _, cc := range availCC {
  1982  			if *v == tcpip.CongestionControlOption(cc) {
  1983  				e.LockUser()
  1984  				state := e.EndpointState()
  1985  				e.cc = *v
  1986  				switch state {
  1987  				case StateEstablished:
  1988  					if e.EndpointState() == state {
  1989  						e.snd.cc = e.snd.initCongestionControl(e.cc)
  1990  					}
  1991  				}
  1992  				e.UnlockUser()
  1993  				return nil
  1994  			}
  1995  		}
  1996  
  1997  		// Linux returns ENOENT when an invalid congestion
  1998  		// control algorithm is specified.
  1999  		return &tcpip.ErrNoSuchFile{}
  2000  
  2001  	case *tcpip.TCPLingerTimeoutOption:
  2002  		e.LockUser()
  2003  
  2004  		switch {
  2005  		case *v < 0:
  2006  			// Same as effectively disabling TCPLinger timeout.
  2007  			*v = -1
  2008  		case *v == 0:
  2009  			// Same as the stack default.
  2010  			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
  2011  			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
  2012  				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
  2013  			}
  2014  			*v = stackLingerTimeout
  2015  		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
  2016  			// Cap it to Stack's default TCP_LINGER2 timeout.
  2017  			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
  2018  		default:
  2019  		}
  2020  
  2021  		e.tcpLingerTimeout = time.Duration(*v)
  2022  		e.UnlockUser()
  2023  
  2024  	case *tcpip.TCPDeferAcceptOption:
  2025  		e.LockUser()
  2026  		if time.Duration(*v) > MaxRTO {
  2027  			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
  2028  		}
  2029  		e.deferAccept = time.Duration(*v)
  2030  		e.UnlockUser()
  2031  
  2032  	case *tcpip.SocketDetachFilterOption:
  2033  		return nil
  2034  
  2035  	default:
  2036  		return nil
  2037  	}
  2038  	return nil
  2039  }
  2040  
  2041  // readyReceiveSize returns the number of bytes ready to be received.
  2042  func (e *Endpoint) readyReceiveSize() (int, tcpip.Error) {
  2043  	e.LockUser()
  2044  	defer e.UnlockUser()
  2045  
  2046  	// The endpoint cannot be in listen state.
  2047  	if e.EndpointState() == StateListen {
  2048  		return 0, &tcpip.ErrInvalidEndpointState{}
  2049  	}
  2050  
  2051  	e.rcvQueueMu.Lock()
  2052  	defer e.rcvQueueMu.Unlock()
  2053  
  2054  	return e.RcvBufUsed, nil
  2055  }
  2056  
  2057  // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
  2058  func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
  2059  	switch opt {
  2060  	case tcpip.KeepaliveCountOption:
  2061  		e.keepalive.Lock()
  2062  		v := e.keepalive.count
  2063  		e.keepalive.Unlock()
  2064  		return v, nil
  2065  
  2066  	case tcpip.IPv4TOSOption:
  2067  		e.LockUser()
  2068  		v := int(e.sendTOS)
  2069  		e.UnlockUser()
  2070  		return v, nil
  2071  
  2072  	case tcpip.IPv6TrafficClassOption:
  2073  		e.LockUser()
  2074  		v := int(e.sendTOS)
  2075  		e.UnlockUser()
  2076  		return v, nil
  2077  
  2078  	case tcpip.MaxSegOption:
  2079  		// Linux only returns user_mss value if user_mss is set and the socket is
  2080  		// unconnected. Otherwise Linux returns the actual current MSS. Netstack
  2081  		// mimics the user_mss behavior, but otherwise just returns the defaultMSS
  2082  		// for now.
  2083  		v := header.TCPDefaultMSS
  2084  		e.LockUser()
  2085  		if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) {
  2086  			v = int(e.userMSS)
  2087  		}
  2088  		e.UnlockUser()
  2089  		return v, nil
  2090  
  2091  	case tcpip.MTUDiscoverOption:
  2092  		// Always return the path MTU discovery disabled setting since
  2093  		// it's the only one supported.
  2094  		return tcpip.PMTUDiscoveryDont, nil
  2095  
  2096  	case tcpip.ReceiveQueueSizeOption:
  2097  		return e.readyReceiveSize()
  2098  
  2099  	case tcpip.IPv4TTLOption:
  2100  		e.LockUser()
  2101  		v := int(e.ipv4TTL)
  2102  		e.UnlockUser()
  2103  		return v, nil
  2104  
  2105  	case tcpip.IPv6HopLimitOption:
  2106  		e.LockUser()
  2107  		v := int(e.ipv6HopLimit)
  2108  		e.UnlockUser()
  2109  		return v, nil
  2110  
  2111  	case tcpip.TCPSynCountOption:
  2112  		e.LockUser()
  2113  		v := int(e.maxSynRetries)
  2114  		e.UnlockUser()
  2115  		return v, nil
  2116  
  2117  	case tcpip.TCPWindowClampOption:
  2118  		e.LockUser()
  2119  		v := int(e.windowClamp)
  2120  		e.UnlockUser()
  2121  		return v, nil
  2122  
  2123  	case tcpip.MulticastTTLOption:
  2124  		return 1, nil
  2125  
  2126  	default:
  2127  		return -1, &tcpip.ErrUnknownProtocolOption{}
  2128  	}
  2129  }
  2130  
  2131  func (e *Endpoint) getTCPInfo() tcpip.TCPInfoOption {
  2132  	info := tcpip.TCPInfoOption{}
  2133  	e.LockUser()
  2134  	if state := e.EndpointState(); state.internal() {
  2135  		info.State = tcpip.EndpointState(StateClose)
  2136  	} else {
  2137  		info.State = tcpip.EndpointState(state)
  2138  	}
  2139  	snd := e.snd
  2140  	if snd != nil {
  2141  		// We do not calculate RTT before sending the data packets. If
  2142  		// the connection did not send and receive data, then RTT will
  2143  		// be zero.
  2144  		snd.rtt.Lock()
  2145  		info.RTT = snd.rtt.TCPRTTState.SRTT
  2146  		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
  2147  		snd.rtt.Unlock()
  2148  
  2149  		info.RTO = snd.RTO
  2150  		info.CcState = snd.state
  2151  		info.SndSsthresh = uint32(snd.Ssthresh)
  2152  		info.SndCwnd = uint32(snd.SndCwnd)
  2153  		info.ReorderSeen = snd.rc.Reord
  2154  	}
  2155  	e.UnlockUser()
  2156  	return info
  2157  }
  2158  
  2159  // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
  2160  func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
  2161  	switch o := opt.(type) {
  2162  	case *tcpip.TCPInfoOption:
  2163  		*o = e.getTCPInfo()
  2164  
  2165  	case *tcpip.KeepaliveIdleOption:
  2166  		e.keepalive.Lock()
  2167  		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
  2168  		e.keepalive.Unlock()
  2169  
  2170  	case *tcpip.KeepaliveIntervalOption:
  2171  		e.keepalive.Lock()
  2172  		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
  2173  		e.keepalive.Unlock()
  2174  
  2175  	case *tcpip.TCPUserTimeoutOption:
  2176  		e.LockUser()
  2177  		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
  2178  		e.UnlockUser()
  2179  
  2180  	case *tcpip.CongestionControlOption:
  2181  		e.LockUser()
  2182  		*o = e.cc
  2183  		e.UnlockUser()
  2184  
  2185  	case *tcpip.TCPLingerTimeoutOption:
  2186  		e.LockUser()
  2187  		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
  2188  		e.UnlockUser()
  2189  
  2190  	case *tcpip.TCPDeferAcceptOption:
  2191  		e.LockUser()
  2192  		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
  2193  		e.UnlockUser()
  2194  
  2195  	case *tcpip.OriginalDestinationOption:
  2196  		e.LockUser()
  2197  		ipt := e.stack.IPTables()
  2198  		addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber)
  2199  		e.UnlockUser()
  2200  		if err != nil {
  2201  			return err
  2202  		}
  2203  		*o = tcpip.OriginalDestinationOption{
  2204  			Addr: addr,
  2205  			Port: port,
  2206  		}
  2207  
  2208  	default:
  2209  		return &tcpip.ErrUnknownProtocolOption{}
  2210  	}
  2211  	return nil
  2212  }
  2213  
  2214  // checkV4MappedLocked determines the effective network protocol and converts
  2215  // addr to its canonical form.
  2216  // +checklocks:e.mu
  2217  func (e *Endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
  2218  	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
  2219  	if err != nil {
  2220  		return tcpip.FullAddress{}, 0, err
  2221  	}
  2222  	return unwrapped, netProto, nil
  2223  }
  2224  
  2225  // Disconnect implements tcpip.Endpoint.Disconnect.
  2226  func (*Endpoint) Disconnect() tcpip.Error {
  2227  	return &tcpip.ErrNotSupported{}
  2228  }
  2229  
  2230  // Connect connects the endpoint to its peer.
  2231  func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
  2232  	e.LockUser()
  2233  	defer e.UnlockUser()
  2234  	err := e.connect(addr, true)
  2235  	if err != nil {
  2236  		if !err.IgnoreStats() {
  2237  			// Connect failed. Let's wake up any waiters.
  2238  			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  2239  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  2240  			e.stats.FailedConnectionAttempts.Increment()
  2241  		}
  2242  	}
  2243  	return err
  2244  }
  2245  
  2246  // registerEndpoint registers the endpoint with the provided address.
  2247  //
  2248  // +checklocks:e.mu
  2249  func (e *Endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error {
  2250  	netProtos := []tcpip.NetworkProtocolNumber{netProto}
  2251  	if e.TransportEndpointInfo.ID.LocalPort != 0 {
  2252  		// The endpoint is bound to a port, attempt to register it.
  2253  		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
  2254  		if err != nil {
  2255  			return err
  2256  		}
  2257  	} else {
  2258  		// The endpoint doesn't have a local port yet, so try to get
  2259  		// one. Make sure that it isn't one that will result in the same
  2260  		// address/port for both local and remote (otherwise this
  2261  		// endpoint would be trying to connect to itself).
  2262  		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress
  2263  
  2264  		var twReuse tcpip.TCPTimeWaitReuseOption
  2265  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
  2266  			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
  2267  		}
  2268  
  2269  		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
  2270  		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
  2271  			switch netProto {
  2272  			case header.IPv4ProtocolNumber:
  2273  				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
  2274  			case header.IPv6ProtocolNumber:
  2275  				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
  2276  			}
  2277  		}
  2278  
  2279  		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
  2280  		if _, err := e.stack.PickEphemeralPort(e.stack.SecureRNG(), func(p uint16) (bool, tcpip.Error) {
  2281  			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
  2282  				return false, nil
  2283  			}
  2284  			portRes := ports.Reservation{
  2285  				Networks:     netProtos,
  2286  				Transport:    ProtocolNumber,
  2287  				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  2288  				Port:         p,
  2289  				Flags:        e.portFlags,
  2290  				BindToDevice: bindToDevice,
  2291  				Dest:         addr,
  2292  			}
  2293  			if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
  2294  				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
  2295  					return false, nil
  2296  				}
  2297  				transEPID := e.TransportEndpointInfo.ID
  2298  				transEPID.LocalPort = p
  2299  				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
  2300  				// we can reuse it. If we can't find a transport endpoint then we just
  2301  				// skip using this port as it's possible that either an endpoint has
  2302  				// bound the port but not registered with demuxer yet (no listen/connect
  2303  				// done yet) or the reservation was freed between the check above and
  2304  				// the FindTransportEndpoint below. But rather than retry the same port
  2305  				// we just skip it and move on.
  2306  				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID)
  2307  				if transEP == nil {
  2308  					// ReservePort failed but there is no registered endpoint with
  2309  					// demuxer. Which indicates there is at least some endpoint that has
  2310  					// bound the port.
  2311  					return false, nil
  2312  				}
  2313  
  2314  				tcpEP := transEP.(*Endpoint)
  2315  				tcpEP.LockUser()
  2316  				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
  2317  				// less than 1 second has elapsed since its recentTS was updated then
  2318  				// we cannot reuse the port.
  2319  				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
  2320  					tcpEP.UnlockUser()
  2321  					return false, nil
  2322  				}
  2323  				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
  2324  				// Lock while holding the lock for this endpoint as endpoints in
  2325  				// TIME-WAIT do not acquire locks on other endpoints.
  2326  				tcpEP.transitionToStateCloseLocked()
  2327  				tcpEP.drainClosingSegmentQueue()
  2328  				tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  2329  				tcpEP.UnlockUser()
  2330  				// Now try and Reserve again if it fails then we skip.
  2331  				portRes := ports.Reservation{
  2332  					Networks:     netProtos,
  2333  					Transport:    ProtocolNumber,
  2334  					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  2335  					Port:         p,
  2336  					Flags:        e.portFlags,
  2337  					BindToDevice: bindToDevice,
  2338  					Dest:         addr,
  2339  				}
  2340  				if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
  2341  					return false, nil
  2342  				}
  2343  			}
  2344  
  2345  			id := e.TransportEndpointInfo.ID
  2346  			id.LocalPort = p
  2347  			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
  2348  				portRes := ports.Reservation{
  2349  					Networks:     netProtos,
  2350  					Transport:    ProtocolNumber,
  2351  					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  2352  					Port:         p,
  2353  					Flags:        e.portFlags,
  2354  					BindToDevice: bindToDevice,
  2355  					Dest:         addr,
  2356  				}
  2357  				e.stack.ReleasePort(portRes)
  2358  				if _, ok := err.(*tcpip.ErrPortInUse); ok {
  2359  					return false, nil
  2360  				}
  2361  				return false, err
  2362  			}
  2363  
  2364  			// Port picking successful. Save the details of
  2365  			// the selected port.
  2366  			e.TransportEndpointInfo.ID = id
  2367  			e.isPortReserved = true
  2368  			e.boundBindToDevice = bindToDevice
  2369  			e.boundPortFlags = e.portFlags
  2370  			e.boundDest = addr
  2371  			return true, nil
  2372  		}); err != nil {
  2373  			e.stack.Stats().TCP.FailedPortReservations.Increment()
  2374  			return err
  2375  		}
  2376  	}
  2377  	return nil
  2378  }
  2379  
  2380  // connect connects the endpoint to its peer.
  2381  // +checklocks:e.mu
  2382  func (e *Endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error {
  2383  	connectingAddr := addr.Addr
  2384  
  2385  	addr, netProto, err := e.checkV4MappedLocked(addr)
  2386  	if err != nil {
  2387  		return err
  2388  	}
  2389  
  2390  	if e.EndpointState().connected() {
  2391  		// The endpoint is already connected. If caller hasn't been
  2392  		// notified yet, return success.
  2393  		if !e.isConnectNotified {
  2394  			e.isConnectNotified = true
  2395  			return nil
  2396  		}
  2397  		// Otherwise return that it's already connected.
  2398  		return &tcpip.ErrAlreadyConnected{}
  2399  	}
  2400  
  2401  	nicID := addr.NIC
  2402  	switch e.EndpointState() {
  2403  	case StateBound:
  2404  		// If we're already bound to a NIC but the caller is requesting
  2405  		// that we use a different one now, we cannot proceed.
  2406  		if e.boundNICID == 0 {
  2407  			break
  2408  		}
  2409  
  2410  		if nicID != 0 && nicID != e.boundNICID {
  2411  			return &tcpip.ErrHostUnreachable{}
  2412  		}
  2413  
  2414  		nicID = e.boundNICID
  2415  
  2416  	case StateInitial:
  2417  		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
  2418  		// when we find a route.
  2419  
  2420  	case StateConnecting, StateSynSent, StateSynRecv:
  2421  		// A connection request has already been issued but hasn't completed
  2422  		// yet.
  2423  		return &tcpip.ErrAlreadyConnecting{}
  2424  
  2425  	case StateError:
  2426  		if err := e.hardErrorLocked(); err != nil {
  2427  			return err
  2428  		}
  2429  		return &tcpip.ErrConnectionAborted{}
  2430  
  2431  	default:
  2432  		return &tcpip.ErrInvalidEndpointState{}
  2433  	}
  2434  
  2435  	// Find a route to the desired destination.
  2436  	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
  2437  	if err != nil {
  2438  		return err
  2439  	}
  2440  	defer r.Release()
  2441  
  2442  	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
  2443  	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
  2444  	e.TransportEndpointInfo.ID.RemotePort = addr.Port
  2445  
  2446  	oldState := e.EndpointState()
  2447  	e.setEndpointState(StateConnecting)
  2448  	if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil {
  2449  		e.setEndpointState(oldState)
  2450  		if _, ok := err.(*tcpip.ErrPortInUse); ok {
  2451  			return &tcpip.ErrBadLocalAddress{}
  2452  		}
  2453  		return err
  2454  	}
  2455  
  2456  	e.isRegistered = true
  2457  	r.Acquire()
  2458  	e.route = r
  2459  	e.boundNICID = nicID
  2460  	e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto}
  2461  	e.connectingAddress = connectingAddr
  2462  
  2463  	e.initGSO()
  2464  
  2465  	// Connect in the restore phase does not perform handshake. Restore its
  2466  	// connection setting here.
  2467  	if !handshake {
  2468  		e.segmentQueue.mu.Lock()
  2469  		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
  2470  			for s := l.Front(); s != nil; s = s.Next() {
  2471  				s.id = e.TransportEndpointInfo.ID
  2472  				e.sndQueueInfo.sndWaker.Assert()
  2473  			}
  2474  		}
  2475  		e.segmentQueue.mu.Unlock()
  2476  		e.snd.ep.AssertLockHeld(e)
  2477  		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
  2478  		e.setEndpointState(StateEstablished)
  2479  		// Set the new auto tuned send buffer size after entering
  2480  		// established state.
  2481  		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
  2482  		return &tcpip.ErrConnectStarted{}
  2483  	}
  2484  
  2485  	// Start a new handshake.
  2486  	h := e.newHandshake()
  2487  	e.setEndpointState(StateSynSent)
  2488  	h.start()
  2489  	e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
  2490  
  2491  	return &tcpip.ErrConnectStarted{}
  2492  }
  2493  
  2494  // ConnectEndpoint is not supported.
  2495  func (*Endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
  2496  	return &tcpip.ErrInvalidEndpointState{}
  2497  }
  2498  
  2499  // Shutdown closes the read and/or write end of the endpoint connection to its
  2500  // peer.
  2501  func (e *Endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
  2502  	e.LockUser()
  2503  	defer e.UnlockUser()
  2504  
  2505  	if e.EndpointState().connecting() {
  2506  		// When calling shutdown(2) on a connecting socket, the endpoint must
  2507  		// enter the error state. But this logic cannot belong to the shutdownLocked
  2508  		// method because that method is called during a close(2) (and closing a
  2509  		// connecting socket is not an error).
  2510  		e.handshakeFailed(&tcpip.ErrConnectionReset{})
  2511  		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
  2512  		return nil
  2513  	}
  2514  
  2515  	return e.shutdownLocked(flags)
  2516  }
  2517  
// shutdownLocked merges flags into e.shutdownFlags and then closes the
// requested direction(s) of the endpoint according to its current state.
//
// For a connected endpoint, ShutdownRead marks the receive side closed (and
// resets the connection when the write side is shut down as well and unread
// data remains), while ShutdownWrite queues a FIN segment and marks the send
// side closed. For a listening endpoint, ShutdownRead marks the receive side
// closed and closes the pending acceptable connections. In any other state
// ErrNotConnected is returned.
//
// +checklocks:e.mu
func (e *Endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	// Flags accumulate across calls, so every check below tests the combined
	// e.shutdownFlags rather than only the flags passed to this call.
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that may be waiting for the stream to become
			// readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, the write end won't close and
				// we end up with a half-closed connection.
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed. A repeated write shutdown is only reported
				// as an error once the endpoint has reached TIME-WAIT.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment: an outgoing segment with an empty payload is
			// appended to the sender's write list.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that may be waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		// Neither connected nor listening: there is nothing to shut down.
		return &tcpip.ErrNotConnected{}
	}
}
  2598  
  2599  // Listen puts the endpoint in "listen" mode, which allows it to accept
  2600  // new connections.
  2601  func (e *Endpoint) Listen(backlog int) tcpip.Error {
  2602  	if err := e.listen(backlog); err != nil {
  2603  		if !err.IgnoreStats() {
  2604  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  2605  			e.stats.FailedConnectionAttempts.Increment()
  2606  		}
  2607  		return err
  2608  	}
  2609  	return nil
  2610  }
  2611  
// listen does the work of Listen without the failure accounting.
//
// If the endpoint is already listening and not closed, only the backlog is
// adjusted and the shutdown state is cleared. Otherwise the endpoint (after
// binding itself if it is still in the initial state) must be in StateBound;
// it is then moved to StateListen, registered with the stack and given a new
// listen context. ErrInvalidEndpointState is returned when the endpoint is in
// any other state, or when the new backlog cannot hold the connections that
// are already queued.
func (e *Endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
		}

		// Undo any earlier shutdown of this listening endpoint.
		e.shutdownFlags = 0
		// NOTE(review): updateConnDirectionState ORs its argument into the
		// current state, so if connDirectionStateOpen is the zero value this
		// call cannot clear previously set bits — confirm this is intended.
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		// Registration failed: leave the listen state again before reporting
		// the error.
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
	}
	// Only a zero capacity is overwritten; a non-zero capacity is kept as is.
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}
  2686  
  2687  // Accept returns a new endpoint if a peer has established a connection
  2688  // to an endpoint previously set to listen mode.
  2689  //
  2690  // addr if not-nil will contain the peer address of the returned endpoint.
  2691  func (e *Endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
  2692  	e.LockUser()
  2693  	defer e.UnlockUser()
  2694  
  2695  	e.rcvQueueMu.Lock()
  2696  	rcvClosed := e.RcvClosed
  2697  	e.rcvQueueMu.Unlock()
  2698  	// Endpoint must be in listen state before it can accept connections.
  2699  	if rcvClosed || e.EndpointState() != StateListen {
  2700  		return nil, nil, &tcpip.ErrInvalidEndpointState{}
  2701  	}
  2702  
  2703  	// Get the new accepted endpoint.
  2704  	var n *Endpoint
  2705  	e.acceptMu.Lock()
  2706  	if element := e.acceptQueue.endpoints.Front(); element != nil {
  2707  		n = e.acceptQueue.endpoints.Remove(element).(*Endpoint)
  2708  	}
  2709  	e.acceptMu.Unlock()
  2710  	if n == nil {
  2711  		return nil, nil, &tcpip.ErrWouldBlock{}
  2712  	}
  2713  	if peerAddr != nil {
  2714  		*peerAddr = n.getRemoteAddress()
  2715  	}
  2716  	return n, n.waiterQueue, nil
  2717  }
  2718  
  2719  // Bind binds the endpoint to a specific local port and optionally address.
  2720  func (e *Endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
  2721  	e.LockUser()
  2722  	defer e.UnlockUser()
  2723  
  2724  	return e.bindLocked(addr)
  2725  }
  2726  
// bindLocked binds the endpoint to addr: it validates the address, reserves a
// local port with the stack and, on success, records the binding and moves
// the endpoint to StateBound.
//
// It returns ErrAlreadyBound if the endpoint is not in StateInitial,
// ErrBadLocalAddress if a specific address was requested that is not one of
// the stack's local addresses, and otherwise whatever error address checking
// or port reservation produced.
//
// +checklocks:e.mu
func (e *Endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	// The address is recorded as given by the caller, before
	// checkV4MappedLocked possibly rewrites addr below.
	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	// nic stays zero when no specific address was requested.
	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	// The callback is handed a candidate port p and reports whether it is
	// acceptable for this endpoint.
	port, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			// The conflict is reported as "port not usable" rather than as an
			// error from the callback.
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	// Remember the options the reservation was made with.
	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}
  2810  
  2811  // GetLocalAddress returns the address to which the endpoint is bound.
  2812  func (e *Endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
  2813  	e.LockUser()
  2814  	defer e.UnlockUser()
  2815  
  2816  	return tcpip.FullAddress{
  2817  		Addr: e.TransportEndpointInfo.ID.LocalAddress,
  2818  		Port: e.TransportEndpointInfo.ID.LocalPort,
  2819  		NIC:  e.boundNICID,
  2820  	}, nil
  2821  }
  2822  
  2823  // GetRemoteAddress returns the address to which the endpoint is connected.
  2824  func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
  2825  	e.LockUser()
  2826  	defer e.UnlockUser()
  2827  
  2828  	if !e.EndpointState().connected() {
  2829  		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
  2830  	}
  2831  
  2832  	return e.getRemoteAddress(), nil
  2833  }
  2834  
  2835  func (e *Endpoint) getRemoteAddress() tcpip.FullAddress {
  2836  	return tcpip.FullAddress{
  2837  		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
  2838  		Port: e.TransportEndpointInfo.ID.RemotePort,
  2839  		NIC:  e.boundNICID,
  2840  	}
  2841  }
  2842  
  2843  // HandlePacket implements stack.TransportEndpoint.HandlePacket.
  2844  func (*Endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
  2845  	// TCP HandlePacket is not required anymore as inbound packets first
  2846  	// land at the Dispatcher which then can either deliver using the
  2847  	// worker go routine or directly do the invoke the tcp processing inline
  2848  	// based on the state of the endpoint.
  2849  }
  2850  
  2851  func (e *Endpoint) enqueueSegment(s *segment) bool {
  2852  	// Send packet to worker goroutine.
  2853  	if !e.segmentQueue.enqueue(s) {
  2854  		// The queue is full, so we drop the segment.
  2855  		e.stack.Stats().DroppedPackets.Increment()
  2856  		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
  2857  		return false
  2858  	}
  2859  	return true
  2860  }
  2861  
// onICMPError handles a transport error reported for this endpoint.
//
// It records err as the endpoint's last error and, when the receive-error
// socket option for the packet's network protocol is enabled
// (GetIPv4RecvError / GetIPv6RecvError), queues a SockError describing it. If
// the endpoint is still connecting, the connection attempt is failed: the
// endpoint is cleaned up, err becomes its hard error, it is moved to
// StateError and waiters are notified.
//
// It panics if pkt carries a network protocol other than IPv4 or IPv6.
func (e *Endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	// Pick the receive-error option that matches the packet's protocol.
	var recvErr bool
	switch pkt.NetworkProtocolNumber {
	case header.IPv4ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv4RecvError()
	case header.IPv6ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv6RecvError()
	default:
		panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber))
	}

	if recvErr {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToView(),
			// Dst is this endpoint's remote side and Offender its local side,
			// both tagged with the NIC the packet arrived on.
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	// NOTE(review): the connecting check is made before e.mu is taken, and
	// e.h is dereferenced without a nil check — presumably e.h is always set
	// while the endpoint is connecting; confirm.
	if e.EndpointState().connecting() {
		e.mu.Lock()
		if lEP := e.h.listenEP; lEP != nil {
			// Remove from listening endpoints pending list.
			lEP.acceptMu.Lock()
			delete(lEP.acceptQueue.pendingEndpoints, e)
			lEP.acceptMu.Unlock()
			lEP.stats.FailedConnectionAttempts.Increment()
		}
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.cleanupLocked()
		e.hardError = err
		e.setEndpointState(StateError)
		e.mu.Unlock()
		// Draining and notification happen after e.mu has been released.
		e.drainClosingSegmentQueue()
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
	}
}
  2917  
  2918  // HandleError implements stack.TransportEndpoint.
  2919  func (e *Endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) {
  2920  	handlePacketTooBig := func(mtu uint32) {
  2921  		e.sndQueueInfo.sndQueueMu.Lock()
  2922  		update := false
  2923  		if v := int(mtu); v < e.sndQueueInfo.SndMTU {
  2924  			e.sndQueueInfo.SndMTU = v
  2925  			update = true
  2926  		}
  2927  		newMTU := e.sndQueueInfo.SndMTU
  2928  		e.sndQueueInfo.sndQueueMu.Unlock()
  2929  		if update {
  2930  			e.mu.Lock()
  2931  			defer e.mu.Unlock()
  2932  			if e.snd != nil {
  2933  				e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu
  2934  			}
  2935  		}
  2936  	}
  2937  
  2938  	// TODO(gvisor.dev/issues/5270): Handle all transport errors.
  2939  	switch transErr.Kind() {
  2940  	case stack.PacketTooBigTransportError:
  2941  		handlePacketTooBig(transErr.Info())
  2942  	case stack.DestinationHostUnreachableTransportError:
  2943  		e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt)
  2944  	case stack.DestinationNetworkUnreachableTransportError:
  2945  		e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt)
  2946  	case stack.DestinationPortUnreachableTransportError:
  2947  		e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt)
  2948  	case stack.DestinationProtoUnreachableTransportError:
  2949  		e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt)
  2950  	case stack.SourceRouteFailedTransportError:
  2951  		e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt)
  2952  	case stack.SourceHostIsolatedTransportError:
  2953  		e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt)
  2954  	case stack.DestinationHostDownTransportError:
  2955  		e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt)
  2956  	}
  2957  }
  2958  
  2959  // updateSndBufferUsage is called by the protocol goroutine when room opens up
  2960  // in the send buffer. The number of newly available bytes is v.
  2961  func (e *Endpoint) updateSndBufferUsage(v int) {
  2962  	sendBufferSize := e.getSendBufferSize()
  2963  	e.sndQueueInfo.sndQueueMu.Lock()
  2964  	notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1
  2965  	e.sndQueueInfo.SndBufUsed -= v
  2966  
  2967  	// Get the new send buffer size with auto tuning, but do not set it
  2968  	// unless we decide to notify the writers.
  2969  	newSndBufSz := e.computeTCPSendBufferSize()
  2970  
  2971  	// We only notify when there is half the sendBufferSize available after
  2972  	// a full buffer event occurs. This ensures that we don't wake up
  2973  	// writers to queue just 1-2 segments and go back to sleep.
  2974  	notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1
  2975  	e.sndQueueInfo.sndQueueMu.Unlock()
  2976  
  2977  	if notify {
  2978  		// Set the new send buffer size calculated from auto tuning.
  2979  		e.ops.SetSendBufferSize(newSndBufSz, false /* notify */)
  2980  		e.waiterQueue.Notify(waiter.WritableEvents)
  2981  	}
  2982  }
  2983  
  2984  // readyToRead is called by the protocol goroutine when a new segment is ready
  2985  // to be read, or when the connection is closed for receiving (in which case
  2986  // s will be nil).
  2987  //
  2988  // +checklocks:e.mu
  2989  func (e *Endpoint) readyToRead(s *segment) {
  2990  	e.rcvQueueMu.Lock()
  2991  	if s != nil {
  2992  		e.RcvBufUsed += s.payloadSize()
  2993  		s.IncRef()
  2994  		e.rcvQueue.PushBack(s)
  2995  	} else {
  2996  		e.RcvClosed = true
  2997  	}
  2998  	e.rcvQueueMu.Unlock()
  2999  	e.waiterQueue.Notify(waiter.ReadableEvents)
  3000  }
  3001  
  3002  // receiveBufferAvailableLocked calculates how many bytes are still available
  3003  // in the receive buffer.
  3004  // +checklocks:e.rcvQueueMu
  3005  func (e *Endpoint) receiveBufferAvailableLocked(rcvBufSize int) int {
  3006  	// We may use more bytes than the buffer size when the receive buffer
  3007  	// shrinks.
  3008  	memUsed := e.receiveMemUsed()
  3009  	if memUsed >= rcvBufSize {
  3010  		return 0
  3011  	}
  3012  
  3013  	return rcvBufSize - memUsed
  3014  }
  3015  
  3016  // receiveBufferAvailable calculates how many bytes are still available in the
  3017  // receive buffer based on the actual memory used by all segments held in
  3018  // receive buffer/pending and segment queue.
  3019  func (e *Endpoint) receiveBufferAvailable() int {
  3020  	e.rcvQueueMu.Lock()
  3021  	available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize()))
  3022  	e.rcvQueueMu.Unlock()
  3023  	return available
  3024  }
  3025  
  3026  // receiveBufferUsed returns the amount of in-use receive buffer.
  3027  func (e *Endpoint) receiveBufferUsed() int {
  3028  	e.rcvQueueMu.Lock()
  3029  	used := e.RcvBufUsed
  3030  	e.rcvQueueMu.Unlock()
  3031  	return used
  3032  }
  3033  
  3034  // receiveMemUsed returns the total memory in use by segments held by this
  3035  // endpoint.
  3036  func (e *Endpoint) receiveMemUsed() int {
  3037  	return int(e.rcvMemUsed.Load())
  3038  }
  3039  
  3040  // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
  3041  func (e *Endpoint) updateReceiveMemUsed(delta int) {
  3042  	e.rcvMemUsed.Add(int32(delta))
  3043  }
  3044  
  3045  // maxReceiveBufferSize returns the stack wide maximum receive buffer size for
  3046  // an endpoint.
  3047  func (e *Endpoint) maxReceiveBufferSize() int {
  3048  	var rs tcpip.TCPReceiveBufferSizeRangeOption
  3049  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
  3050  		// As a fallback return the hardcoded max buffer size.
  3051  		return MaxBufferSize
  3052  	}
  3053  	return rs.Max
  3054  }
  3055  
  3056  // directionState returns the close state of send and receive part of the endpoint
  3057  func (e *Endpoint) connDirectionState() connDirectionState {
  3058  	return connDirectionState(e.connectionDirectionState.Load())
  3059  }
  3060  
  3061  // updateDirectionState updates the close state of send and receive part of the endpoint
  3062  func (e *Endpoint) updateConnDirectionState(state connDirectionState) connDirectionState {
  3063  	return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state)))
  3064  }
  3065  
  3066  // rcvWndScaleForHandshake computes the receive window scale to offer to the
  3067  // peer when window scaling is enabled (true by default). If auto-tuning is
  3068  // disabled then the window scaling factor is based on the size of the
  3069  // receiveBuffer otherwise we use the max permissible receive buffer size to
  3070  // compute the scale.
  3071  func (e *Endpoint) rcvWndScaleForHandshake() int {
  3072  	bufSizeForScale := e.ops.GetReceiveBufferSize()
  3073  
  3074  	e.rcvQueueMu.Lock()
  3075  	autoTuningDisabled := e.RcvAutoParams.Disabled
  3076  	e.rcvQueueMu.Unlock()
  3077  	if autoTuningDisabled {
  3078  		return FindWndScale(seqnum.Size(bufSizeForScale))
  3079  	}
  3080  
  3081  	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
  3082  }
  3083  
  3084  // updateRecentTimestamp updates the recent timestamp using the algorithm
  3085  // described in https://tools.ietf.org/html/rfc7323#section-4.3
  3086  func (e *Endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
  3087  	if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
  3088  		e.setRecentTimestamp(tsVal)
  3089  	}
  3090  }
  3091  
  3092  // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
  3093  // the SYN options indicate that timestamp option was negotiated. It also
  3094  // initializes the recentTS with the value provided in synOpts.TSval.
  3095  func (e *Endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) {
  3096  	if synOpts.TS {
  3097  		e.SendTSOk = true
  3098  		e.setRecentTimestamp(synOpts.TSVal)
  3099  	}
  3100  }
  3101  
  3102  func (e *Endpoint) tsVal(now tcpip.MonotonicTime) uint32 {
  3103  	return e.TSOffset.TSVal(now)
  3104  }
  3105  
  3106  func (e *Endpoint) tsValNow() uint32 {
  3107  	return e.tsVal(e.stack.Clock().NowMonotonic())
  3108  }
  3109  
  3110  func (e *Endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration {
  3111  	return e.TSOffset.Elapsed(now, tsEcr)
  3112  }
  3113  
  3114  // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
  3115  // if the SYN options indicate that the SACK option was negotiated and the TCP
  3116  // stack is configured to enable TCP SACK option.
  3117  func (e *Endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) {
  3118  	var v tcpip.TCPSACKEnabled
  3119  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
  3120  		// Stack doesn't support SACK. So just return.
  3121  		return
  3122  	}
  3123  	if bool(v) && synOpts.SACKPermitted {
  3124  		e.SACKPermitted = true
  3125  		e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)
  3126  	}
  3127  }
  3128  
  3129  // maxOptionSize return the maximum size of TCP options.
  3130  func (e *Endpoint) maxOptionSize() (size int) {
  3131  	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
  3132  	options := e.makeOptions(maxSackBlocks[:])
  3133  	size = len(options)
  3134  	putOptions(options)
  3135  
  3136  	return size
  3137  }
  3138  
// completeStateLocked fills s with a snapshot of the endpoint's state: the
// inner endpoint state, ID, receiver and sender state, send and receive
// buffer state, SACK information, RTT, congestion control (cubic only) and
// RACK state. Nothing is returned; the caller supplies s. This is used before
// invoking the probe.
//
// +checklocks:e.mu
func (e *Endpoint) completeStateLocked(s *stack.TCPEndpointState) {
	s.TCPEndpointStateInner = e.TCPEndpointStateInner
	s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID)
	// SegTime is stamped with the current monotonic time.
	s.SegTime = e.stack.Clock().NowMonotonic()
	s.Receiver = e.rcv.TCPReceiverState
	s.Sender = e.snd.TCPSenderState

	// The send buffer size is read before sndQueueMu is taken and written
	// into the cloned state afterwards.
	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	e.sndQueueInfo.CloneState(&s.SndBufState)
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueMu.Lock()
	s.RcvBufState = e.TCPRcvBufState
	e.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state. Only the first NumBlocks SACK
	// blocks are copied, into a freshly allocated slice.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// The RTT state is copied under its own lock.
	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	// Cubic state is only present when the congestion control in use is
	// cubic; TimeSinceLastCongestion is derived from the copied T field.
	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	s.Sender.RetransmitTS = e.snd.retransmitTS
	s.Sender.SpuriousRecovery = e.snd.spuriousRecovery
}
  3180  
  3181  func (e *Endpoint) initHostGSO() {
  3182  	switch e.route.NetProto() {
  3183  	case header.IPv4ProtocolNumber:
  3184  		e.gso.Type = stack.GSOTCPv4
  3185  		e.gso.L3HdrLen = header.IPv4MinimumSize
  3186  	case header.IPv6ProtocolNumber:
  3187  		e.gso.Type = stack.GSOTCPv6
  3188  		e.gso.L3HdrLen = header.IPv6MinimumSize
  3189  	default:
  3190  		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
  3191  	}
  3192  	e.gso.NeedsCsum = true
  3193  	e.gso.CsumOffset = header.TCPChecksumOffset
  3194  	e.gso.MaxSize = e.route.GSOMaxSize()
  3195  }
  3196  
  3197  func (e *Endpoint) initGSO() {
  3198  	if e.route.HasHostGSOCapability() {
  3199  		e.initHostGSO()
  3200  	} else if e.route.HasGVisorGSOCapability() {
  3201  		e.gso = stack.GSO{
  3202  			MaxSize:   e.route.GSOMaxSize(),
  3203  			Type:      stack.GSOGvisor,
  3204  			NeedsCsum: false,
  3205  		}
  3206  	}
  3207  }
  3208  
  3209  // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
  3210  // state for diagnostics.
  3211  func (e *Endpoint) State() uint32 {
  3212  	return uint32(e.EndpointState())
  3213  }
  3214  
  3215  // Info returns a copy of the endpoint info.
  3216  func (e *Endpoint) Info() tcpip.EndpointInfo {
  3217  	e.LockUser()
  3218  	// Make a copy of the endpoint info.
  3219  	ret := e.TransportEndpointInfo
  3220  	e.UnlockUser()
  3221  	return &ret
  3222  }
  3223  
  3224  // Stats returns a pointer to the endpoint stats.
  3225  func (e *Endpoint) Stats() tcpip.EndpointStats {
  3226  	return &e.stats
  3227  }
  3228  
  3229  // Wait implements stack.TransportEndpoint.Wait.
  3230  func (e *Endpoint) Wait() {
  3231  	waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp)
  3232  	e.waiterQueue.EventRegister(&waitEntry)
  3233  	defer e.waiterQueue.EventUnregister(&waitEntry)
  3234  	switch e.EndpointState() {
  3235  	case StateClose, StateError:
  3236  		return
  3237  	}
  3238  	<-notifyCh
  3239  }
  3240  
  3241  // SocketOptions implements tcpip.Endpoint.SocketOptions.
  3242  func (e *Endpoint) SocketOptions() *tcpip.SocketOptions {
  3243  	return &e.ops
  3244  }
  3245  
  3246  // GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
  3247  func GetTCPSendBufferLimits(sh tcpip.StackHandler) tcpip.SendBufferSizeOption {
  3248  	// This type assertion is safe because only the TCP stack calls this
  3249  	// function.
  3250  	ss := sh.(*stack.Stack).TCPSendBufferLimits()
  3251  	return tcpip.SendBufferSizeOption{
  3252  		Min:     ss.Min,
  3253  		Default: ss.Default,
  3254  		Max:     ss.Max,
  3255  	}
  3256  }
  3257  
  3258  // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
  3259  func (e *Endpoint) allowOutOfWindowAck() bool {
  3260  	now := e.stack.Clock().NowMonotonic()
  3261  
  3262  	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
  3263  		var limit stack.TCPInvalidRateLimitOption
  3264  		if err := e.stack.Option(&limit); err != nil {
  3265  			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
  3266  		}
  3267  		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
  3268  			return false
  3269  		}
  3270  	}
  3271  
  3272  	e.lastOutOfWindowAckTime = now
  3273  	return true
  3274  }
  3275  
  3276  // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP.
  3277  func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
  3278  	var ss tcpip.TCPReceiveBufferSizeRangeOption
  3279  	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
  3280  		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
  3281  	}
  3282  
  3283  	return tcpip.ReceiveBufferSizeOption{
  3284  		Min:     ss.Min,
  3285  		Default: ss.Default,
  3286  		Max:     ss.Max,
  3287  	}
  3288  }
  3289  
  3290  // computeTCPSendBufferSize implements auto tuning of send buffer size and
  3291  // returns the new send buffer size.
  3292  func (e *Endpoint) computeTCPSendBufferSize() int64 {
  3293  	curSndBufSz := int64(e.getSendBufferSize())
  3294  
  3295  	// Auto tuning is disabled when the user explicitly sets the send
  3296  	// buffer size with SO_SNDBUF option.
  3297  	if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 {
  3298  		return curSndBufSz
  3299  	}
  3300  
  3301  	const packetOverheadFactor = 2
  3302  	curMSS := e.snd.MaxPayloadSize
  3303  	numSeg := InitialCwnd
  3304  	if numSeg < e.snd.SndCwnd {
  3305  		numSeg = e.snd.SndCwnd
  3306  	}
  3307  
  3308  	// SndCwnd indicates the number of segments that can be sent. This means
  3309  	// that the sender can send upto #SndCwnd segments and the send buffer
  3310  	// size should be set to SndCwnd*MSS to accommodate sending of all the
  3311  	// segments.
  3312  	newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor)
  3313  	if newSndBufSz < curSndBufSz {
  3314  		return curSndBufSz
  3315  	}
  3316  	if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz {
  3317  		newSndBufSz = int64(ss.Max)
  3318  	}
  3319  
  3320  	return newSndBufSz
  3321  }
  3322  
  3323  // GetAcceptConn implements tcpip.SocketOptionsHandler.
  3324  func (e *Endpoint) GetAcceptConn() bool {
  3325  	return EndpointState(e.State()) == StateListen
  3326  }