gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/endpoint.go

gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"container/heap"
    19  	"fmt"
    20  	"io"
    21  	"math"
    22  	"runtime"
    23  	"strings"
    24  	"time"
    25  
    26  	"gvisor.dev/gvisor/pkg/atomicbitops"
    27  	"gvisor.dev/gvisor/pkg/buffer"
    28  	"gvisor.dev/gvisor/pkg/sleep"
    29  	"gvisor.dev/gvisor/pkg/sync"
    30  	"gvisor.dev/gvisor/pkg/tcpip"
    31  	"gvisor.dev/gvisor/pkg/tcpip/header"
    32  	"gvisor.dev/gvisor/pkg/tcpip/ports"
    33  	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
    34  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    35  	"gvisor.dev/gvisor/pkg/waiter"
    36  )
    37  
// EndpointState represents the state of a TCP endpoint.
type EndpointState tcpip.EndpointState

// Endpoint states. Note that they are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
const (
	// The zero value is skipped so that StateEstablished starts at 1.
	_ EndpointState = iota
	// TCP protocol states in sync with the definitions in
	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing

	// Endpoint states internal to netstack.
	StateInitial
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError
)
    66  
// Receive-buffer sizing constants.
const (
	// rcvAdvWndScale is used to split the available socket buffer into
	// application buffer and the window to be advertised to the peer. This is
	// currently hard coded to split the available space equally.
	rcvAdvWndScale = 1

	// SegOverheadFactor is the factor by which the value provided by the
	// user on a SetSockOpt for the socket send/receive buffer sizes is
	// multiplied.
	SegOverheadFactor = 2
)
    77  
// connDirectionState is a bit set recording which directions of a connection
// have been closed.
type connDirectionState uint32

// Connection direction states used for connectionDirectionState checks in the
// Endpoint struct to detect a half-closed connection and deliver POLLRDHUP.
const (
	// connDirectionStateOpen has no bits set: neither direction is closed.
	connDirectionStateOpen connDirectionState = 0
	// connDirectionStateRcvClosed is the bit for a closed receive direction.
	connDirectionStateRcvClosed connDirectionState = 1
	// connDirectionStateSndClosed is the bit for a closed send direction.
	connDirectionStateSndClosed connDirectionState = 2
	// connDirectionStateAll is the union of all direction bits (value 3).
	connDirectionStateAll connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed
)
    88  
    89  // connected returns true when s is one of the states representing an
    90  // endpoint connected to a peer.
    91  func (s EndpointState) connected() bool {
    92  	switch s {
    93  	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
    94  		return true
    95  	default:
    96  		return false
    97  	}
    98  }
    99  
   100  // connecting returns true when s is one of the states representing a
   101  // connection in progress, but not yet fully established.
   102  func (s EndpointState) connecting() bool {
   103  	switch s {
   104  	case StateConnecting, StateSynSent, StateSynRecv:
   105  		return true
   106  	default:
   107  		return false
   108  	}
   109  }
   110  
   111  // internal returns true when the state is netstack internal.
   112  func (s EndpointState) internal() bool {
   113  	switch s {
   114  	case StateInitial, StateBound, StateConnecting, StateError:
   115  		return true
   116  	default:
   117  		return false
   118  	}
   119  }
   120  
   121  // handshake returns true when s is one of the states representing an endpoint
   122  // in the middle of a TCP handshake.
   123  func (s EndpointState) handshake() bool {
   124  	switch s {
   125  	case StateSynSent, StateSynRecv:
   126  		return true
   127  	default:
   128  		return false
   129  	}
   130  }
   131  
   132  // closed returns true when s is one of the states an endpoint transitions to
   133  // when closed or when it encounters an error. This is distinct from a newly
   134  // initialized endpoint that was never connected.
   135  func (s EndpointState) closed() bool {
   136  	switch s {
   137  	case StateClose, StateError:
   138  		return true
   139  	default:
   140  		return false
   141  	}
   142  }
   143  
   144  // String implements fmt.Stringer.String.
   145  func (s EndpointState) String() string {
   146  	switch s {
   147  	case StateInitial:
   148  		return "INITIAL"
   149  	case StateBound:
   150  		return "BOUND"
   151  	case StateConnecting:
   152  		return "CONNECTING"
   153  	case StateError:
   154  		return "ERROR"
   155  	case StateEstablished:
   156  		return "ESTABLISHED"
   157  	case StateSynSent:
   158  		return "SYN-SENT"
   159  	case StateSynRecv:
   160  		return "SYN-RCVD"
   161  	case StateFinWait1:
   162  		return "FIN-WAIT1"
   163  	case StateFinWait2:
   164  		return "FIN-WAIT2"
   165  	case StateTimeWait:
   166  		return "TIME-WAIT"
   167  	case StateClose:
   168  		return "CLOSED"
   169  	case StateCloseWait:
   170  		return "CLOSE-WAIT"
   171  	case StateLastAck:
   172  		return "LAST-ACK"
   173  	case StateListen:
   174  		return "LISTEN"
   175  	case StateClosing:
   176  		return "CLOSING"
   177  	default:
   178  		panic("unreachable")
   179  	}
   180  }
   181  
// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks stores the SACK blocks tracked for this endpoint. Its fixed
	// length, MaxSACKBlocks, is the maximum number of blocks we track per
	// endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}
   194  
// ReceiveErrors collects segment receive errors within the transport layer.
// It extends the generic tcpip.ReceiveErrors with TCP-specific counters.
//
// +stateify savable
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}
   225  
// SendErrors collects segment send errors within the transport layer.
// It extends the generic tcpip.SendErrors with TCP-specific counters.
//
// +stateify savable
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments that failed to
	// be sent to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs that failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}
   250  
// Stats holds statistics about the endpoint: segment counters plus the error
// counters for the receive, read, send and write paths.
//
// +stateify savable
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}
   279  
// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface. It intentionally does nothing.
func (*Stats) IsEndpointStats() {}
   283  
// sndQueueInfo implements a send queue. It bundles the send buffer state
// (the embedded stack.TCPSndBufState) with the mutex that guards it and the
// waker used to notify the protocol goroutine.
//
// +stateify savable
type sndQueueInfo struct {
	// sndQueueMu is the send queue mutex; it is not saved.
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}
   295  
   296  // CloneState clones sq into other. It is not thread safe
   297  func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) {
   298  	other.SndBufSize = sq.SndBufSize
   299  	other.SndBufUsed = sq.SndBufUsed
   300  	other.SndClosed = sq.SndClosed
   301  	other.PacketTooBigCount = sq.PacketTooBigCount
   302  	other.SndMTU = sq.SndMTU
   303  	other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad())
   304  }
   305  
// Endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint must be held for all operations except
// in e.Readiness where acquiring it will result in a deadlock in epoll
// implementation.
//
// The following mutexes can be acquired independent of e.mu but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> Protects e.acceptQueue.
// e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue
// itself.
// e.sndQueueMu -> Protects the e.sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint.  The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a short while.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type Endpoint struct {
	stack.TCPEndpointStateInner
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue endpoints for processing to a given
	// tcp processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	endpointEntry `state:"nosave"`

	// pendingProcessingMu protects pendingProcessing.
	pendingProcessingMu sync.Mutex `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	// +checklocks:pendingProcessingMu
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	protocol    *protocol     `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is StateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	// rcvQueueMu protects the embedded stack.TCPRcvBufState below.
	rcvQueueMu sync.Mutex `state:"nosave"`

	// +checklocks:rcvQueueMu
	stack.TCPRcvBufState

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	rcvMemUsed atomicbitops.Int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	mu sync.CrossGoroutineMutex `state:"nosave"`
	// ownedByUser is 1 while mu is held via LockUser (i.e. by a syscall
	// goroutine) and 0 otherwise.
	ownedByUser atomicbitops.Uint32

	// rcvQueue is the queue for ready-for-delivery segments.
	//
	// +checklocks:mu
	rcvQueue segmentList `state:"wait"`

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state atomicbitops.Uint32 `state:".(EndpointState)"`

	// connectionDirectionState holds the current state of send and receive;
	// it is accessed atomically.
	connectionDirectionState atomicbitops.Uint32

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to its correct
	// state.
	origEndpointState uint32 `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ipv4TTL           uint8
	ipv6HopLimit      int16
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	// +checklocks:mu
	h *handshake

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (whichever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// recentTSTime is the monotonic time when we last updated
	// TCPEndpointStateInner.RecentTS.
	recentTSTime tcpip.MonotonicTime

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss recovery algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// sndQueueInfo contains the implementation of the endpoint's send queue.
	sndQueueInfo sndQueueInfo

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibly aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// acceptMu protects acceptQueue.
	acceptMu sync.Mutex `state:"nosave"`

	// acceptQueue is used by a listening endpoint to send newly accepted
	// connections to the endpoint so that they can be read by Accept()
	// calls.
	//
	// +checklocks:acceptMu
	acceptQueue acceptQueue

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso stack.GSO

	// stats holds the per-endpoint statistics.
	stats Stats

	// tcpLingerTimeout is the maximum amount of time a socket
	// stays in TIME_WAIT state before being marked
	// closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called close on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime tcpip.MonotonicTime

	// finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer
	// is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close()
	// has been called on the socket. This timer is not started for sockets that
	// are waiting for a peer FIN but are not closed.
	finWait2Timer tcpip.Timer `state:"nosave"`

	// timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state
	// for tcp.DefaultTCPTimeWaitTimeout seconds.
	timeWaitTimer tcpip.Timer `state:"nosave"`

	// listenCtx is used by listening endpoints to store state used while listening for
	// connections. Nil otherwise.
	listenCtx *listenContext `state:"nosave"`

	// limRdr is reused to avoid allocations.
	//
	// +checklocks:mu
	limRdr *io.LimitedReader `state:"nosave"`

	// pmtud is the PMTUD strategy to use.
	//
	// +checklocks:mu
	pmtud tcpip.PMTUDStrategy
}
   609  
// UniqueID implements stack.TransportEndpoint.UniqueID.
//
// It returns the endpoint's uniqueID field, which is documented on the struct
// as being set at creation time and never changed afterwards.
func (e *Endpoint) UniqueID() uint64 {
	return e.uniqueID
}
   614  
   615  // calculateAdvertisedMSS calculates the MSS to advertise.
   616  //
   617  // If userMSS is non-zero and is not greater than the maximum possible MSS for
   618  // r, it will be used; otherwise, the maximum possible MSS will be used.
   619  func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
   620  	// The maximum possible MSS is dependent on the route.
   621  	// TODO(b/143359391): Respect TCP Min and Max size.
   622  	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)
   623  
   624  	if userMSS != 0 && userMSS < maxMSS {
   625  		return userMSS
   626  	}
   627  
   628  	return maxMSS
   629  }
   630  
// isOwnedByUser returns true if the endpoint lock is currently
// held by a user (syscall) goroutine, i.e. if ownedByUser is 1. The flag is
// read with an atomic load.
func (e *Endpoint) isOwnedByUser() bool {
	return e.ownedByUser.Load() == 1
}
   636  
   637  // LockUser tries to lock e.mu and if it fails it will check if the lock is held
   638  // by another syscall goroutine. If yes, then it will goto sleep waiting for the
   639  // lock to be released, if not then it will spin till it acquires the lock or
   640  // another syscall goroutine acquires it in which case it will goto sleep as
   641  // described above.
   642  //
   643  // The assumption behind spinning here being that background packet processing
   644  // should not be holding the lock for long and spinning reduces latency as we
   645  // avoid an expensive sleep/wakeup of the syscall goroutine).
   646  // +checklocksacquire:e.mu
   647  func (e *Endpoint) LockUser() {
   648  	const iterations = 5
   649  	for i := 0; i < iterations; i++ {
   650  		// Try first if the sock is locked then check if it's owned
   651  		// by another user goroutine if not then we spin, otherwise
   652  		// we just go to sleep on the Lock() and wait.
   653  		if !e.TryLock() {
   654  			// If socket is owned by the user then just go to sleep
   655  			// as the lock could be held for a reasonably long time.
   656  			if e.ownedByUser.Load() == 1 {
   657  				e.mu.Lock()
   658  				e.ownedByUser.Store(1)
   659  				return
   660  			}
   661  			// Spin but don't yield the processor since the lower half
   662  			// should yield the lock soon.
   663  			continue
   664  		}
   665  		e.ownedByUser.Store(1)
   666  		return
   667  	}
   668  
   669  	for i := 0; i < iterations; i++ {
   670  		// Try first if the sock is locked then check if it's owned
   671  		// by another user goroutine if not then we spin, otherwise
   672  		// we just go to sleep on the Lock() and wait.
   673  		if !e.TryLock() {
   674  			// If socket is owned by the user then just go to sleep
   675  			// as the lock could be held for a reasonably long time.
   676  			if e.ownedByUser.Load() == 1 {
   677  				e.mu.Lock()
   678  				e.ownedByUser.Store(1)
   679  				return
   680  			}
   681  			// Spin but yield the processor since the lower half
   682  			// should yield the lock soon.
   683  			runtime.Gosched()
   684  			continue
   685  		}
   686  		e.ownedByUser.Store(1)
   687  		return
   688  	}
   689  
   690  	// Finally just give up and wait for the Lock.
   691  	e.mu.Lock()
   692  	e.ownedByUser.Store(1)
   693  }
   694  
// UnlockUser releases e.mu on behalf of a user (syscall) goroutine. If
// segments were queued for processing while the lock was held, it also hands
// the endpoint to a processor goroutine so that they get processed.
// This is required because when packets arrive while the endpoint lock is
// already held, such packets are queued up to be processed later.
//
// It panics if ownedByUser is not 1, i.e. if e.LockUser() was not called
// first.
//
// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
// +checklocksrelease:e.mu
func (e *Endpoint) UnlockUser() {
	// Lock segment queue before checking so that we avoid a race where
	// segments can be queued between the time we check if queue is empty
	// and actually unlock the endpoint mutex.
	e.segmentQueue.mu.Lock()
	if e.segmentQueue.emptyLocked() {
		// Nothing is pending: clear the ownership flag and release e.mu
		// while still holding the segment queue lock.
		if e.ownedByUser.Swap(0) != 1 {
			panic("e.UnlockUser() called without calling e.LockUser()")
		}
		e.mu.Unlock()
		e.segmentQueue.mu.Unlock()
		return
	}
	e.segmentQueue.mu.Unlock()

	// Since we are waking the processor goroutine here just unlock
	// and let it process the queued segments.
	if e.ownedByUser.Swap(0) != 1 {
		panic("e.UnlockUser() called without calling e.LockUser()")
	}
	// Pick the processor before releasing e.mu; it is only woken afterwards.
	processor := e.protocol.dispatcher.selectProcessor(e.ID)
	e.mu.Unlock()

	// Wake up the processor for this endpoint to process any queued
	// segments after releasing the lock to avoid the case where if the
	// processor goroutine starts running before we release the lock here
	// then it will fail to process as TryLock() will fail.
	processor.queueEndpoint(e)
	return
}
   732  
// StopWork halts packet processing by acquiring e.mu directly, without
// marking the endpoint as owned by a user goroutine. Only to be used in
// tests.
// +checklocksacquire:e.mu
func (e *Endpoint) StopWork() {
	e.mu.Lock()
}
   738  
// ResumeWork resumes packet processing by releasing e.mu. Unlike UnlockUser,
// it does not check the segment queue or wake a processor. Only to be used
// in tests.
// +checklocksrelease:e.mu
func (e *Endpoint) ResumeWork() {
	e.mu.Unlock()
}
   744  
// AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is
// used in places where we know that e.mu is held, but checklocks does not,
// which can happen when creating new locked objects. You must pass the known
// locked endpoint to this function and it must be the same as the caller
// endpoint.
//
// No lock is actually taken at runtime; the only runtime effect is a panic
// when e and locked are different endpoints.
// TODO(b/226403629): Remove this function once checklocks understands local
// variable locks.
// +checklocks:locked.mu
// +checklocksacquire:e.mu
func (e *Endpoint) AssertLockHeld(locked *Endpoint) {
	if e != locked {
		panic("AssertLockHeld failed: locked endpoint != asserting endpoint")
	}
}
   759  
// TryLock is a helper that calls TryLock on the endpoint's mutex and
// adds the necessary checklocks annotations. It returns true if e.mu was
// acquired and false otherwise; it never blocks on e.mu itself.
// TODO(b/226403629): Remove this once checklocks understands TryLock.
// +checklocksacquire:e.mu
func (e *Endpoint) TryLock() bool {
	if e.mu.TryLock() {
		return true // +checklocksforce
	}
	return false // +checklocksignore
}
   770  
   771  // setEndpointState updates the state of the endpoint to state atomically. This
   772  // method is unexported as the only place we should update the state is in this
   773  // package but we allow the state to be read freely without holding e.mu.
   774  //
   775  // +checklocks:e.mu
   776  func (e *Endpoint) setEndpointState(state EndpointState) {
   777  	oldstate := EndpointState(e.state.Swap(uint32(state)))
   778  	switch state {
   779  	case StateEstablished:
   780  		e.stack.Stats().TCP.CurrentEstablished.Increment()
   781  		e.stack.Stats().TCP.CurrentConnected.Increment()
   782  	case StateError:
   783  		fallthrough
   784  	case StateClose:
   785  		if oldstate == StateCloseWait || oldstate == StateEstablished {
   786  			e.stack.Stats().TCP.EstablishedResets.Increment()
   787  		}
   788  		if oldstate.connected() {
   789  			e.stack.Stats().TCP.CurrentConnected.Decrement()
   790  		}
   791  		fallthrough
   792  	default:
   793  		if oldstate == StateEstablished {
   794  			e.stack.Stats().TCP.CurrentEstablished.Decrement()
   795  		}
   796  	}
   797  }
   798  
// EndpointState returns the current state of the endpoint. The value is read
// with an atomic load of e.state; no mutex is taken here.
func (e *Endpoint) EndpointState() EndpointState {
	return EndpointState(e.state.Load())
}
   803  
// setRecentTimestamp sets the RecentTS field to the provided value and records
// the stack clock's current monotonic time in recentTSTime.
func (e *Endpoint) setRecentTimestamp(recentTS uint32) {
	e.RecentTS = recentTS
	e.recentTSTime = e.stack.Clock().NowMonotonic()
}
   809  
// recentTimestamp returns the value of the RecentTS field.
func (e *Endpoint) recentTimestamp() uint32 {
	return e.RecentTS
}
   814  
   815  // TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a
   816  // network.Endpoint, which also defines this function.
   817  func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 {
   818  	switch netProto := route.NetProto(); netProto {
   819  	case header.IPv4ProtocolNumber:
   820  		if ipv4TTL == tcpip.UseDefaultIPv4TTL {
   821  			return route.DefaultTTL()
   822  		}
   823  		return ipv4TTL
   824  	case header.IPv6ProtocolNumber:
   825  		if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit {
   826  			return route.DefaultTTL()
   827  		}
   828  		return uint8(ipv6HopLimit)
   829  	default:
   830  		panic(fmt.Sprintf("invalid protocol number = %d", netProto))
   831  	}
   832  }
   833  
// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// The embedded mutex, the timer and the waker are tagged `state:"nosave"`.
// idle, interval and count are seeded by newEndpoint from
// DefaultKeepaliveIdle, DefaultKeepaliveInterval and DefaultKeepaliveCount
// respectively. unacked presumably tracks keepalive probes that have not been
// acknowledged yet -- TODO confirm against the keepalive timer handler.
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	idle       time.Duration
	interval   time.Duration
	count      int
	unacked    int
	// should never be a zero timer if the endpoint is not closed.
	timer timer       `state:"nosave"`
	waker sleep.Waker `state:"nosave"`
}
   848  
// newEndpoint allocates an Endpoint in StateInitial for the given stack,
// protocol and network protocol number, notifying through waiterQueue.
//
// The endpoint is first populated with package-level defaults (keepalive
// parameters, buffer sizes, SYN retries, TTL/hop-limit sentinels). Each
// stack-level TCP option that can be read successfully then overrides the
// corresponding default. Finally the segment queue is bound to the endpoint
// and the keepalive timer is initialised.
func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *Endpoint {
	e := &Endpoint{
		stack:    s,
		protocol: protocol,
		TransportEndpointInfo: stack.TransportEndpointInfo{
			NetProto:   netProto,
			TransProto: header.TCPProtocolNumber,
		},
		sndQueueInfo: sndQueueInfo{
			TCPSndBufState: stack.TCPSndBufState{
				SndMTU: math.MaxInt32,
			},
		},
		waiterQueue: waiterQueue,
		state:       atomicbitops.FromUint32(uint32(StateInitial)),
		keepalive: keepalive{
			idle:     DefaultKeepaliveIdle,
			interval: DefaultKeepaliveInterval,
			count:    DefaultKeepaliveCount,
		},
		uniqueID:     s.UniqueID(),
		ipv4TTL:      tcpip.UseDefaultIPv4TTL,
		ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit,
		// txHash only determines which outgoing queue to use, so
		// InsecureRNG is fine.
		txHash:        s.InsecureRNG().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
		limRdr:        &io.LimitedReader{},
	}
	// Socket-option defaults; the buffer sizes may be overridden below by the
	// stack-wide configuration.
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)

	// From here on, every stack-level option is applied only if it could be
	// read without error; otherwise the default set above is kept.
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		e.RcvAutoParams.Disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	// Attach the stack's TCP probe only when one is configured.
	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e

	// TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes
	// established.
	e.keepalive.timer.init(e.stack.Clock(), timerHandler(e, e.keepaliveTimerExpired))

	return e
}
   932  
// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
//
// mask selects which events the caller is interested in. The result is built
// from the endpoint state: unconnected initial/bound sockets report EventHUp,
// closed/error/time-wait sockets report everything in mask, listening sockets
// report readability when the accept queue is non-empty, and connected
// sockets report writability/readability based on the send and receive
// queues. EventRdHUp is added when the connection direction state is
// connDirectionStateRcvClosed.
//
// e.mu is not taken here; the state is sampled through EndpointState and the
// queues are inspected under their own mutexes.
func (e *Endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	result := waiter.EventMask(0)

	switch e.EndpointState() {
	case StateInitial, StateBound:
		// This prevents blocking of new sockets which are not
		// connected when SO_LINGER is set.
		result |= waiter.EventHUp

	case StateConnecting, StateSynSent, StateSynRecv:
		// Ready for nothing.

	case StateClose, StateError, StateTimeWait:
		// Ready for anything.
		result = mask

	case StateListen:
		// Check if there's anything in the accepted queue.
		if (mask & waiter.ReadableEvents) != 0 {
			e.acceptMu.Lock()
			if e.acceptQueue.endpoints.Len() != 0 {
				result |= waiter.ReadableEvents
			}
			e.acceptMu.Unlock()
		}
	}
	// The state is sampled a second time here, independently of the switch
	// above.
	if e.EndpointState().connected() {
		// Determine if the endpoint is writable if requested.
		if (mask & waiter.WritableEvents) != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			sndBufSize := e.getSendBufferSize()
			// A send-closed endpoint is reported writable as well as one
			// with free send buffer space.
			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
				result |= waiter.WritableEvents
			}
			if e.sndQueueInfo.SndClosed {
				e.updateConnDirectionState(connDirectionStateSndClosed)
			}
			e.sndQueueInfo.sndQueueMu.Unlock()
		}

		// Determine if the endpoint is readable if requested.
		if (mask & waiter.ReadableEvents) != 0 {
			e.rcvQueueMu.Lock()
			// A receive-closed endpoint is reported readable as well as
			// one with buffered data.
			if e.RcvBufUsed > 0 || e.RcvClosed {
				result |= waiter.ReadableEvents
			}
			if e.RcvClosed {
				e.updateConnDirectionState(connDirectionStateRcvClosed)
			}
			e.rcvQueueMu.Unlock()
		}
	}

	// Determine whether endpoint is half-closed with rcv shutdown
	if e.connDirectionState() == connDirectionStateRcvClosed {
		result |= waiter.EventRdHUp
	}

	return result
}
   995  
   996  // Purging pending rcv segments is only necessary on RST.
   997  func (e *Endpoint) purgePendingRcvQueue() {
   998  	if e.rcv != nil {
   999  		for e.rcv.pendingRcvdSegments.Len() > 0 {
  1000  			s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment)
  1001  			s.DecRef()
  1002  		}
  1003  	}
  1004  }
  1005  
  1006  // +checklocks:e.mu
  1007  func (e *Endpoint) purgeReadQueue() {
  1008  	if e.rcv != nil {
  1009  		e.rcvQueueMu.Lock()
  1010  		defer e.rcvQueueMu.Unlock()
  1011  		for {
  1012  			s := e.rcvQueue.Front()
  1013  			if s == nil {
  1014  				break
  1015  			}
  1016  			e.rcvQueue.Remove(s)
  1017  			s.DecRef()
  1018  		}
  1019  		e.RcvBufUsed = 0
  1020  	}
  1021  }
  1022  
  1023  // +checklocks:e.mu
  1024  func (e *Endpoint) purgeWriteQueue() {
  1025  	if e.snd != nil {
  1026  		e.sndQueueInfo.sndQueueMu.Lock()
  1027  		defer e.sndQueueInfo.sndQueueMu.Unlock()
  1028  		e.snd.updateWriteNext(nil)
  1029  		for {
  1030  			s := e.snd.writeList.Front()
  1031  			if s == nil {
  1032  				break
  1033  			}
  1034  			e.snd.writeList.Remove(s)
  1035  			s.DecRef()
  1036  		}
  1037  		e.sndQueueInfo.SndBufUsed = 0
  1038  		e.sndQueueInfo.SndClosed = true
  1039  	}
  1040  }
  1041  
  1042  // Abort implements stack.TransportEndpoint.Abort.
  1043  func (e *Endpoint) Abort() {
  1044  	defer e.drainClosingSegmentQueue()
  1045  	e.LockUser()
  1046  	defer e.UnlockUser()
  1047  	defer e.purgeReadQueue()
  1048  	// Reset all connected endpoints.
  1049  	switch state := e.EndpointState(); {
  1050  	case state.connected():
  1051  		e.resetConnectionLocked(&tcpip.ErrAborted{})
  1052  		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  1053  		return
  1054  	}
  1055  	e.closeLocked()
  1056  }
  1057  
  1058  // Close puts the endpoint in a closed state and frees all resources associated
  1059  // with it. It must be called only once and with no other concurrent calls to
  1060  // the endpoint.
  1061  func (e *Endpoint) Close() {
  1062  	e.LockUser()
  1063  	if e.closed {
  1064  		e.UnlockUser()
  1065  		return
  1066  	}
  1067  
  1068  	// We always want to purge the read queue, but do so after the checks in
  1069  	// shutdownLocked.
  1070  	e.closeLocked()
  1071  	e.purgeReadQueue()
  1072  	if e.EndpointState() == StateClose || e.EndpointState() == StateError {
  1073  		// It should be safe to purge the read queue now as the endpoint
  1074  		// is now closed or in an error state and further reads are not
  1075  		// permitted.
  1076  		e.UnlockUser()
  1077  		e.drainClosingSegmentQueue()
  1078  		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  1079  		return
  1080  	}
  1081  	e.UnlockUser()
  1082  }
  1083  
  1084  // +checklocks:e.mu
  1085  func (e *Endpoint) closeLocked() {
  1086  	linger := e.SocketOptions().GetLinger()
  1087  	if linger.Enabled && linger.Timeout == 0 {
  1088  		s := e.EndpointState()
  1089  		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
  1090  		if isResetState {
  1091  			// Close the endpoint without doing full shutdown and
  1092  			// send a RST.
  1093  			e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
  1094  			return
  1095  		}
  1096  	}
  1097  
  1098  	// Issue a shutdown so that the peer knows we won't send any more data
  1099  	// if we're connected, or stop accepting if we're listening.
  1100  	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
  1101  	e.closeNoShutdownLocked()
  1102  }
  1103  
  1104  // closeNoShutdown closes the endpoint without doing a full shutdown.
  1105  // +checklocks:e.mu
  1106  func (e *Endpoint) closeNoShutdownLocked() {
  1107  	// For listening sockets, we always release ports inline so that they
  1108  	// are immediately available for reuse after Close() is called. If also
  1109  	// registered, we unregister as well otherwise the next user would fail
  1110  	// in Listen() when trying to register.
  1111  	if e.EndpointState() == StateListen && e.isPortReserved {
  1112  		if e.isRegistered {
  1113  			e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
  1114  			e.isRegistered = false
  1115  		}
  1116  
  1117  		portRes := ports.Reservation{
  1118  			Networks:     e.effectiveNetProtos,
  1119  			Transport:    ProtocolNumber,
  1120  			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  1121  			Port:         e.TransportEndpointInfo.ID.LocalPort,
  1122  			Flags:        e.boundPortFlags,
  1123  			BindToDevice: e.boundBindToDevice,
  1124  			Dest:         e.boundDest,
  1125  		}
  1126  		e.stack.ReleasePort(portRes)
  1127  		e.isPortReserved = false
  1128  		e.boundBindToDevice = 0
  1129  		e.boundPortFlags = ports.Flags{}
  1130  		e.boundDest = tcpip.FullAddress{}
  1131  	}
  1132  
  1133  	// Mark endpoint as closed.
  1134  	e.closed = true
  1135  	tcpip.AddDanglingEndpoint(e)
  1136  
  1137  	eventMask := waiter.ReadableEvents | waiter.WritableEvents
  1138  
  1139  	switch e.EndpointState() {
  1140  	case StateInitial, StateBound, StateListen:
  1141  		e.setEndpointState(StateClose)
  1142  		fallthrough
  1143  	case StateClose, StateError:
  1144  		eventMask |= waiter.EventHUp
  1145  		e.cleanupLocked()
  1146  	case StateConnecting, StateSynSent, StateSynRecv:
  1147  		// Abort the handshake and set the error.
  1148  		// Notify that the endpoint is closed.
  1149  		eventMask |= waiter.EventHUp
  1150  		e.handshakeFailed(&tcpip.ErrAborted{})
  1151  		// Notify that the endpoint is closed.
  1152  		eventMask |= waiter.EventHUp
  1153  	case StateFinWait2:
  1154  		// The socket has been closed and we are in FIN-WAIT-2 so start
  1155  		// the FIN-WAIT-2 timer.
  1156  		if e.finWait2Timer == nil {
  1157  			e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired)
  1158  		}
  1159  	}
  1160  
  1161  	e.waiterQueue.Notify(eventMask)
  1162  }
  1163  
  1164  // closePendingAcceptableConnections closes all connections that have completed
  1165  // handshake but not yet been delivered to the application.
  1166  func (e *Endpoint) closePendingAcceptableConnectionsLocked() {
  1167  	e.acceptMu.Lock()
  1168  
  1169  	pendingEndpoints := e.acceptQueue.pendingEndpoints
  1170  	e.acceptQueue.pendingEndpoints = nil
  1171  
  1172  	completedEndpoints := make([]*Endpoint, 0, e.acceptQueue.endpoints.Len())
  1173  	for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() {
  1174  		completedEndpoints = append(completedEndpoints, n.Value.(*Endpoint))
  1175  	}
  1176  	e.acceptQueue.endpoints.Init()
  1177  	e.acceptQueue.capacity = 0
  1178  	e.acceptMu.Unlock()
  1179  
  1180  	// Close any endpoints in SYN-RCVD state.
  1181  	for n := range pendingEndpoints {
  1182  		n.Abort()
  1183  	}
  1184  
  1185  	// Reset all connections that are waiting to be accepted.
  1186  	for _, n := range completedEndpoints {
  1187  		n.Abort()
  1188  	}
  1189  }
  1190  
// cleanupLocked frees all resources associated with the endpoint.
//
// In order, it cleans up the sender and endpoint timers, aborts connections
// still held by the accept queue, unregisters the endpoint from the stack,
// releases its reserved port, releases the route, purges the write queue (and
// the read queue too if the user has closed the socket), and finally reports
// cleanup completion to the stack and removes the endpoint from the dangling
// set.
// +checklocks:e.mu
func (e *Endpoint) cleanupLocked() {
	// Sender timers only exist once a sender has been created.
	if e.snd != nil {
		e.snd.resendTimer.cleanup()
		e.snd.probeTimer.cleanup()
		e.snd.reorderTimer.cleanup()
		e.snd.corkTimer.cleanup()
	}

	if e.finWait2Timer != nil {
		e.finWait2Timer.Stop()
	}

	if e.timeWaitTimer != nil {
		e.timeWaitTimer.Stop()
	}

	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	// The bound-* fields are reset whether or not a port was reserved.
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.purgeWriteQueue()
	// Only purge the read queue here if the socket is fully closed by the
	// user.
	if e.closed {
		e.purgeReadQueue()
	}
	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}
  1250  
// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space. The result is space shifted right by rcvAdvWndScale.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}
  1256  
  1257  // initialReceiveWindow returns the initial receive window to advertise in the
  1258  // SYN/SYN-ACK.
  1259  func (e *Endpoint) initialReceiveWindow() int {
  1260  	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
  1261  	if rcvWnd > math.MaxUint16 {
  1262  		rcvWnd = math.MaxUint16
  1263  	}
  1264  
  1265  	// Use the user supplied MSS, if available.
  1266  	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
  1267  	if rcvWnd > routeWnd {
  1268  		rcvWnd = routeWnd
  1269  	}
  1270  	rcvWndScale := e.rcvWndScaleForHandshake()
  1271  
  1272  	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
  1273  	// window offered in SYN won't be reduced due to the loss of precision if
  1274  	// window scaling is enabled after the handshake.
  1275  	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
  1276  
  1277  	// Ensure we can always accept at least 1 byte if the scale specified
  1278  	// was too high for the provided rcvWnd.
  1279  	if rcvWnd == 0 {
  1280  		rcvWnd = 1
  1281  	}
  1282  
  1283  	return rcvWnd
  1284  }
  1285  
// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
//
// copied is the number of bytes just copied out. The function returns early
// when auto-tuning is disabled, or when no RTT estimate exists yet or less
// than one RTT has elapsed since the last measurement (in which case copied
// is only accumulated). Otherwise it may grow -- never shrink -- the receive
// buffer and, if the window crossed the ACK threshold upwards, sends a window
// update while the endpoint is connected.
func (e *Endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	sendNonZeroWindowUpdate := false

	e.rcvQueueMu.Lock()
	if e.RcvAutoParams.Disabled {
		e.rcvQueueMu.Unlock()
		return
	}
	now := e.stack.Clock().NowMonotonic()
	if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt {
		e.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		// NOTE(review): this divides by prevCopied; it assumes
		// PrevCopiedBytes is non-zero whenever RTT is non-zero -- confirm
		// against where RcvAutoParams is initialised.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				sendNonZeroWindowUpdate = true
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	// Start a fresh measurement interval.
	e.RcvAutoParams.MeasureTime = now
	e.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueMu.Unlock()

	// Send the update after unlocking rcvQueueMu as sending a segment acquires
	// the lock to calculate the window to be sent.
	if e.EndpointState().connected() && sendNonZeroWindowUpdate {
		e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
	}
}
  1364  
// SetOwner implements tcpip.Endpoint.SetOwner. It stores owner on the endpoint
// without taking any lock.
func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}
  1369  
  1370  // +checklocks:e.mu
  1371  func (e *Endpoint) hardErrorLocked() tcpip.Error {
  1372  	err := e.hardError
  1373  	e.hardError = nil
  1374  	return err
  1375  }
  1376  
  1377  // +checklocks:e.mu
  1378  func (e *Endpoint) lastErrorLocked() tcpip.Error {
  1379  	e.lastErrorMu.Lock()
  1380  	defer e.lastErrorMu.Unlock()
  1381  	err := e.lastError
  1382  	e.lastError = nil
  1383  	return err
  1384  }
  1385  
  1386  // LastError implements tcpip.Endpoint.LastError.
  1387  func (e *Endpoint) LastError() tcpip.Error {
  1388  	e.LockUser()
  1389  	defer e.UnlockUser()
  1390  	if err := e.hardErrorLocked(); err != nil {
  1391  		return err
  1392  	}
  1393  	return e.lastErrorLocked()
  1394  }
  1395  
// LastErrorLocked reads and clears lastError.
// Only to be used in tests.
//
// Unlike LastError it does not look at the hard error and does not take e.mu
// itself; the caller must already hold it.
// +checklocks:e.mu
func (e *Endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}
  1402  
  1403  // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
  1404  func (e *Endpoint) UpdateLastError(err tcpip.Error) {
  1405  	e.LockUser()
  1406  	e.lastErrorMu.Lock()
  1407  	e.lastError = err
  1408  	e.lastErrorMu.Unlock()
  1409  	e.UnlockUser()
  1410  }
  1411  
// Read implements tcpip.Endpoint.Read.
//
// Data is copied from the receive queue into dst, one segment at a time. With
// opts.Peek set the segments are left on the queue; otherwise fully consumed
// segments are removed, RcvBufUsed is reduced and, if the freed memory made
// the window cross the ACK threshold upwards, a window update is sent while
// the endpoint is connected.
//
// The returned ReadResult reports the number of bytes copied in both Count
// and Total. An error from copying into dst is reported (as ErrBadBuffer)
// only when nothing at all was read; errors from checkReadLocked are returned
// as-is.
func (e *Endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if err := e.checkReadLocked(); err != nil {
		if _, ok := err.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, err
	}

	var err error
	done := 0
	// N.B. Here we get the first segment to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold e.mu to ensure we only
	// remove segments from the list through Read() and that new segments
	// cannot be appended.
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// Peeking leaves the queue untouched; just walk to the next
			// segment.
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Drop every leading segment whose payload has been fully
			// consumed.
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}
  1487  
  1488  // checkRead checks that endpoint is in a readable state.
  1489  //
  1490  // +checklocks:e.mu
  1491  func (e *Endpoint) checkReadLocked() tcpip.Error {
  1492  	e.rcvQueueMu.Lock()
  1493  	defer e.rcvQueueMu.Unlock()
  1494  	// When in SYN-SENT state, let the caller block on the receive.
  1495  	// An application can initiate a non-blocking connect and then block
  1496  	// on a receive. It can expect to read any data after the handshake
  1497  	// is complete. RFC793, section 3.9, p58.
  1498  	if e.EndpointState() == StateSynSent {
  1499  		return &tcpip.ErrWouldBlock{}
  1500  	}
  1501  
  1502  	// The endpoint can be read if it's connected, or if it's already closed
  1503  	// but has some pending unread data. Also note that a RST being received
  1504  	// would cause the state to become StateError so we should allow the
  1505  	// reads to proceed before returning a ECONNRESET.
  1506  	bufUsed := e.RcvBufUsed
  1507  	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
  1508  		if s == StateError {
  1509  			if err := e.hardErrorLocked(); err != nil {
  1510  				return err
  1511  			}
  1512  			return &tcpip.ErrClosedForReceive{}
  1513  		}
  1514  		e.stats.ReadErrors.NotConnected.Increment()
  1515  		return &tcpip.ErrNotConnected{}
  1516  	}
  1517  
  1518  	if e.RcvBufUsed == 0 {
  1519  		if e.RcvClosed || !e.EndpointState().connected() {
  1520  			return &tcpip.ErrClosedForReceive{}
  1521  		}
  1522  		return &tcpip.ErrWouldBlock{}
  1523  	}
  1524  
  1525  	return nil
  1526  }
  1527  
  1528  // isEndpointWritableLocked checks if a given endpoint is writable
  1529  // and also returns the number of bytes that can be written at this
  1530  // moment. If the endpoint is not writable then it returns an error
  1531  // indicating the reason why it's not writable.
  1532  // +checklocks:e.mu
  1533  // +checklocks:e.sndQueueInfo.sndQueueMu
  1534  func (e *Endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
  1535  	// The endpoint cannot be written to if it's not connected.
  1536  	switch s := e.EndpointState(); {
  1537  	case s == StateError:
  1538  		if err := e.hardErrorLocked(); err != nil {
  1539  			return 0, err
  1540  		}
  1541  		return 0, &tcpip.ErrClosedForSend{}
  1542  	case !s.connecting() && !s.connected():
  1543  		return 0, &tcpip.ErrClosedForSend{}
  1544  	case s.connecting():
  1545  		// As per RFC793, page 56, a send request arriving when in connecting
  1546  		// state, can be queued to be completed after the state becomes
  1547  		// connected. Return an error code for the caller of endpoint Write to
  1548  		// try again, until the connection handshake is complete.
  1549  		return 0, &tcpip.ErrWouldBlock{}
  1550  	}
  1551  
  1552  	// Check if the connection has already been closed for sends.
  1553  	if e.sndQueueInfo.SndClosed {
  1554  		return 0, &tcpip.ErrClosedForSend{}
  1555  	}
  1556  
  1557  	sndBufSize := e.getSendBufferSize()
  1558  	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
  1559  	if avail <= 0 {
  1560  		return 0, &tcpip.ErrWouldBlock{}
  1561  	}
  1562  	return avail, nil
  1563  }
  1564  
// readFromPayloader reads a slice from the Payloader.
//
// At most avail bytes (or p.Len() bytes, if that is smaller) are copied from p
// into a new buffer. A zero-length request returns an empty buffer and no
// error; a copy failure releases the partial buffer and returns ErrBadBuffer.
//
// Unless opts.Atomic is set, sndQueueMu and e.mu are both released for the
// duration of the copy and re-acquired (e.mu first, then sndQueueMu) before
// returning, so endpoint state may have changed by the time this returns.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	limRdr := e.limRdr
	if !opts.Atomic {
		// e.limRdr is cleared while the locks are dropped and restored once
		// they are held again (deferred calls run in reverse order).
		// NOTE(review): presumably so that a concurrent writer does not
		// share the same reader -- confirm.
		defer func() {
			e.limRdr = limRdr
		}()
		e.limRdr = nil

		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReaderAndLimitedReader(p, int64(avail), limRdr); err != nil {
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}
  1602  
// queueSegment reads data from the payloader and returns a segment to be sent.
//
// It returns the newly queued segment and the number of payload bytes it
// carries. A nil segment with a zero count and nil error means there was
// nothing to queue. When the endpoint is not writable, the WriteClosed
// statistic is incremented and the corresponding error is returned.
// +checklocks:e.mu
func (e *Endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	// For non-atomic writes this call drops and re-acquires both e.mu and
	// sndQueueMu while copying.
	buf, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}

	// Do not queue zero length segments.
	if buf.Size() == 0 {
		return nil, 0, nil
	}

	if !opts.Atomic {
		// Since we released locks in between it's possible that the
		// endpoint transitioned to a CLOSED/ERROR states so make
		// sure endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			buf.Release()
			return nil, 0, err
		}

		// A simultaneous call to write on the socket can reduce avail. Discard
		// excess data copied if this is the case.
		if int64(avail) < buf.Size() {
			buf.Truncate(int64(avail))
		}
	}

	// Add data to the send queue.
	size := int(buf.Size())
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf)
	e.sndQueueInfo.SndBufUsed += size
	e.snd.writeList.PushBack(s)

	return s, size, nil
}
  1651  
  1652  // Write writes data to the endpoint's peer.
  1653  func (e *Endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
  1654  	// Linux completely ignores any address passed to sendto(2) for TCP sockets
  1655  	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
  1656  	// and opts.EndOfRecord are also ignored.
  1657  
  1658  	e.LockUser()
  1659  	defer e.UnlockUser()
  1660  
  1661  	// Return if either we didn't queue anything or if an error occurred while
  1662  	// attempting to queue data.
  1663  	nextSeg, n, err := e.queueSegment(p, opts)
  1664  	if n == 0 || err != nil {
  1665  		return 0, err
  1666  	}
  1667  
  1668  	e.sendData(nextSeg)
  1669  	return int64(n), nil
  1670  }
  1671  
  1672  // selectWindowLocked returns the new window without checking for shrinking or scaling
  1673  // applied.
  1674  // +checklocks:e.mu
  1675  // +checklocks:e.rcvQueueMu
  1676  func (e *Endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
  1677  	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
  1678  	maxWindow := wndFromSpace(rcvBufSize)
  1679  	wndFromUsedBytes := maxWindow - e.RcvBufUsed
  1680  
  1681  	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
  1682  	// cases where we receive a lot of small segments the segment overhead is a
  1683  	// lot higher and we can run out socket buffer space before we can fill the
  1684  	// previous window we advertised. In cases where we receive MSS sized or close
  1685  	// MSS sized segments we will probably run out of window space before we
  1686  	// exhaust receive buffer.
  1687  	newWnd := wndFromAvailable
  1688  	if newWnd > wndFromUsedBytes {
  1689  		newWnd = wndFromUsedBytes
  1690  	}
  1691  	if newWnd < 0 {
  1692  		newWnd = 0
  1693  	}
  1694  	return seqnum.Size(newWnd)
  1695  }
  1696  
  1697  // selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
  1698  // +checklocks:e.mu
  1699  func (e *Endpoint) selectWindow() (wnd seqnum.Size) {
  1700  	e.rcvQueueMu.Lock()
  1701  	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
  1702  	e.rcvQueueMu.Unlock()
  1703  	return wnd
  1704  }
  1705  
  1706  // windowCrossedACKThresholdLocked checks if the receive window to be announced
  1707  // would be under aMSS or under the window derived from half receive buffer,
  1708  // whichever smaller. This is useful as a receive side silly window syndrome
  1709  // prevention mechanism. If window grows to reasonable value, we should send ACK
  1710  // to the sender to inform the rx space is now large. We also want ensure a
  1711  // series of small read()'s won't trigger a flood of spurious tiny ACK's.
  1712  //
  1713  // For large receive buffers, the threshold is aMSS - once reader reads more
  1714  // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
  1715  // receive buffer size. This is chosen arbitrarily.
  1716  // crossed will be true if the window size crossed the ACK threshold.
  1717  // above will be true if the new window is >= ACK threshold and false
  1718  // otherwise.
  1719  //
  1720  // +checklocks:e.mu
  1721  // +checklocks:e.rcvQueueMu
  1722  func (e *Endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
  1723  	newAvail := int(e.selectWindowLocked(rcvBufSize))
  1724  	oldAvail := newAvail - deltaBefore
  1725  	if oldAvail < 0 {
  1726  		oldAvail = 0
  1727  	}
  1728  	threshold := int(e.amss)
  1729  	// rcvBufFraction is the inverse of the fraction of receive buffer size that
  1730  	// is used to decide if the available buffer space is now above it.
  1731  	const rcvBufFraction = 2
  1732  	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
  1733  		threshold = wndThreshold
  1734  	}
  1735  
  1736  	switch {
  1737  	case oldAvail < threshold && newAvail >= threshold:
  1738  		return true, true
  1739  	case oldAvail >= threshold && newAvail < threshold:
  1740  		return true, false
  1741  	}
  1742  	return false, false
  1743  }
  1744  
  1745  // OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
  1746  func (e *Endpoint) OnReuseAddressSet(v bool) {
  1747  	e.LockUser()
  1748  	e.portFlags.TupleOnly = v
  1749  	e.UnlockUser()
  1750  }
  1751  
  1752  // OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
  1753  func (e *Endpoint) OnReusePortSet(v bool) {
  1754  	e.LockUser()
  1755  	e.portFlags.LoadBalanced = v
  1756  	e.UnlockUser()
  1757  }
  1758  
  1759  // OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
  1760  func (e *Endpoint) OnKeepAliveSet(bool) {
  1761  	e.LockUser()
  1762  	e.resetKeepaliveTimer(true /* receivedData */)
  1763  	e.UnlockUser()
  1764  }
  1765  
  1766  // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
  1767  func (e *Endpoint) OnDelayOptionSet(v bool) {
  1768  	if !v {
  1769  		e.LockUser()
  1770  		defer e.UnlockUser()
  1771  		// Handle delayed data.
  1772  		if e.EndpointState().connected() {
  1773  			e.sendData(nil /* next */)
  1774  		}
  1775  	}
  1776  }
  1777  
  1778  // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
  1779  func (e *Endpoint) OnCorkOptionSet(v bool) {
  1780  	if !v {
  1781  		e.LockUser()
  1782  		defer e.UnlockUser()
  1783  		if e.snd != nil {
  1784  			e.snd.corkTimer.disable()
  1785  		}
  1786  		// Handle the corked data.
  1787  		if e.EndpointState().connected() {
  1788  			e.sendData(nil /* next */)
  1789  		}
  1790  	}
  1791  }
  1792  
  1793  func (e *Endpoint) getSendBufferSize() int {
  1794  	return int(e.ops.GetSendBufferSize())
  1795  }
  1796  
// OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
//
// rcvBufSz is the requested size and oldSz the size it replaces. The returned
// newSz is rcvBufSz, raised if necessary so that it does not become zero when
// shifted right by the receive window scale. Calling this also sets
// e.RcvAutoParams.Disabled.
//
// The returned postSet re-acquires the endpoint lock and, if the endpoint is
// connected and the change moved the window from below to at-or-above the ACK
// threshold, calls e.rcv.nonZeroWindow().
func (e *Endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) {
	e.LockUser()

	sendNonZeroWindowUpdate := false
	e.rcvQueueMu.Lock()

	// Make sure the receive buffer size allows us to send a
	// non-zero window size.
	scale := uint8(0)
	if e.rcv != nil {
		scale = e.rcv.RcvWndScale
	}
	if rcvBufSz>>scale == 0 {
		rcvBufSz = 1 << scale
	}

	// Window derived from the available space under the old and the new
	// buffer size; their difference is the growth caused by this change.
	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
	e.RcvAutoParams.Disabled = true

	// Immediately send an ACK to uncork the sender silly window
	// syndrome prevention, when our available space grows above aMSS
	// or half receive buffer, whichever smaller.
	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
		sendNonZeroWindowUpdate = true
	}

	e.rcvQueueMu.Unlock()

	// The window update is deferred to postSet rather than sent here; the
	// closure captures the decision made above and takes the lock itself.
	postSet = func() {
		e.LockUser()
		defer e.UnlockUser()
		if e.EndpointState().connected() && sendNonZeroWindowUpdate {
			e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
		}

	}
	e.UnlockUser()
	return rcvBufSz, postSet
}
  1838  
  1839  // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize.
  1840  func (e *Endpoint) OnSetSendBufferSize(sz int64) int64 {
  1841  	e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1)
  1842  	return sz
  1843  }
  1844  
  1845  // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters.
  1846  func (e *Endpoint) WakeupWriters() {
  1847  	e.LockUser()
  1848  	defer e.UnlockUser()
  1849  
  1850  	sendBufferSize := e.getSendBufferSize()
  1851  	e.sndQueueInfo.sndQueueMu.Lock()
  1852  	notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1
  1853  	e.sndQueueInfo.sndQueueMu.Unlock()
  1854  
  1855  	if notify {
  1856  		e.waiterQueue.Notify(waiter.WritableEvents)
  1857  	}
  1858  }
  1859  
  1860  // SetSockOptInt sets a socket option.
  1861  func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
  1862  	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
  1863  	const inetECNMask = 3
  1864  
  1865  	switch opt {
  1866  	case tcpip.KeepaliveCountOption:
  1867  		e.LockUser()
  1868  		e.keepalive.Lock()
  1869  		e.keepalive.count = v
  1870  		e.keepalive.Unlock()
  1871  		e.resetKeepaliveTimer(true /* receivedData */)
  1872  		e.UnlockUser()
  1873  
  1874  	case tcpip.IPv4TOSOption:
  1875  		e.LockUser()
  1876  		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
  1877  		// ignore the bits for now.
  1878  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1879  		e.UnlockUser()
  1880  
  1881  	case tcpip.IPv6TrafficClassOption:
  1882  		e.LockUser()
  1883  		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
  1884  		// ignore the bits for now.
  1885  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1886  		e.UnlockUser()
  1887  
  1888  	case tcpip.MaxSegOption:
  1889  		userMSS := v
  1890  		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
  1891  			return &tcpip.ErrInvalidOptionValue{}
  1892  		}
  1893  		e.LockUser()
  1894  		e.userMSS = uint16(userMSS)
  1895  		e.UnlockUser()
  1896  
  1897  	case tcpip.MTUDiscoverOption:
  1898  		switch v := tcpip.PMTUDStrategy(v); v {
  1899  		case tcpip.PMTUDiscoveryWant, tcpip.PMTUDiscoveryDont, tcpip.PMTUDiscoveryDo:
  1900  			e.LockUser()
  1901  			e.pmtud = v
  1902  			e.UnlockUser()
  1903  		case tcpip.PMTUDiscoveryProbe:
  1904  			// We don't support a way to ignore MTU updates; it's
  1905  			// either on or it's off.
  1906  			return &tcpip.ErrNotSupported{}
  1907  		default:
  1908  			return &tcpip.ErrNotSupported{}
  1909  		}
  1910  
  1911  	case tcpip.IPv4TTLOption:
  1912  		e.LockUser()
  1913  		e.ipv4TTL = uint8(v)
  1914  		e.UnlockUser()
  1915  
  1916  	case tcpip.IPv6HopLimitOption:
  1917  		e.LockUser()
  1918  		e.ipv6HopLimit = int16(v)
  1919  		e.UnlockUser()
  1920  
  1921  	case tcpip.TCPSynCountOption:
  1922  		if v < 1 || v > 255 {
  1923  			return &tcpip.ErrInvalidOptionValue{}
  1924  		}
  1925  		e.LockUser()
  1926  		e.maxSynRetries = uint8(v)
  1927  		e.UnlockUser()
  1928  
  1929  	case tcpip.TCPWindowClampOption:
  1930  		if v == 0 {
  1931  			e.LockUser()
  1932  			switch e.EndpointState() {
  1933  			case StateClose, StateInitial:
  1934  				e.windowClamp = 0
  1935  				e.UnlockUser()
  1936  				return nil
  1937  			default:
  1938  				e.UnlockUser()
  1939  				return &tcpip.ErrInvalidOptionValue{}
  1940  			}
  1941  		}
  1942  		var rs tcpip.TCPReceiveBufferSizeRangeOption
  1943  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
  1944  			if v < rs.Min/2 {
  1945  				v = rs.Min / 2
  1946  			}
  1947  		}
  1948  		e.LockUser()
  1949  		e.windowClamp = uint32(v)
  1950  		e.UnlockUser()
  1951  	}
  1952  	return nil
  1953  }
  1954  
  1955  // HasNIC returns true if the NICID is defined in the stack or id is 0.
  1956  func (e *Endpoint) HasNIC(id int32) bool {
  1957  	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
  1958  }
  1959  
  1960  // SetSockOpt sets a socket option.
  1961  func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
  1962  	switch v := opt.(type) {
  1963  	case *tcpip.KeepaliveIdleOption:
  1964  		e.LockUser()
  1965  		e.keepalive.Lock()
  1966  		e.keepalive.idle = time.Duration(*v)
  1967  		e.keepalive.Unlock()
  1968  		e.resetKeepaliveTimer(true /* receivedData */)
  1969  		e.UnlockUser()
  1970  
  1971  	case *tcpip.KeepaliveIntervalOption:
  1972  		e.LockUser()
  1973  		e.keepalive.Lock()
  1974  		e.keepalive.interval = time.Duration(*v)
  1975  		e.keepalive.Unlock()
  1976  		e.resetKeepaliveTimer(true /* receivedData */)
  1977  		e.UnlockUser()
  1978  
  1979  	case *tcpip.TCPUserTimeoutOption:
  1980  		e.LockUser()
  1981  		e.userTimeout = time.Duration(*v)
  1982  		e.UnlockUser()
  1983  
  1984  	case *tcpip.CongestionControlOption:
  1985  		// Query the available cc algorithms in the stack and
  1986  		// validate that the specified algorithm is actually
  1987  		// supported in the stack.
  1988  		var avail tcpip.TCPAvailableCongestionControlOption
  1989  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
  1990  			return err
  1991  		}
  1992  		availCC := strings.Split(string(avail), " ")
  1993  		for _, cc := range availCC {
  1994  			if *v == tcpip.CongestionControlOption(cc) {
  1995  				e.LockUser()
  1996  				state := e.EndpointState()
  1997  				e.cc = *v
  1998  				switch state {
  1999  				case StateEstablished:
  2000  					if e.EndpointState() == state {
  2001  						e.snd.cc = e.snd.initCongestionControl(e.cc)
  2002  					}
  2003  				}
  2004  				e.UnlockUser()
  2005  				return nil
  2006  			}
  2007  		}
  2008  
  2009  		// Linux returns ENOENT when an invalid congestion
  2010  		// control algorithm is specified.
  2011  		return &tcpip.ErrNoSuchFile{}
  2012  
  2013  	case *tcpip.TCPLingerTimeoutOption:
  2014  		e.LockUser()
  2015  
  2016  		switch {
  2017  		case *v < 0:
  2018  			// Same as effectively disabling TCPLinger timeout.
  2019  			*v = -1
  2020  		case *v == 0:
  2021  			// Same as the stack default.
  2022  			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
  2023  			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
  2024  				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
  2025  			}
  2026  			*v = stackLingerTimeout
  2027  		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
  2028  			// Cap it to Stack's default TCP_LINGER2 timeout.
  2029  			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
  2030  		default:
  2031  		}
  2032  
  2033  		e.tcpLingerTimeout = time.Duration(*v)
  2034  		e.UnlockUser()
  2035  
  2036  	case *tcpip.TCPDeferAcceptOption:
  2037  		e.LockUser()
  2038  		if time.Duration(*v) > MaxRTO {
  2039  			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
  2040  		}
  2041  		e.deferAccept = time.Duration(*v)
  2042  		e.UnlockUser()
  2043  
  2044  	case *tcpip.SocketDetachFilterOption:
  2045  		return nil
  2046  
  2047  	default:
  2048  		return nil
  2049  	}
  2050  	return nil
  2051  }
  2052  
  2053  // readyReceiveSize returns the number of bytes ready to be received.
  2054  func (e *Endpoint) readyReceiveSize() (int, tcpip.Error) {
  2055  	e.LockUser()
  2056  	defer e.UnlockUser()
  2057  
  2058  	// The endpoint cannot be in listen state.
  2059  	if e.EndpointState() == StateListen {
  2060  		return 0, &tcpip.ErrInvalidEndpointState{}
  2061  	}
  2062  
  2063  	e.rcvQueueMu.Lock()
  2064  	defer e.rcvQueueMu.Unlock()
  2065  
  2066  	return e.RcvBufUsed, nil
  2067  }
  2068  
  2069  // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
  2070  func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
  2071  	switch opt {
  2072  	case tcpip.KeepaliveCountOption:
  2073  		e.keepalive.Lock()
  2074  		v := e.keepalive.count
  2075  		e.keepalive.Unlock()
  2076  		return v, nil
  2077  
  2078  	case tcpip.IPv4TOSOption:
  2079  		e.LockUser()
  2080  		v := int(e.sendTOS)
  2081  		e.UnlockUser()
  2082  		return v, nil
  2083  
  2084  	case tcpip.IPv6TrafficClassOption:
  2085  		e.LockUser()
  2086  		v := int(e.sendTOS)
  2087  		e.UnlockUser()
  2088  		return v, nil
  2089  
  2090  	case tcpip.MaxSegOption:
  2091  		// Linux only returns user_mss value if user_mss is set and the socket is
  2092  		// unconnected. Otherwise Linux returns the actual current MSS. Netstack
  2093  		// mimics the user_mss behavior, but otherwise just returns the defaultMSS
  2094  		// for now.
  2095  		v := header.TCPDefaultMSS
  2096  		e.LockUser()
  2097  		if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) {
  2098  			v = int(e.userMSS)
  2099  		}
  2100  		e.UnlockUser()
  2101  		return v, nil
  2102  
  2103  	case tcpip.MTUDiscoverOption:
  2104  		e.LockUser()
  2105  		v := e.pmtud
  2106  		e.UnlockUser()
  2107  		return int(v), nil
  2108  
  2109  	case tcpip.ReceiveQueueSizeOption:
  2110  		return e.readyReceiveSize()
  2111  
  2112  	case tcpip.IPv4TTLOption:
  2113  		e.LockUser()
  2114  		v := int(e.ipv4TTL)
  2115  		e.UnlockUser()
  2116  		return v, nil
  2117  
  2118  	case tcpip.IPv6HopLimitOption:
  2119  		e.LockUser()
  2120  		v := int(e.ipv6HopLimit)
  2121  		e.UnlockUser()
  2122  		return v, nil
  2123  
  2124  	case tcpip.TCPSynCountOption:
  2125  		e.LockUser()
  2126  		v := int(e.maxSynRetries)
  2127  		e.UnlockUser()
  2128  		return v, nil
  2129  
  2130  	case tcpip.TCPWindowClampOption:
  2131  		e.LockUser()
  2132  		v := int(e.windowClamp)
  2133  		e.UnlockUser()
  2134  		return v, nil
  2135  
  2136  	case tcpip.MulticastTTLOption:
  2137  		return 1, nil
  2138  
  2139  	default:
  2140  		return -1, &tcpip.ErrUnknownProtocolOption{}
  2141  	}
  2142  }
  2143  
  2144  func (e *Endpoint) getTCPInfo() tcpip.TCPInfoOption {
  2145  	info := tcpip.TCPInfoOption{}
  2146  	e.LockUser()
  2147  	if state := e.EndpointState(); state.internal() {
  2148  		info.State = tcpip.EndpointState(StateClose)
  2149  	} else {
  2150  		info.State = tcpip.EndpointState(state)
  2151  	}
  2152  	snd := e.snd
  2153  	if snd != nil {
  2154  		// We do not calculate RTT before sending the data packets. If
  2155  		// the connection did not send and receive data, then RTT will
  2156  		// be zero.
  2157  		snd.rtt.Lock()
  2158  		info.RTT = snd.rtt.TCPRTTState.SRTT
  2159  		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
  2160  		snd.rtt.Unlock()
  2161  
  2162  		info.RTO = snd.RTO
  2163  		info.CcState = snd.state
  2164  		info.SndSsthresh = uint32(snd.Ssthresh)
  2165  		info.SndCwnd = uint32(snd.SndCwnd)
  2166  		info.ReorderSeen = snd.rc.Reord
  2167  	}
  2168  	e.UnlockUser()
  2169  	return info
  2170  }
  2171  
  2172  // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
  2173  func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
  2174  	switch o := opt.(type) {
  2175  	case *tcpip.TCPInfoOption:
  2176  		*o = e.getTCPInfo()
  2177  
  2178  	case *tcpip.KeepaliveIdleOption:
  2179  		e.keepalive.Lock()
  2180  		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
  2181  		e.keepalive.Unlock()
  2182  
  2183  	case *tcpip.KeepaliveIntervalOption:
  2184  		e.keepalive.Lock()
  2185  		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
  2186  		e.keepalive.Unlock()
  2187  
  2188  	case *tcpip.TCPUserTimeoutOption:
  2189  		e.LockUser()
  2190  		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
  2191  		e.UnlockUser()
  2192  
  2193  	case *tcpip.CongestionControlOption:
  2194  		e.LockUser()
  2195  		*o = e.cc
  2196  		e.UnlockUser()
  2197  
  2198  	case *tcpip.TCPLingerTimeoutOption:
  2199  		e.LockUser()
  2200  		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
  2201  		e.UnlockUser()
  2202  
  2203  	case *tcpip.TCPDeferAcceptOption:
  2204  		e.LockUser()
  2205  		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
  2206  		e.UnlockUser()
  2207  
  2208  	case *tcpip.OriginalDestinationOption:
  2209  		e.LockUser()
  2210  		ipt := e.stack.IPTables()
  2211  		addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber)
  2212  		e.UnlockUser()
  2213  		if err != nil {
  2214  			return err
  2215  		}
  2216  		*o = tcpip.OriginalDestinationOption{
  2217  			Addr: addr,
  2218  			Port: port,
  2219  		}
  2220  
  2221  	default:
  2222  		return &tcpip.ErrUnknownProtocolOption{}
  2223  	}
  2224  	return nil
  2225  }
  2226  
  2227  // checkV4MappedLocked determines the effective network protocol and converts
  2228  // addr to its canonical form.
  2229  // +checklocks:e.mu
  2230  func (e *Endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
  2231  	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
  2232  	if err != nil {
  2233  		return tcpip.FullAddress{}, 0, err
  2234  	}
  2235  	return unwrapped, netProto, nil
  2236  }
  2237  
  2238  // Disconnect implements tcpip.Endpoint.Disconnect.
  2239  func (*Endpoint) Disconnect() tcpip.Error {
  2240  	return &tcpip.ErrNotSupported{}
  2241  }
  2242  
  2243  // Connect connects the endpoint to its peer.
  2244  func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
  2245  	e.LockUser()
  2246  	defer e.UnlockUser()
  2247  	err := e.connect(addr, true)
  2248  	if err != nil {
  2249  		if !err.IgnoreStats() {
  2250  			// Connect failed. Let's wake up any waiters.
  2251  			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  2252  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  2253  			e.stats.FailedConnectionAttempts.Increment()
  2254  		}
  2255  	}
  2256  	return err
  2257  }
  2258  
// registerEndpoint registers the endpoint with the stack's transport
// endpoint table for netProto.
//
// If the endpoint already has a local port it is registered under its current
// ID using the bound port flags and bound device. Otherwise an ephemeral port
// is picked, reserved and registered; on success the chosen ID and the
// reservation details are saved on the endpoint.
//
// addr is used as the Dest of every port reservation made here. nicID is only
// used when looking up an existing endpoint as a TIME-WAIT reuse candidate.
//
// +checklocks:e.mu
func (e *Endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error {
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// reuse says whether a port held by a TIME-WAIT endpoint may be taken
		// over: always under the global policy, and under the loopback-only
		// policy only when both ends of the connection are loopback addresses.
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		// The callback is invoked with a candidate port p. It returns
		// (true, nil) once p has been reserved and registered, (false, nil)
		// to reject p, and (false, err) on a registration failure other than
		// a port conflict. Presumably PickEphemeralPort moves on to another
		// candidate after a rejection - verify against its documentation.
		if _, err := e.stack.PickEphemeralPort(e.stack.SecureRNG(), func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
				// Only a port-in-use failure with reuse enabled is worth a
				// second look; anything else rejects this port.
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID)
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*Endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.transitionToStateCloseLocked()
				tcpEP.drainClosingSegmentQueue()
				tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			// The port is reserved at this point; register the endpoint under
			// an ID that carries it.
			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed, so give the reservation back before
				// reporting the outcome.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}
	return nil
}
  2392  
// connect connects the endpoint to its peer at addr.
//
// When handshake is true a new handshake is started and the endpoint moves to
// StateSynSent. When handshake is false (the restore path) no handshake is
// performed: queued segments are re-stamped with the endpoint's ID and the
// endpoint moves directly to StateEstablished. Both paths return
// *tcpip.ErrConnectStarted once the endpoint has been registered.
//
// Other errors are returned for an endpoint that is already connected or
// connecting, is in an error or otherwise unsuitable state, has no route to
// addr, or cannot be registered.
//
// +checklocks:e.mu
func (e *Endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error {
	// Remember the address as supplied by the caller, before
	// checkV4MappedLocked replaces addr with its canonical form.
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrHostUnreachable{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		// Report the recorded hard error if there is one, otherwise a
		// generic connection-aborted error.
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	// This releases the reference obtained from FindRoute; the endpoint takes
	// its own reference with r.Acquire() below before storing the route.
	defer r.Release()

	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	// Registration happens in StateConnecting; the previous state is put back
	// if it fails.
	oldState := e.EndpointState()
	e.setEndpointState(StateConnecting)
	if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil {
		e.setEndpointState(oldState)
		// A port conflict is reported to the caller as a bad local address.
		if _, ok := err.(*tcpip.ErrPortInUse); ok {
			return &tcpip.ErrBadLocalAddress{}
		}
		return err
	}

	e.isRegistered = true
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto}
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		// Stamp every segment in the receive queue and the write list with
		// the endpoint's ID, asserting the send waker once per segment.
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.ep.AssertLockHeld(e)
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
		return &tcpip.ErrConnectStarted{}
	}

	// Start a new handshake.
	h := e.newHandshake()
	e.setEndpointState(StateSynSent)
	h.start()
	e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()

	return &tcpip.ErrConnectStarted{}
}
  2506  
  2507  // ConnectEndpoint is not supported.
  2508  func (*Endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
  2509  	return &tcpip.ErrInvalidEndpointState{}
  2510  }
  2511  
  2512  // Shutdown closes the read and/or write end of the endpoint connection to its
  2513  // peer.
  2514  func (e *Endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
  2515  	e.LockUser()
  2516  	defer e.UnlockUser()
  2517  
  2518  	if e.EndpointState().connecting() {
  2519  		// When calling shutdown(2) on a connecting socket, the endpoint must
  2520  		// enter the error state. But this logic cannot belong to the shutdownLocked
  2521  		// method because that method is called during a close(2) (and closing a
  2522  		// connecting socket is not an error).
  2523  		e.handshakeFailed(&tcpip.ErrConnectionReset{})
  2524  		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
  2525  		return nil
  2526  	}
  2527  
  2528  	return e.shutdownLocked(flags)
  2529  }
  2530  
// shutdownLocked records flags in e.shutdownFlags and then closes the
// requested direction(s) of the endpoint according to its current state:
//
//   - Connected: a read shutdown marks the receive side closed and wakes
//     readers; a write shutdown queues and sends a FIN segment and wakes
//     writers. If, while handling a read shutdown, the write side is also
//     shut down and unread data remains, the connection is reset instead.
//   - Listening: a read shutdown marks the receive side closed, closes the
//     connections pending in the accept queue and notifies waiters.
//   - Any other state: ErrNotConnected is returned.
//
// The flags are accumulated before the state is examined, so they stay
// recorded even when ErrNotConnected is returned.
//
// +checklocks:e.mu
func (e *Endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed, and sample the amount of unread
			// data under the same lock.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that may be waiting for the stream to
			// become readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, the write end won't close and
				// we end up with a half-closed connection.
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed. This is only reported as an error when the
				// endpoint has reached TIME-WAIT.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment (an outgoing segment with an empty payload).
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that may be waiting for the stream to
			// become writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		// A write-only shutdown of a listening endpoint is a no-op.
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}
  2611  
  2612  // Listen puts the endpoint in "listen" mode, which allows it to accept
  2613  // new connections.
  2614  func (e *Endpoint) Listen(backlog int) tcpip.Error {
  2615  	if err := e.listen(backlog); err != nil {
  2616  		if !err.IgnoreStats() {
  2617  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  2618  			e.stats.FailedConnectionAttempts.Increment()
  2619  		}
  2620  		return err
  2621  	}
  2622  	return nil
  2623  }
  2624  
// listen does the work of Listen while holding the endpoint lock.
//
// If the endpoint is already listening (and not closed), only the accept
// queue is adjusted: the capacity becomes backlog, the shutdown flags are
// cleared and the receive side is re-opened. This fails with
// ErrInvalidEndpointState when more connections are already queued than
// backlog allows.
//
// Otherwise an endpoint in the initial state is first bound with an empty
// address, and an endpoint that is then not in the bound state is rejected
// with ErrInvalidEndpointState. A bound endpoint is moved to StateListen,
// registered with the stack, and given an accept queue and a listen context.
func (e *Endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		// acceptMu stays held until this function returns; every path
		// through this branch returns.
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
		}

		// Undo the effects of an earlier shutdown of the listening endpoint.
		e.shutdownFlags = 0
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		// Registration failed: give up on listening and close the endpoint.
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
	}
	// Only apply backlog when no capacity has been set yet; a non-zero
	// capacity is left untouched.
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}
  2699  
  2700  // Accept returns a new endpoint if a peer has established a connection
  2701  // to an endpoint previously set to listen mode.
  2702  //
  2703  // addr if not-nil will contain the peer address of the returned endpoint.
  2704  func (e *Endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
  2705  	e.LockUser()
  2706  	defer e.UnlockUser()
  2707  
  2708  	e.rcvQueueMu.Lock()
  2709  	rcvClosed := e.RcvClosed
  2710  	e.rcvQueueMu.Unlock()
  2711  	// Endpoint must be in listen state before it can accept connections.
  2712  	if rcvClosed || e.EndpointState() != StateListen {
  2713  		return nil, nil, &tcpip.ErrInvalidEndpointState{}
  2714  	}
  2715  
  2716  	// Get the new accepted endpoint.
  2717  	var n *Endpoint
  2718  	e.acceptMu.Lock()
  2719  	if element := e.acceptQueue.endpoints.Front(); element != nil {
  2720  		n = e.acceptQueue.endpoints.Remove(element).(*Endpoint)
  2721  	}
  2722  	e.acceptMu.Unlock()
  2723  	if n == nil {
  2724  		return nil, nil, &tcpip.ErrWouldBlock{}
  2725  	}
  2726  	if peerAddr != nil {
  2727  		*peerAddr = n.getRemoteAddress()
  2728  	}
  2729  	return n, n.waiterQueue, nil
  2730  }
  2731  
  2732  // Bind binds the endpoint to a specific local port and optionally address.
  2733  func (e *Endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
  2734  	e.LockUser()
  2735  	defer e.UnlockUser()
  2736  
  2737  	return e.bindLocked(addr)
  2738  }
  2739  
// bindLocked binds the endpoint to addr. It only succeeds from StateInitial:
// the address is checked against the stack's local addresses (when one is
// given), a port is reserved, the bound* fields are recorded and the endpoint
// moves to StateBound.
//
// It returns ErrAlreadyBound if the endpoint is not in the initial state,
// ErrBadLocalAddress if addr.Addr is set but CheckLocalAddress finds no NIC
// for it, and otherwise any error from checkV4MappedLocked or ReservePort.
//
// NOTE(review): e.BindAddr and e.TransportEndpointInfo.ID.LocalAddress are
// written before the later steps run and are not rolled back when those steps
// fail; confirm that this is intended.
//
// +checklocks:e.mu
func (e *Endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	// Remember the address as supplied by the caller, before
	// checkV4MappedLocked replaces addr below.
	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	// nic stays zero when no address is specified.
	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			// The error is deliberately not propagated: the callback just
			// reports this port as unusable.
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	// Record the settings that were in effect at bind time.
	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}
  2823  
  2824  // GetLocalAddress returns the address to which the endpoint is bound.
  2825  func (e *Endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
  2826  	e.LockUser()
  2827  	defer e.UnlockUser()
  2828  
  2829  	return tcpip.FullAddress{
  2830  		Addr: e.TransportEndpointInfo.ID.LocalAddress,
  2831  		Port: e.TransportEndpointInfo.ID.LocalPort,
  2832  		NIC:  e.boundNICID,
  2833  	}, nil
  2834  }
  2835  
  2836  // GetRemoteAddress returns the address to which the endpoint is connected.
  2837  func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
  2838  	e.LockUser()
  2839  	defer e.UnlockUser()
  2840  
  2841  	if !e.EndpointState().connected() {
  2842  		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
  2843  	}
  2844  
  2845  	return e.getRemoteAddress(), nil
  2846  }
  2847  
  2848  func (e *Endpoint) getRemoteAddress() tcpip.FullAddress {
  2849  	return tcpip.FullAddress{
  2850  		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
  2851  		Port: e.TransportEndpointInfo.ID.RemotePort,
  2852  		NIC:  e.boundNICID,
  2853  	}
  2854  }
  2855  
  2856  // HandlePacket implements stack.TransportEndpoint.HandlePacket.
  2857  func (*Endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
  2858  	// TCP HandlePacket is not required anymore as inbound packets first
  2859  	// land at the Dispatcher which then can either deliver using the
  2860  	// worker go routine or directly do the invoke the tcp processing inline
  2861  	// based on the state of the endpoint.
  2862  }
  2863  
  2864  func (e *Endpoint) enqueueSegment(s *segment) bool {
  2865  	// Send packet to worker goroutine.
  2866  	if !e.segmentQueue.enqueue(s) {
  2867  		// The queue is full, so we drop the segment.
  2868  		e.stack.Stats().DroppedPackets.Increment()
  2869  		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
  2870  		return false
  2871  	}
  2872  	return true
  2873  }
  2874  
// onICMPError handles a transport error reported against the endpoint.
//
// It always records err as the endpoint's last error. If the RecvError socket
// option matching the packet's network protocol (IPv4 or IPv6) is enabled, a
// SockError carrying err, transErr and the packet payload is queued on the
// socket. If the endpoint is still connecting, the connection attempt is
// failed: the endpoint is removed from its listener's pending set (if it has
// a listener), cleaned up, moved to StateError with err as the hard error,
// and waiters are notified.
//
// It panics if pkt carries a network protocol other than IPv4 or IPv6.
//
// NOTE(review): the connecting() check is made before e.mu is taken, and e.h
// is dereferenced without a nil check; presumably e.h is always set while the
// endpoint is connecting. Confirm.
func (e *Endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	// Pick the RecvError option that corresponds to the packet's protocol.
	var recvErr bool
	switch pkt.NetworkProtocolNumber {
	case header.IPv4ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv4RecvError()
	case header.IPv6ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv6RecvError()
	default:
		panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber))
	}

	if recvErr {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToView(),
			// Dst is the endpoint's remote side and Offender its local side,
			// both tagged with the NIC the packet was seen on.
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	// An error during connection establishment fails the attempt outright.
	if e.EndpointState().connecting() {
		e.mu.Lock()
		if lEP := e.h.listenEP; lEP != nil {
			// Remove from listening endpoints pending list.
			lEP.acceptMu.Lock()
			delete(lEP.acceptQueue.pendingEndpoints, e)
			lEP.acceptMu.Unlock()
			lEP.stats.FailedConnectionAttempts.Increment()
		}
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.cleanupLocked()
		e.hardError = err
		e.setEndpointState(StateError)
		e.mu.Unlock()
		// The segment queue is drained and waiters are notified only after
		// e.mu has been released.
		e.drainClosingSegmentQueue()
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
	}
}
  2930  
  2931  // HandleError implements stack.TransportEndpoint.
  2932  func (e *Endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) {
  2933  	handlePacketTooBig := func(mtu uint32) {
  2934  		e.sndQueueInfo.sndQueueMu.Lock()
  2935  		update := false
  2936  		if v := int(mtu); v < e.sndQueueInfo.SndMTU {
  2937  			e.sndQueueInfo.SndMTU = v
  2938  			update = true
  2939  		}
  2940  		newMTU := e.sndQueueInfo.SndMTU
  2941  		e.sndQueueInfo.sndQueueMu.Unlock()
  2942  		if update {
  2943  			e.mu.Lock()
  2944  			defer e.mu.Unlock()
  2945  			if e.snd != nil {
  2946  				e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu
  2947  			}
  2948  		}
  2949  	}
  2950  
  2951  	// TODO(gvisor.dev/issues/5270): Handle all transport errors.
  2952  	switch transErr.Kind() {
  2953  	case stack.PacketTooBigTransportError:
  2954  		handlePacketTooBig(transErr.Info())
  2955  	case stack.DestinationHostUnreachableTransportError:
  2956  		e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt)
  2957  	case stack.DestinationNetworkUnreachableTransportError:
  2958  		e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt)
  2959  	case stack.DestinationPortUnreachableTransportError:
  2960  		e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt)
  2961  	case stack.DestinationProtoUnreachableTransportError:
  2962  		e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt)
  2963  	case stack.SourceRouteFailedTransportError:
  2964  		e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt)
  2965  	case stack.SourceHostIsolatedTransportError:
  2966  		e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt)
  2967  	case stack.DestinationHostDownTransportError:
  2968  		e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt)
  2969  	}
  2970  }
  2971  
  2972  // updateSndBufferUsage is called by the protocol goroutine when room opens up
  2973  // in the send buffer. The number of newly available bytes is v.
  2974  func (e *Endpoint) updateSndBufferUsage(v int) {
  2975  	sendBufferSize := e.getSendBufferSize()
  2976  	e.sndQueueInfo.sndQueueMu.Lock()
  2977  	notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1
  2978  	e.sndQueueInfo.SndBufUsed -= v
  2979  
  2980  	// Get the new send buffer size with auto tuning, but do not set it
  2981  	// unless we decide to notify the writers.
  2982  	newSndBufSz := e.computeTCPSendBufferSize()
  2983  
  2984  	// We only notify when there is half the sendBufferSize available after
  2985  	// a full buffer event occurs. This ensures that we don't wake up
  2986  	// writers to queue just 1-2 segments and go back to sleep.
  2987  	notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1
  2988  	e.sndQueueInfo.sndQueueMu.Unlock()
  2989  
  2990  	if notify {
  2991  		// Set the new send buffer size calculated from auto tuning.
  2992  		e.ops.SetSendBufferSize(newSndBufSz, false /* notify */)
  2993  		e.waiterQueue.Notify(waiter.WritableEvents)
  2994  	}
  2995  }
  2996  
  2997  // readyToRead is called by the protocol goroutine when a new segment is ready
  2998  // to be read, or when the connection is closed for receiving (in which case
  2999  // s will be nil).
  3000  //
  3001  // +checklocks:e.mu
  3002  func (e *Endpoint) readyToRead(s *segment) {
  3003  	e.rcvQueueMu.Lock()
  3004  	if s != nil {
  3005  		e.RcvBufUsed += s.payloadSize()
  3006  		s.IncRef()
  3007  		e.rcvQueue.PushBack(s)
  3008  	} else {
  3009  		e.RcvClosed = true
  3010  	}
  3011  	e.rcvQueueMu.Unlock()
  3012  	e.waiterQueue.Notify(waiter.ReadableEvents)
  3013  }
  3014  
  3015  // receiveBufferAvailableLocked calculates how many bytes are still available
  3016  // in the receive buffer.
  3017  // +checklocks:e.rcvQueueMu
  3018  func (e *Endpoint) receiveBufferAvailableLocked(rcvBufSize int) int {
  3019  	// We may use more bytes than the buffer size when the receive buffer
  3020  	// shrinks.
  3021  	memUsed := e.receiveMemUsed()
  3022  	if memUsed >= rcvBufSize {
  3023  		return 0
  3024  	}
  3025  
  3026  	return rcvBufSize - memUsed
  3027  }
  3028  
  3029  // receiveBufferAvailable calculates how many bytes are still available in the
  3030  // receive buffer based on the actual memory used by all segments held in
  3031  // receive buffer/pending and segment queue.
  3032  func (e *Endpoint) receiveBufferAvailable() int {
  3033  	e.rcvQueueMu.Lock()
  3034  	available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize()))
  3035  	e.rcvQueueMu.Unlock()
  3036  	return available
  3037  }
  3038  
  3039  // receiveBufferUsed returns the amount of in-use receive buffer.
  3040  func (e *Endpoint) receiveBufferUsed() int {
  3041  	e.rcvQueueMu.Lock()
  3042  	used := e.RcvBufUsed
  3043  	e.rcvQueueMu.Unlock()
  3044  	return used
  3045  }
  3046  
  3047  // receiveMemUsed returns the total memory in use by segments held by this
  3048  // endpoint.
  3049  func (e *Endpoint) receiveMemUsed() int {
  3050  	return int(e.rcvMemUsed.Load())
  3051  }
  3052  
  3053  // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
  3054  func (e *Endpoint) updateReceiveMemUsed(delta int) {
  3055  	e.rcvMemUsed.Add(int32(delta))
  3056  }
  3057  
  3058  // maxReceiveBufferSize returns the stack wide maximum receive buffer size for
  3059  // an endpoint.
  3060  func (e *Endpoint) maxReceiveBufferSize() int {
  3061  	var rs tcpip.TCPReceiveBufferSizeRangeOption
  3062  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
  3063  		// As a fallback return the hardcoded max buffer size.
  3064  		return MaxBufferSize
  3065  	}
  3066  	return rs.Max
  3067  }
  3068  
  3069  // directionState returns the close state of send and receive part of the endpoint
  3070  func (e *Endpoint) connDirectionState() connDirectionState {
  3071  	return connDirectionState(e.connectionDirectionState.Load())
  3072  }
  3073  
  3074  // updateDirectionState updates the close state of send and receive part of the endpoint
  3075  func (e *Endpoint) updateConnDirectionState(state connDirectionState) connDirectionState {
  3076  	return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state)))
  3077  }
  3078  
  3079  // rcvWndScaleForHandshake computes the receive window scale to offer to the
  3080  // peer when window scaling is enabled (true by default). If auto-tuning is
  3081  // disabled then the window scaling factor is based on the size of the
  3082  // receiveBuffer otherwise we use the max permissible receive buffer size to
  3083  // compute the scale.
  3084  func (e *Endpoint) rcvWndScaleForHandshake() int {
  3085  	bufSizeForScale := e.ops.GetReceiveBufferSize()
  3086  
  3087  	e.rcvQueueMu.Lock()
  3088  	autoTuningDisabled := e.RcvAutoParams.Disabled
  3089  	e.rcvQueueMu.Unlock()
  3090  	if autoTuningDisabled {
  3091  		return FindWndScale(seqnum.Size(bufSizeForScale))
  3092  	}
  3093  
  3094  	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
  3095  }
  3096  
  3097  // updateRecentTimestamp updates the recent timestamp using the algorithm
  3098  // described in https://tools.ietf.org/html/rfc7323#section-4.3
  3099  func (e *Endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
  3100  	if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
  3101  		e.setRecentTimestamp(tsVal)
  3102  	}
  3103  }
  3104  
  3105  // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
  3106  // the SYN options indicate that timestamp option was negotiated. It also
  3107  // initializes the recentTS with the value provided in synOpts.TSval.
  3108  func (e *Endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) {
  3109  	if synOpts.TS {
  3110  		e.SendTSOk = true
  3111  		e.setRecentTimestamp(synOpts.TSVal)
  3112  	}
  3113  }
  3114  
  3115  func (e *Endpoint) tsVal(now tcpip.MonotonicTime) uint32 {
  3116  	return e.TSOffset.TSVal(now)
  3117  }
  3118  
  3119  func (e *Endpoint) tsValNow() uint32 {
  3120  	return e.tsVal(e.stack.Clock().NowMonotonic())
  3121  }
  3122  
  3123  func (e *Endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration {
  3124  	return e.TSOffset.Elapsed(now, tsEcr)
  3125  }
  3126  
  3127  // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
  3128  // if the SYN options indicate that the SACK option was negotiated and the TCP
  3129  // stack is configured to enable TCP SACK option.
  3130  func (e *Endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) {
  3131  	var v tcpip.TCPSACKEnabled
  3132  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
  3133  		// Stack doesn't support SACK. So just return.
  3134  		return
  3135  	}
  3136  	if bool(v) && synOpts.SACKPermitted {
  3137  		e.SACKPermitted = true
  3138  		e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)
  3139  	}
  3140  }
  3141  
  3142  // maxOptionSize return the maximum size of TCP options.
  3143  func (e *Endpoint) maxOptionSize() (size int) {
  3144  	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
  3145  	options := e.makeOptions(maxSackBlocks[:])
  3146  	size = len(options)
  3147  	putOptions(options)
  3148  
  3149  	return size
  3150  }
  3151  
// completeStateLocked fills s with a snapshot of the endpoint's current state.
// This is used before invoking the probe.
//
// The snapshot covers the endpoint's inner state and ID, the receiver and
// sender state, the send and receive buffer state, the SACK state, the RTT
// state and, when the congestion control is cubic, the cubic state. The send
// and receive buffer state are each copied under their own queue mutex, and
// the RTT state under the sender's rtt lock.
//
// NOTE(review): e.rcv and e.snd are dereferenced without nil checks;
// presumably callers only invoke this once both exist. Confirm.
//
// +checklocks:e.mu
func (e *Endpoint) completeStateLocked(s *stack.TCPEndpointState) {
	s.TCPEndpointStateInner = e.TCPEndpointStateInner
	s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID)
	s.SegTime = e.stack.Clock().NowMonotonic()
	s.Receiver = e.rcv.TCPReceiverState
	s.Sender = e.snd.TCPSenderState

	// The send buffer size is read before sndQueueMu is taken.
	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	e.sndQueueInfo.CloneState(&s.SndBufState)
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueMu.Lock()
	s.RcvBufState = e.TCPRcvBufState
	e.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state. Only the first NumBlocks SACK
	// blocks are copied, into a freshly allocated slice.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	// Cubic state is only present when cubic is the congestion control in
	// use. TimeSinceLastCongestion is computed relative to the current time.
	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	s.Sender.RetransmitTS = e.snd.retransmitTS
	s.Sender.SpuriousRecovery = e.snd.spuriousRecovery
}
  3193  
  3194  func (e *Endpoint) initHostGSO() {
  3195  	switch e.route.NetProto() {
  3196  	case header.IPv4ProtocolNumber:
  3197  		e.gso.Type = stack.GSOTCPv4
  3198  		e.gso.L3HdrLen = header.IPv4MinimumSize
  3199  	case header.IPv6ProtocolNumber:
  3200  		e.gso.Type = stack.GSOTCPv6
  3201  		e.gso.L3HdrLen = header.IPv6MinimumSize
  3202  	default:
  3203  		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
  3204  	}
  3205  	e.gso.NeedsCsum = true
  3206  	e.gso.CsumOffset = header.TCPChecksumOffset
  3207  	e.gso.MaxSize = e.route.GSOMaxSize()
  3208  }
  3209  
  3210  func (e *Endpoint) initGSO() {
  3211  	if e.route.HasHostGSOCapability() {
  3212  		e.initHostGSO()
  3213  	} else if e.route.HasGVisorGSOCapability() {
  3214  		e.gso = stack.GSO{
  3215  			MaxSize:   e.route.GSOMaxSize(),
  3216  			Type:      stack.GSOGvisor,
  3217  			NeedsCsum: false,
  3218  		}
  3219  	}
  3220  }
  3221  
  3222  // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
  3223  // state for diagnostics.
  3224  func (e *Endpoint) State() uint32 {
  3225  	return uint32(e.EndpointState())
  3226  }
  3227  
  3228  // Info returns a copy of the endpoint info.
  3229  func (e *Endpoint) Info() tcpip.EndpointInfo {
  3230  	e.LockUser()
  3231  	// Make a copy of the endpoint info.
  3232  	ret := e.TransportEndpointInfo
  3233  	e.UnlockUser()
  3234  	return &ret
  3235  }
  3236  
  3237  // Stats returns a pointer to the endpoint stats.
  3238  func (e *Endpoint) Stats() tcpip.EndpointStats {
  3239  	return &e.stats
  3240  }
  3241  
  3242  // Wait implements stack.TransportEndpoint.Wait.
  3243  func (e *Endpoint) Wait() {
  3244  	waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp)
  3245  	e.waiterQueue.EventRegister(&waitEntry)
  3246  	defer e.waiterQueue.EventUnregister(&waitEntry)
  3247  	switch e.EndpointState() {
  3248  	case StateClose, StateError:
  3249  		return
  3250  	}
  3251  	<-notifyCh
  3252  }
  3253  
  3254  // SocketOptions implements tcpip.Endpoint.SocketOptions.
  3255  func (e *Endpoint) SocketOptions() *tcpip.SocketOptions {
  3256  	return &e.ops
  3257  }
  3258  
  3259  // GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
  3260  func GetTCPSendBufferLimits(sh tcpip.StackHandler) tcpip.SendBufferSizeOption {
  3261  	// This type assertion is safe because only the TCP stack calls this
  3262  	// function.
  3263  	ss := sh.(*stack.Stack).TCPSendBufferLimits()
  3264  	return tcpip.SendBufferSizeOption{
  3265  		Min:     ss.Min,
  3266  		Default: ss.Default,
  3267  		Max:     ss.Max,
  3268  	}
  3269  }
  3270  
  3271  // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
  3272  func (e *Endpoint) allowOutOfWindowAck() bool {
  3273  	now := e.stack.Clock().NowMonotonic()
  3274  
  3275  	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
  3276  		var limit stack.TCPInvalidRateLimitOption
  3277  		if err := e.stack.Option(&limit); err != nil {
  3278  			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
  3279  		}
  3280  		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
  3281  			return false
  3282  		}
  3283  	}
  3284  
  3285  	e.lastOutOfWindowAckTime = now
  3286  	return true
  3287  }
  3288  
  3289  // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP.
  3290  func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
  3291  	var ss tcpip.TCPReceiveBufferSizeRangeOption
  3292  	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
  3293  		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
  3294  	}
  3295  
  3296  	return tcpip.ReceiveBufferSizeOption{
  3297  		Min:     ss.Min,
  3298  		Default: ss.Default,
  3299  		Max:     ss.Max,
  3300  	}
  3301  }
  3302  
  3303  // computeTCPSendBufferSize implements auto tuning of send buffer size and
  3304  // returns the new send buffer size.
  3305  func (e *Endpoint) computeTCPSendBufferSize() int64 {
  3306  	curSndBufSz := int64(e.getSendBufferSize())
  3307  
  3308  	// Auto tuning is disabled when the user explicitly sets the send
  3309  	// buffer size with SO_SNDBUF option.
  3310  	if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 {
  3311  		return curSndBufSz
  3312  	}
  3313  
  3314  	const packetOverheadFactor = 2
  3315  	curMSS := e.snd.MaxPayloadSize
  3316  	numSeg := InitialCwnd
  3317  	if numSeg < e.snd.SndCwnd {
  3318  		numSeg = e.snd.SndCwnd
  3319  	}
  3320  
  3321  	// SndCwnd indicates the number of segments that can be sent. This means
  3322  	// that the sender can send upto #SndCwnd segments and the send buffer
  3323  	// size should be set to SndCwnd*MSS to accommodate sending of all the
  3324  	// segments.
  3325  	newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor)
  3326  	if newSndBufSz < curSndBufSz {
  3327  		return curSndBufSz
  3328  	}
  3329  	if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz {
  3330  		newSndBufSz = int64(ss.Max)
  3331  	}
  3332  
  3333  	return newSndBufSz
  3334  }
  3335  
  3336  // GetAcceptConn implements tcpip.SocketOptionsHandler.
  3337  func (e *Endpoint) GetAcceptConn() bool {
  3338  	return EndpointState(e.State()) == StateListen
  3339  }