github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/transport/tcp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "container/heap" 19 "fmt" 20 "io" 21 "math" 22 "runtime" 23 "strings" 24 "time" 25 26 "github.com/metacubex/gvisor/pkg/atomicbitops" 27 "github.com/metacubex/gvisor/pkg/buffer" 28 "github.com/metacubex/gvisor/pkg/sleep" 29 "github.com/metacubex/gvisor/pkg/sync" 30 "github.com/metacubex/gvisor/pkg/tcpip" 31 "github.com/metacubex/gvisor/pkg/tcpip/header" 32 "github.com/metacubex/gvisor/pkg/tcpip/ports" 33 "github.com/metacubex/gvisor/pkg/tcpip/seqnum" 34 "github.com/metacubex/gvisor/pkg/tcpip/stack" 35 "github.com/metacubex/gvisor/pkg/waiter" 36 ) 37 38 // EndpointState represents the state of a TCP endpoint. 39 type EndpointState tcpip.EndpointState 40 41 // Endpoint states. Note that are represented in a netstack-specific manner and 42 // may not be meaningful externally. Specifically, they need to be translated to 43 // Linux's representation for these states if presented to userspace. 
44 const ( 45 _ EndpointState = iota 46 // TCP protocol states in sync with the definitions in 47 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 48 StateEstablished 49 StateSynSent 50 StateSynRecv 51 StateFinWait1 52 StateFinWait2 53 StateTimeWait 54 StateClose 55 StateCloseWait 56 StateLastAck 57 StateListen 58 StateClosing 59 60 // Endpoint states internal to netstack. 61 StateInitial 62 StateBound 63 StateConnecting // Connect() called, but the initial SYN hasn't been sent. 64 StateError 65 ) 66 67 const ( 68 // rcvAdvWndScale is used to split the available socket buffer into 69 // application buffer and the window to be advertised to the peer. This is 70 // currently hard coded to split the available space equally. 71 rcvAdvWndScale = 1 72 73 // SegOverheadFactor is used to multiply the value provided by the 74 // user on a SetSockOpt for setting the socket send/receive buffer sizes. 75 SegOverheadFactor = 2 76 ) 77 78 type connDirectionState uint32 79 80 // Connection direction states used for directionState checks in endpoint struct 81 // to detect half-closed connection and deliver POLLRDHUP 82 const ( 83 connDirectionStateOpen connDirectionState = 0 84 connDirectionStateRcvClosed connDirectionState = 1 85 connDirectionStateSndClosed connDirectionState = 2 86 connDirectionStateAll connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed 87 ) 88 89 // connected returns true when s is one of the states representing an 90 // endpoint connected to a peer. 91 func (s EndpointState) connected() bool { 92 switch s { 93 case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: 94 return true 95 default: 96 return false 97 } 98 } 99 100 // connecting returns true when s is one of the states representing a 101 // connection in progress, but not yet fully established. 
102 func (s EndpointState) connecting() bool { 103 switch s { 104 case StateConnecting, StateSynSent, StateSynRecv: 105 return true 106 default: 107 return false 108 } 109 } 110 111 // internal returns true when the state is netstack internal. 112 func (s EndpointState) internal() bool { 113 switch s { 114 case StateInitial, StateBound, StateConnecting, StateError: 115 return true 116 default: 117 return false 118 } 119 } 120 121 // handshake returns true when s is one of the states representing an endpoint 122 // in the middle of a TCP handshake. 123 func (s EndpointState) handshake() bool { 124 switch s { 125 case StateSynSent, StateSynRecv: 126 return true 127 default: 128 return false 129 } 130 } 131 132 // closed returns true when s is one of the states an endpoint transitions to 133 // when closed or when it encounters an error. This is distinct from a newly 134 // initialized endpoint that was never connected. 135 func (s EndpointState) closed() bool { 136 switch s { 137 case StateClose, StateError: 138 return true 139 default: 140 return false 141 } 142 } 143 144 // String implements fmt.Stringer.String. 145 func (s EndpointState) String() string { 146 switch s { 147 case StateInitial: 148 return "INITIAL" 149 case StateBound: 150 return "BOUND" 151 case StateConnecting: 152 return "CONNECTING" 153 case StateError: 154 return "ERROR" 155 case StateEstablished: 156 return "ESTABLISHED" 157 case StateSynSent: 158 return "SYN-SENT" 159 case StateSynRecv: 160 return "SYN-RCVD" 161 case StateFinWait1: 162 return "FIN-WAIT1" 163 case StateFinWait2: 164 return "FIN-WAIT2" 165 case StateTimeWait: 166 return "TIME-WAIT" 167 case StateClose: 168 return "CLOSED" 169 case StateCloseWait: 170 return "CLOSE-WAIT" 171 case StateLastAck: 172 return "LAST-ACK" 173 case StateListen: 174 return "LISTEN" 175 case StateClosing: 176 return "CLOSING" 177 default: 178 panic("unreachable") 179 } 180 } 181 182 // SACKInfo holds TCP SACK related information for a given endpoint. 
//
// +stateify savable
type SACKInfo struct {
	// Blocks is the maximum number of SACK blocks we track
	// per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}

// ReceiveErrors collects segment receive errors within the transport layer.
//
// +stateify savable
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}

// SendErrors collects segment send errors within the transport layer.
//
// +stateify savable
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments failed to be sent
	// to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}

// Stats holds statistics about the endpoint.
//
// +stateify savable
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}

// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}

// sndQueueInfo implements a send queue.
//
// +stateify savable
type sndQueueInfo struct {
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}

// CloneState clones sq's send-buffer bookkeeping into other. It is not thread
// safe: no lock is taken, and AutoTuneSndBufDisabled is read with RacyLoad,
// so concurrent mutation of sq may yield a torn snapshot.
func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) {
	other.SndBufSize = sq.SndBufSize
	other.SndBufUsed = sq.SndBufUsed
	other.SndClosed = sq.SndClosed
	other.PacketTooBigCount = sq.PacketTooBigCount
	other.SndMTU = sq.SndMTU
	other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad())
}

// Endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint must be held for all operations except
// in e.Readiness where acquiring it will result in a deadlock in epoll
// implementation.
//
// The following mutexes can be acquired independent of e.mu but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> Protects e.acceptQueue.
// e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue
// itself.
// e.sndQueueMu -> Protects the e.sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a short while.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type Endpoint struct {
	stack.TCPEndpointStateInner
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue endpoints for processing to
	// a given tcp processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	endpointEntry `state:"nosave"`

	// pendingProcessingMu protects pendingProcessing.
	pendingProcessingMu sync.Mutex `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	// +checklocks:pendingProcessingMu
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	protocol    *protocol     `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is stateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	rcvQueueMu sync.Mutex `state:"nosave"`

	// +checklocks:rcvQueueMu
	stack.TCPRcvBufState

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	rcvMemUsed atomicbitops.Int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	mu          sync.CrossGoroutineMutex `state:"nosave"`
	ownedByUser atomicbitops.Uint32

	// rcvQueue is the queue for ready-for-delivery segments.
	//
	// +checklocks:mu
	rcvQueue segmentList `state:"wait"`

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state atomicbitops.Uint32 `state:".(EndpointState)"`

	// connectionDirectionState holds current state of send and receive,
	// accessed atomically
	connectionDirectionState atomicbitops.Uint32

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to it's correct
	// state.
	origEndpointState uint32 `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ipv4TTL           uint8
	ipv6HopLimit      int16
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	// +checklocks:mu
	h *handshake

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (which ever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// recentTSTime is the unix time when we last updated
	// TCPEndpointStateInner.RecentTS.
	recentTSTime tcpip.MonotonicTime

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss recovery algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// sndQueueInfo contains the implementation of the endpoint's send queue.
	sndQueueInfo sndQueueInfo

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibly aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// acceptMu protects acceptQueue
	acceptMu sync.Mutex `state:"nosave"`

	// acceptQueue is used by a listening endpoint to send newly accepted
	// connections to the endpoint so that they can be read by Accept()
	// calls.
	//
	// +checklocks:acceptMu
	acceptQueue acceptQueue

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso stack.GSO

	stats Stats

	// tcpLingerTimeout is the maximum amount of time a socket
	// stays in TIME_WAIT state before being marked closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called closed on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime tcpip.MonotonicTime

	// finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer
	// is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close()
	// has been called on the socket. This timer is not started for sockets that
	// are waiting for a peer FIN but are not closed.
	finWait2Timer tcpip.Timer `state:"nosave"`

	// timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state
	// for tcp.DefaultTCPTimeWaitTimeout seconds.
	timeWaitTimer tcpip.Timer `state:"nosave"`

	// listenCtx is used by listening endpoints to store state used while listening for
	// connections. Nil otherwise.
	listenCtx *listenContext `state:"nosave"`
}

// UniqueID implements stack.TransportEndpoint.UniqueID.
func (e *Endpoint) UniqueID() uint64 {
	return e.uniqueID
}

// calculateAdvertisedMSS calculates the MSS to advertise.
//
// If userMSS is non-zero and is not greater than the maximum possible MSS for
// r, it will be used; otherwise, the maximum possible MSS will be used.
func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
	// The maximum possible MSS is dependent on the route.
	// TODO(b/143359391): Respect TCP Min and Max size.
	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)

	if userMSS != 0 && userMSS < maxMSS {
		return userMSS
	}

	return maxMSS
}

// isOwnedByUser() returns true if the endpoint lock is currently
// held by a user(syscall) goroutine.
func (e *Endpoint) isOwnedByUser() bool {
	return e.ownedByUser.Load() == 1
}

// LockUser tries to lock e.mu and if it fails it will check if the lock is held
// by another syscall goroutine. If yes, then it will go to sleep waiting for the
// lock to be released, if not then it will spin till it acquires the lock or
// another syscall goroutine acquires it in which case it will go to sleep as
// described above.
//
// The assumption behind spinning here being that background packet processing
// should not be holding the lock for long and spinning reduces latency as we
// avoid an expensive sleep/wakeup of the syscall goroutine).
// +checklocksacquire:e.mu
func (e *Endpoint) LockUser() {
	const iterations = 5
	// First spin phase: retry the TryLock without yielding the processor,
	// on the expectation that a background processor holding the lock will
	// release it very soon.
	for i := 0; i < iterations; i++ {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if e.ownedByUser.Load() == 1 {
				e.mu.Lock()
				e.ownedByUser.Store(1)
				return
			}
			// Spin but don't yield the processor since the lower half
			// should yield the lock soon.
			continue
		}
		e.ownedByUser.Store(1)
		return
	}

	// Second spin phase: same as above but with runtime.Gosched() between
	// attempts, giving other goroutines a chance to run.
	for i := 0; i < iterations; i++ {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if e.ownedByUser.Load() == 1 {
				e.mu.Lock()
				e.ownedByUser.Store(1)
				return
			}
			// Spin but yield the processor since the lower half
			// should yield the lock soon.
			runtime.Gosched()
			continue
		}
		e.ownedByUser.Store(1)
		return
	}

	// Finally just give up and wait for the Lock.
	e.mu.Lock()
	e.ownedByUser.Store(1)
}

// UnlockUser will check if there are any segments already queued for processing
// and wake up a processor goroutine to process them before unlocking e.mu.
// This is required because when packets arrive and the endpoint lock is already
// held then such packets are queued up to be processed.
//
// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
// +checklocksrelease:e.mu
func (e *Endpoint) UnlockUser() {
	// Lock segment queue before checking so that we avoid a race where
	// segments can be queued between the time we check if queue is empty
	// and actually unlock the endpoint mutex.
	e.segmentQueue.mu.Lock()
	if e.segmentQueue.emptyLocked() {
		if e.ownedByUser.Swap(0) != 1 {
			panic("e.UnlockUser() called without calling e.LockUser()")
		}
		e.mu.Unlock()
		e.segmentQueue.mu.Unlock()
		return
	}
	e.segmentQueue.mu.Unlock()

	// Since we are waking the processor goroutine here just unlock
	// and let it process the queued segments.
	if e.ownedByUser.Swap(0) != 1 {
		panic("e.UnlockUser() called without calling e.LockUser()")
	}
	processor := e.protocol.dispatcher.selectProcessor(e.ID)
	e.mu.Unlock()

	// Wake up the processor for this endpoint to process any queued
	// segments after releasing the lock to avoid the case where if the
	// processor goroutine starts running before we release the lock here
	// then it will fail to process as TryLock() will fail.
	processor.queueEndpoint(e)
	return
}

// StopWork halts packet processing. Only to be used in tests.
// +checklocksacquire:e.mu
func (e *Endpoint) StopWork() {
	e.mu.Lock()
}

// ResumeWork resumes packet processing. Only to be used in tests.
// +checklocksrelease:e.mu
func (e *Endpoint) ResumeWork() {
	e.mu.Unlock()
}

// AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is
// used in places where we know that e.mu is held, but checklocks does not,
// which can happen when creating new locked objects. You must pass the known
// locked endpoint to this function and it must be the same as the caller
// endpoint.
// TODO(b/226403629): Remove this function once checklocks understands local
// variable locks.
// +checklocks:locked.mu
// +checklocksacquire:e.mu
func (e *Endpoint) AssertLockHeld(locked *Endpoint) {
	if e != locked {
		panic("AssertLockHeld failed: locked endpoint != asserting endpoint")
	}
}

// TryLock is a helper that calls TryLock on the endpoint's mutex and
// adds the necessary checklocks annotations.
// TODO(b/226403629): Remove this once checklocks understands TryLock.
// +checklocksacquire:e.mu
func (e *Endpoint) TryLock() bool {
	if e.mu.TryLock() {
		return true // +checklocksforce
	}
	return false // +checklocksignore
}

// setEndpointState updates the state of the endpoint to state atomically.
// This method is unexported as the only place we should update the state is in
// this package but we allow the state to be read freely without holding e.mu.
//
// +checklocks:e.mu
func (e *Endpoint) setEndpointState(state EndpointState) {
	oldstate := EndpointState(e.state.Swap(uint32(state)))
	switch state {
	case StateEstablished:
		e.stack.Stats().TCP.CurrentEstablished.Increment()
		e.stack.Stats().TCP.CurrentConnected.Increment()
	case StateError:
		// StateError is accounted exactly like StateClose below.
		fallthrough
	case StateClose:
		if oldstate == StateCloseWait || oldstate == StateEstablished {
			e.stack.Stats().TCP.EstablishedResets.Increment()
		}
		if oldstate.connected() {
			e.stack.Stats().TCP.CurrentConnected.Decrement()
		}
		// Fall through so that leaving StateEstablished is also
		// accounted for by the default case below.
		fallthrough
	default:
		if oldstate == StateEstablished {
			e.stack.Stats().TCP.CurrentEstablished.Decrement()
		}
	}
}

// EndpointState returns the current state of the endpoint.
func (e *Endpoint) EndpointState() EndpointState {
	return EndpointState(e.state.Load())
}

// setRecentTimestamp sets the recentTS field to the provided value.
func (e *Endpoint) setRecentTimestamp(recentTS uint32) {
	e.RecentTS = recentTS
	e.recentTSTime = e.stack.Clock().NowMonotonic()
}

// recentTimestamp returns the value of the recentTS field.
func (e *Endpoint) recentTimestamp() uint32 {
	return e.RecentTS
}

// calculateTTL returns the TTL (IPv4) or hop limit (IPv6) to use for the
// route's network protocol, falling back to the route's default when the
// caller passed the sentinel "use default" value.
//
// TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a
// network.Endpoint, which also defines this function.
func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 {
	switch netProto := route.NetProto(); netProto {
	case header.IPv4ProtocolNumber:
		if ipv4TTL == tcpip.UseDefaultIPv4TTL {
			return route.DefaultTTL()
		}
		return ipv4TTL
	case header.IPv6ProtocolNumber:
		if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit {
			return route.DefaultTTL()
		}
		return uint8(ipv6HopLimit)
	default:
		panic(fmt.Sprintf("invalid protocol number = %d", netProto))
	}
}

// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	// idle is how long the connection may stay idle before the first
	// keepalive probe is sent.
	idle time.Duration
	// interval is the gap between successive keepalive probes.
	interval time.Duration
	// count is the number of unanswered probes allowed before the
	// connection is closed.
	count int
	// unacked is the number of probes sent without a response so far.
	unacked int
	// timer drives probe transmission; it should never be a zero timer if
	// the endpoint is not closed.
	timer timer       `state:"nosave"`
	waker sleep.Waker `state:"nosave"`
}

// newEndpoint creates and initializes a TCP endpoint for the given stack and
// network protocol, applying stack-wide TCP protocol options (buffer sizes,
// congestion control, linger timeout, SYN retries, ...) as defaults.
func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *Endpoint {
	e := &Endpoint{
		stack:    s,
		protocol: protocol,
		TransportEndpointInfo: stack.TransportEndpointInfo{
			NetProto:   netProto,
			TransProto: header.TCPProtocolNumber,
		},
		sndQueueInfo: sndQueueInfo{
			TCPSndBufState: stack.TCPSndBufState{
				SndMTU: math.MaxInt32,
			},
		},
		waiterQueue: waiterQueue,
		state:       atomicbitops.FromUint32(uint32(StateInitial)),
		keepalive: keepalive{
			idle:     DefaultKeepaliveIdle,
			interval: DefaultKeepaliveInterval,
			count:    DefaultKeepaliveCount,
		},
		uniqueID:     s.UniqueID(),
		ipv4TTL:      tcpip.UseDefaultIPv4TTL,
		ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit,
		// txHash only determines which outgoing queue to use, so
		// InsecureRNG is fine.
		txHash:        s.InsecureRNG().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
	}
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)

	// Stack-wide protocol options override the compile-time defaults when
	// they have been configured.
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		e.RcvAutoParams.Disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e

	// TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes
	// established.
	e.keepalive.timer.init(e.stack.Clock(), timerHandler(e, e.keepaliveTimerExpired))

	return e
}

// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
func (e *Endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	result := waiter.EventMask(0)

	switch e.EndpointState() {
	case StateInitial, StateBound:
		// This prevents blocking of new sockets which are not
		// connected when SO_LINGER is set.
		result |= waiter.EventHUp

	case StateConnecting, StateSynSent, StateSynRecv:
		// Ready for nothing.

	case StateClose, StateError, StateTimeWait:
		// Ready for anything.
		result = mask

	case StateListen:
		// Check if there's anything in the accepted queue.
		if (mask & waiter.ReadableEvents) != 0 {
			e.acceptMu.Lock()
			if e.acceptQueue.endpoints.Len() != 0 {
				result |= waiter.ReadableEvents
			}
			e.acceptMu.Unlock()
		}
	}
	if e.EndpointState().connected() {
		// Determine if the endpoint is writable if requested.
		if (mask & waiter.WritableEvents) != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			sndBufSize := e.getSendBufferSize()
			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
				result |= waiter.WritableEvents
			}
			if e.sndQueueInfo.SndClosed {
				e.updateConnDirectionState(connDirectionStateSndClosed)
			}
			e.sndQueueInfo.sndQueueMu.Unlock()
		}

		// Determine if the endpoint is readable if requested.
		if (mask & waiter.ReadableEvents) != 0 {
			e.rcvQueueMu.Lock()
			if e.RcvBufUsed > 0 || e.RcvClosed {
				result |= waiter.ReadableEvents
			}
			if e.RcvClosed {
				e.updateConnDirectionState(connDirectionStateRcvClosed)
			}
			e.rcvQueueMu.Unlock()
		}
	}

	// Determine whether endpoint is half-closed with rcv shutdown
	if e.connDirectionState() == connDirectionStateRcvClosed {
		result |= waiter.EventRdHUp
	}

	return result
}

// purgePendingRcvQueue drops all out-of-order segments held for reassembly.
// Purging pending rcv segments is only necessary on RST.
func (e *Endpoint) purgePendingRcvQueue() {
	if e.rcv != nil {
		for e.rcv.pendingRcvdSegments.Len() > 0 {
			s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment)
			s.DecRef()
		}
	}
}

// purgeReadQueue drops all ready-for-delivery segments and resets the
// receive-buffer accounting.
//
// +checklocks:e.mu
func (e *Endpoint) purgeReadQueue() {
	if e.rcv != nil {
		e.rcvQueueMu.Lock()
		defer e.rcvQueueMu.Unlock()
		for {
			s := e.rcvQueue.Front()
			if s == nil {
				break
			}
			e.rcvQueue.Remove(s)
			s.DecRef()
		}
		e.RcvBufUsed = 0
	}
}

// purgeWriteQueue drops all unsent segments, resets the send-buffer
// accounting and marks the send side closed.
//
// +checklocks:e.mu
func (e *Endpoint) purgeWriteQueue() {
	if e.snd != nil {
		e.sndQueueInfo.sndQueueMu.Lock()
		defer e.sndQueueInfo.sndQueueMu.Unlock()
		e.snd.updateWriteNext(nil)
		for {
			s := e.snd.writeList.Front()
			if s == nil {
				break
			}
			e.snd.writeList.Remove(s)
			s.DecRef()
		}
		e.sndQueueInfo.SndBufUsed = 0
		e.sndQueueInfo.SndClosed = true
	}
}

// Abort implements stack.TransportEndpoint.Abort.
func (e *Endpoint) Abort() {
	defer e.drainClosingSegmentQueue()
	e.LockUser()
	defer e.UnlockUser()
	defer e.purgeReadQueue()
	// Reset all connected endpoints.
1038 switch state := e.EndpointState(); { 1039 case state.connected(): 1040 e.resetConnectionLocked(&tcpip.ErrAborted{}) 1041 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1042 return 1043 } 1044 e.closeLocked() 1045 } 1046 1047 // Close puts the endpoint in a closed state and frees all resources associated 1048 // with it. It must be called only once and with no other concurrent calls to 1049 // the endpoint. 1050 func (e *Endpoint) Close() { 1051 e.LockUser() 1052 if e.closed { 1053 e.UnlockUser() 1054 return 1055 } 1056 1057 // We always want to purge the read queue, but do so after the checks in 1058 // shutdownLocked. 1059 e.closeLocked() 1060 e.purgeReadQueue() 1061 if e.EndpointState() == StateClose || e.EndpointState() == StateError { 1062 // It should be safe to purge the read queue now as the endpoint 1063 // is now closed or in an error state and further reads are not 1064 // permitted. 1065 e.UnlockUser() 1066 e.drainClosingSegmentQueue() 1067 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1068 return 1069 } 1070 e.UnlockUser() 1071 } 1072 1073 // +checklocks:e.mu 1074 func (e *Endpoint) closeLocked() { 1075 linger := e.SocketOptions().GetLinger() 1076 if linger.Enabled && linger.Timeout == 0 { 1077 s := e.EndpointState() 1078 isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv 1079 if isResetState { 1080 // Close the endpoint without doing full shutdown and 1081 // send a RST. 1082 e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) 1083 return 1084 } 1085 } 1086 1087 // Issue a shutdown so that the peer knows we won't send any more data 1088 // if we're connected, or stop accepting if we're listening. 
1089 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1090 e.closeNoShutdownLocked() 1091 } 1092 1093 // closeNoShutdown closes the endpoint without doing a full shutdown. 1094 // +checklocks:e.mu 1095 func (e *Endpoint) closeNoShutdownLocked() { 1096 // For listening sockets, we always release ports inline so that they 1097 // are immediately available for reuse after Close() is called. If also 1098 // registered, we unregister as well otherwise the next user would fail 1099 // in Listen() when trying to register. 1100 if e.EndpointState() == StateListen && e.isPortReserved { 1101 if e.isRegistered { 1102 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1103 e.isRegistered = false 1104 } 1105 1106 portRes := ports.Reservation{ 1107 Networks: e.effectiveNetProtos, 1108 Transport: ProtocolNumber, 1109 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1110 Port: e.TransportEndpointInfo.ID.LocalPort, 1111 Flags: e.boundPortFlags, 1112 BindToDevice: e.boundBindToDevice, 1113 Dest: e.boundDest, 1114 } 1115 e.stack.ReleasePort(portRes) 1116 e.isPortReserved = false 1117 e.boundBindToDevice = 0 1118 e.boundPortFlags = ports.Flags{} 1119 e.boundDest = tcpip.FullAddress{} 1120 } 1121 1122 // Mark endpoint as closed. 1123 e.closed = true 1124 tcpip.AddDanglingEndpoint(e) 1125 1126 eventMask := waiter.ReadableEvents | waiter.WritableEvents 1127 1128 switch e.EndpointState() { 1129 case StateInitial, StateBound, StateListen: 1130 e.setEndpointState(StateClose) 1131 fallthrough 1132 case StateClose, StateError: 1133 eventMask |= waiter.EventHUp 1134 e.cleanupLocked() 1135 case StateConnecting, StateSynSent, StateSynRecv: 1136 // Abort the handshake and set the error. 1137 // Notify that the endpoint is closed. 1138 eventMask |= waiter.EventHUp 1139 e.handshakeFailed(&tcpip.ErrAborted{}) 1140 // Notify that the endpoint is closed. 
1141 eventMask |= waiter.EventHUp 1142 case StateFinWait2: 1143 // The socket has been closed and we are in FIN-WAIT-2 so start 1144 // the FIN-WAIT-2 timer. 1145 if e.finWait2Timer == nil { 1146 e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) 1147 } 1148 } 1149 1150 e.waiterQueue.Notify(eventMask) 1151 } 1152 1153 // closePendingAcceptableConnections closes all connections that have completed 1154 // handshake but not yet been delivered to the application. 1155 func (e *Endpoint) closePendingAcceptableConnectionsLocked() { 1156 e.acceptMu.Lock() 1157 1158 pendingEndpoints := e.acceptQueue.pendingEndpoints 1159 e.acceptQueue.pendingEndpoints = nil 1160 1161 completedEndpoints := make([]*Endpoint, 0, e.acceptQueue.endpoints.Len()) 1162 for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() { 1163 completedEndpoints = append(completedEndpoints, n.Value.(*Endpoint)) 1164 } 1165 e.acceptQueue.endpoints.Init() 1166 e.acceptQueue.capacity = 0 1167 e.acceptMu.Unlock() 1168 1169 // Close any endpoints in SYN-RCVD state. 1170 for n := range pendingEndpoints { 1171 n.Abort() 1172 } 1173 1174 // Reset all connections that are waiting to be accepted. 1175 for _, n := range completedEndpoints { 1176 n.Abort() 1177 } 1178 } 1179 1180 // cleanupLocked frees all resources associated with the endpoint. 1181 // +checklocks:e.mu 1182 func (e *Endpoint) cleanupLocked() { 1183 if e.snd != nil { 1184 e.snd.resendTimer.cleanup() 1185 e.snd.probeTimer.cleanup() 1186 e.snd.reorderTimer.cleanup() 1187 e.snd.corkTimer.cleanup() 1188 } 1189 1190 if e.finWait2Timer != nil { 1191 e.finWait2Timer.Stop() 1192 } 1193 1194 if e.timeWaitTimer != nil { 1195 e.timeWaitTimer.Stop() 1196 } 1197 1198 // Close all endpoints that might have been accepted by TCP but not by 1199 // the client. 
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.purgeWriteQueue()
	// Only purge the read queue here if the socket is fully closed by the
	// user.
	if e.closed {
		e.purgeReadQueue()
	}
	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}

// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}

// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
func (e *Endpoint) initialReceiveWindow() int {
	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
	// The window field in an unscaled SYN/SYN-ACK is at most 16 bits.
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	rcvWndScale := e.rcvWndScaleForHandshake()

	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
	// window offered in SYN won't be reduced due to the loss of precision if
	// window scaling is enabled after the handshake.
	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)

	// Ensure we can always accept at least 1 byte if the scale specified
	// was too high for the provided rcvWnd.
	if rcvWnd == 0 {
		rcvWnd = 1
	}

	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
func (e *Endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	sendNonZeroWindowUpdate := false

	e.rcvQueueMu.Lock()
	if e.RcvAutoParams.Disabled {
		e.rcvQueueMu.Unlock()
		return
	}
	now := e.stack.Clock().NowMonotonic()
	// Accumulate copied bytes until at least one RTT has elapsed since the
	// last measurement; tuning decisions are made once per RTT.
	if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt {
		e.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				sendNonZeroWindowUpdate = true
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	e.RcvAutoParams.MeasureTime = now
	e.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueMu.Unlock()

	// Send the update after unlocking rcvQueueMu as sending a segment acquires
	// the lock to calculate the window to be sent.
	if e.EndpointState().connected() && sendNonZeroWindowUpdate {
		e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
	}
}

// SetOwner implements tcpip.Endpoint.SetOwner.
1355 func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) { 1356 e.owner = owner 1357 } 1358 1359 // +checklocks:e.mu 1360 func (e *Endpoint) hardErrorLocked() tcpip.Error { 1361 err := e.hardError 1362 e.hardError = nil 1363 return err 1364 } 1365 1366 // +checklocks:e.mu 1367 func (e *Endpoint) lastErrorLocked() tcpip.Error { 1368 e.lastErrorMu.Lock() 1369 defer e.lastErrorMu.Unlock() 1370 err := e.lastError 1371 e.lastError = nil 1372 return err 1373 } 1374 1375 // LastError implements tcpip.Endpoint.LastError. 1376 func (e *Endpoint) LastError() tcpip.Error { 1377 e.LockUser() 1378 defer e.UnlockUser() 1379 if err := e.hardErrorLocked(); err != nil { 1380 return err 1381 } 1382 return e.lastErrorLocked() 1383 } 1384 1385 // LastErrorLocked reads and clears lastError. 1386 // Only to be used in tests. 1387 // +checklocks:e.mu 1388 func (e *Endpoint) LastErrorLocked() tcpip.Error { 1389 return e.lastErrorLocked() 1390 } 1391 1392 // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. 1393 func (e *Endpoint) UpdateLastError(err tcpip.Error) { 1394 e.LockUser() 1395 e.lastErrorMu.Lock() 1396 e.lastError = err 1397 e.lastErrorMu.Unlock() 1398 e.UnlockUser() 1399 } 1400 1401 // Read implements tcpip.Endpoint.Read. 1402 func (e *Endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { 1403 e.LockUser() 1404 defer e.UnlockUser() 1405 1406 if err := e.checkReadLocked(); err != nil { 1407 if _, ok := err.(*tcpip.ErrClosedForReceive); ok { 1408 e.stats.ReadErrors.ReadClosed.Increment() 1409 } 1410 return tcpip.ReadResult{}, err 1411 } 1412 1413 var err error 1414 done := 0 1415 // N.B. Here we get the first segment to be processed. It is safe to not 1416 // hold rcvQueueMu when processing, since we hold e.mu to ensure we only 1417 // remove segments from the list through Read() and that new segments 1418 // cannot be appended. 
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// Peeking leaves the queue untouched; just walk forward.
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Drop fully-consumed segments from the head of the queue.
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// checkRead checks that endpoint is in a readable state.
//
// +checklocks:e.mu
func (e *Endpoint) checkReadLocked() tcpip.Error {
	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()
	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	bufUsed := e.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			if err := e.hardErrorLocked(); err != nil {
				return err
			}
			return &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return &tcpip.ErrNotConnected{}
	}

	if e.RcvBufUsed == 0 {
		if e.RcvClosed || !e.EndpointState().connected() {
			return &tcpip.ErrClosedForReceive{}
		}
		return &tcpip.ErrWouldBlock{}
	}

	return nil
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *Endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReader(p, int64(avail)); err != nil {
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
1587 // +checklocks:e.mu 1588 func (e *Endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) { 1589 e.sndQueueInfo.sndQueueMu.Lock() 1590 defer e.sndQueueInfo.sndQueueMu.Unlock() 1591 1592 avail, err := e.isEndpointWritableLocked() 1593 if err != nil { 1594 e.stats.WriteErrors.WriteClosed.Increment() 1595 return nil, 0, err 1596 } 1597 1598 buf, err := e.readFromPayloader(p, opts, avail) 1599 if err != nil { 1600 return nil, 0, err 1601 } 1602 1603 // Do not queue zero length segments. 1604 if buf.Size() == 0 { 1605 return nil, 0, nil 1606 } 1607 1608 if !opts.Atomic { 1609 // Since we released locks in between it's possible that the 1610 // endpoint transitioned to a CLOSED/ERROR states so make 1611 // sure endpoint is still writable before trying to write. 1612 avail, err := e.isEndpointWritableLocked() 1613 if err != nil { 1614 e.stats.WriteErrors.WriteClosed.Increment() 1615 buf.Release() 1616 return nil, 0, err 1617 } 1618 1619 // A simultaneous call to write on the socket can reduce avail. Discard 1620 // excess data copied if this is the case. 1621 if int64(avail) < buf.Size() { 1622 buf.Truncate(int64(avail)) 1623 } 1624 } 1625 1626 // Add data to the send queue. 1627 size := int(buf.Size()) 1628 s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf) 1629 e.sndQueueInfo.SndBufUsed += size 1630 e.snd.writeList.PushBack(s) 1631 1632 return s, size, nil 1633 } 1634 1635 // Write writes data to the endpoint's peer. 1636 func (e *Endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { 1637 // Linux completely ignores any address passed to sendto(2) for TCP sockets 1638 // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More 1639 // and opts.EndOfRecord are also ignored. 1640 1641 e.LockUser() 1642 defer e.UnlockUser() 1643 1644 // Return if either we didn't queue anything or if an error occurred while 1645 // attempting to queue data. 
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *Endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out socket buffer space before we can fill the
	// previous window we advertised. In cases where we receive MSS sized or close
	// MSS sized segments we will probably run out of window space before we
	// exhaust receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
// +checklocks:e.mu
func (e *Endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large. We also want to ensure a
// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
// receive buffer size. This is chosen arbitrarily.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *Endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked(rcvBufSize))
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size that
	// is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}

	switch {
	case oldAvail < threshold && newAvail >= threshold:
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
func (e *Endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
func (e *Endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
func (e *Endpoint) OnKeepAliveSet(bool) {
	e.LockUser()
	e.resetKeepaliveTimer(true /* receivedData */)
	e.UnlockUser()
}

// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
func (e *Endpoint) OnDelayOptionSet(v bool) {
	if !v {
		e.LockUser()
		defer e.UnlockUser()
		// Handle delayed data.
		if e.EndpointState().connected() {
			e.sendData(nil /* next */)
		}
	}
}

// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
func (e *Endpoint) OnCorkOptionSet(v bool) {
	if !v {
		e.LockUser()
		defer e.UnlockUser()
		if e.snd != nil {
			e.snd.corkTimer.disable()
		}
		// Handle the corked data.
		if e.EndpointState().connected() {
			e.sendData(nil /* next */)
		}
	}
}

// getSendBufferSize returns the current send buffer size, in bytes.
func (e *Endpoint) getSendBufferSize() int {
	return int(e.ops.GetSendBufferSize())
}

// OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
func (e *Endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) {
	e.LockUser()

	sendNonZeroWindowUpdate := false
	e.rcvQueueMu.Lock()

	// Make sure the receive buffer size allows us to send a
	// non-zero window size.
	scale := uint8(0)
	if e.rcv != nil {
		scale = e.rcv.RcvWndScale
	}
	if rcvBufSz>>scale == 0 {
		rcvBufSz = 1 << scale
	}

	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
	// An explicit buffer size disables auto-tuning.
	e.RcvAutoParams.Disabled = true

	// Immediately send an ACK to uncork the sender silly window
	// syndrome prevention, when our available space grows above aMSS
	// or half receive buffer, whichever smaller.
	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
		sendNonZeroWindowUpdate = true
	}

	e.rcvQueueMu.Unlock()

	// The window update is deferred to postSet, which runs after e.mu has
	// been released, since sending a segment re-acquires endpoint locks.
	postSet = func() {
		e.LockUser()
		defer e.UnlockUser()
		if e.EndpointState().connected() && sendNonZeroWindowUpdate {
			e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
		}

	}
	e.UnlockUser()
	return rcvBufSz, postSet
}

// OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize.
func (e *Endpoint) OnSetSendBufferSize(sz int64) int64 {
	// An explicit buffer size disables send-buffer auto-tuning.
	e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1)
	return sz
}

// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters.
func (e *Endpoint) WakeupWriters() {
	e.LockUser()
	defer e.UnlockUser()

	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// SetSockOptInt sets a socket option.
func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
	const inetECNMask = 3

	switch opt {
	case tcpip.KeepaliveCountOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.count = v
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case tcpip.IPv4TOSOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.userMSS = uint16(userMSS)
		e.UnlockUser()

	case tcpip.MTUDiscoverOption:
		// Return not supported if attempting to set this option to
		// anything other than path MTU discovery disabled.
		if v != tcpip.PMTUDiscoveryDont {
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.IPv4TTLOption:
		e.LockUser()
		e.ipv4TTL = uint8(v)
		e.UnlockUser()

	case tcpip.IPv6HopLimitOption:
		e.LockUser()
		e.ipv6HopLimit = int16(v)
		e.UnlockUser()

	case tcpip.TCPSynCountOption:
		if v < 1 || v > 255 {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.maxSynRetries = uint8(v)
		e.UnlockUser()

	case tcpip.TCPWindowClampOption:
		if v == 0 {
			e.LockUser()
			// A zero clamp is only valid before the connection is
			// established.
			switch e.EndpointState() {
			case StateClose, StateInitial:
				e.windowClamp = 0
				e.UnlockUser()
				return nil
			default:
				e.UnlockUser()
				return &tcpip.ErrInvalidOptionValue{}
			}
		}
		// Clamp the value to at least half the minimum receive buffer
		// size, mirroring Linux behavior for TCP_WINDOW_CLAMP.
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if v < rs.Min/2 {
				v = rs.Min / 2
			}
		}
		e.LockUser()
		e.windowClamp = uint32(v)
		e.UnlockUser()
	}
	return nil
}

// HasNIC returns true if the NICID is defined in the stack or id is 0.
func (e *Endpoint) HasNIC(id int32) bool {
	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt sets a socket option.
func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.KeepaliveIntervalOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				switch state {
				case StateEstablished:
					// Reinitialize the sender's congestion
					// control only on an already-established
					// connection; otherwise the new algorithm
					// takes effect when the connection is set up.
					if e.EndpointState() == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to Stack's default TCP_LINGER2 timeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		// Cap the defer-accept timeout to the maximum retransmission
		// timeout, mirroring the SYN-retry bound.
		e.LockUser()
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
func (e *Endpoint) readyReceiveSize() (int, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The endpoint cannot be in listen state.
	if e.EndpointState() == StateListen {
		return 0, &tcpip.ErrInvalidEndpointState{}
	}

	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()

	return e.RcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		v := e.keepalive.count
		e.keepalive.Unlock()
		return v, nil

	case tcpip.IPv4TOSOption:
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6TrafficClassOption:
		// Note: the same sendTOS field backs both the v4 TOS and the
		// v6 traffic class options.
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.MaxSegOption:
		// Linux only returns user_mss value if user_mss is set and the socket is
		// unconnected. Otherwise Linux returns the actual current MSS. Netstack
		// mimics the user_mss behavior, but otherwise just returns the defaultMSS
		// for now.
		v := header.TCPDefaultMSS
		e.LockUser()
		if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) {
			v = int(e.userMSS)
		}
		e.UnlockUser()
		return v, nil

	case tcpip.MTUDiscoverOption:
		// Always return the path MTU discovery disabled setting since
		// it's the only one supported.
		return tcpip.PMTUDiscoveryDont, nil

	case tcpip.ReceiveQueueSizeOption:
		return e.readyReceiveSize()

	case tcpip.IPv4TTLOption:
		e.LockUser()
		v := int(e.ipv4TTL)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6HopLimitOption:
		e.LockUser()
		v := int(e.ipv6HopLimit)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPSynCountOption:
		e.LockUser()
		v := int(e.maxSynRetries)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPWindowClampOption:
		e.LockUser()
		v := int(e.windowClamp)
		e.UnlockUser()
		return v, nil

	case tcpip.MulticastTTLOption:
		return 1, nil

	default:
		return -1, &tcpip.ErrUnknownProtocolOption{}
	}
}

// getTCPInfo returns a snapshot of the endpoint's state in the form consumed
// by the TCP_INFO socket option. Internal netstack states are reported as
// StateClose since they have no external meaning.
func (e *Endpoint) getTCPInfo() tcpip.TCPInfoOption {
	info := tcpip.TCPInfoOption{}
	e.LockUser()
	if state := e.EndpointState(); state.internal() {
		info.State = tcpip.EndpointState(StateClose)
	} else {
		info.State = tcpip.EndpointState(state)
	}
	snd := e.snd
	if snd != nil {
		// We do not calculate RTT before sending the data packets. If
		// the connection did not send and receive data, then RTT will
		// be zero.
		snd.rtt.Lock()
		info.RTT = snd.rtt.TCPRTTState.SRTT
		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
		snd.rtt.Unlock()

		info.RTO = snd.RTO
		info.CcState = snd.state
		info.SndSsthresh = uint32(snd.Ssthresh)
		info.SndCwnd = uint32(snd.SndCwnd)
		info.ReorderSeen = snd.rc.Reord
	}
	e.UnlockUser()
	return info
}

// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
2148 func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 2149 switch o := opt.(type) { 2150 case *tcpip.TCPInfoOption: 2151 *o = e.getTCPInfo() 2152 2153 case *tcpip.KeepaliveIdleOption: 2154 e.keepalive.Lock() 2155 *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) 2156 e.keepalive.Unlock() 2157 2158 case *tcpip.KeepaliveIntervalOption: 2159 e.keepalive.Lock() 2160 *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) 2161 e.keepalive.Unlock() 2162 2163 case *tcpip.TCPUserTimeoutOption: 2164 e.LockUser() 2165 *o = tcpip.TCPUserTimeoutOption(e.userTimeout) 2166 e.UnlockUser() 2167 2168 case *tcpip.CongestionControlOption: 2169 e.LockUser() 2170 *o = e.cc 2171 e.UnlockUser() 2172 2173 case *tcpip.TCPLingerTimeoutOption: 2174 e.LockUser() 2175 *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) 2176 e.UnlockUser() 2177 2178 case *tcpip.TCPDeferAcceptOption: 2179 e.LockUser() 2180 *o = tcpip.TCPDeferAcceptOption(e.deferAccept) 2181 e.UnlockUser() 2182 2183 case *tcpip.OriginalDestinationOption: 2184 e.LockUser() 2185 ipt := e.stack.IPTables() 2186 addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber) 2187 e.UnlockUser() 2188 if err != nil { 2189 return err 2190 } 2191 *o = tcpip.OriginalDestinationOption{ 2192 Addr: addr, 2193 Port: port, 2194 } 2195 2196 default: 2197 return &tcpip.ErrUnknownProtocolOption{} 2198 } 2199 return nil 2200 } 2201 2202 // checkV4MappedLocked determines the effective network protocol and converts 2203 // addr to its canonical form. 2204 // +checklocks:e.mu 2205 func (e *Endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { 2206 unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) 2207 if err != nil { 2208 return tcpip.FullAddress{}, 0, err 2209 } 2210 return unwrapped, netProto, nil 2211 } 2212 2213 // Disconnect implements tcpip.Endpoint.Disconnect. 
func (*Endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	err := e.connect(addr, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// registerEndpoint registers the endpoint with the provided address.
//
// +checklocks:e.mu
func (e *Endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error {
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// TIME-WAIT reuse may be enabled globally or restricted to
		// loopback-to-loopback connections only.
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		if _, err := e.stack.PickEphemeralPort(e.stack.SecureRNG(), func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID)
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*Endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.transitionToStateCloseLocked()
				tcpEP.drainClosingSegmentQueue()
				tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed; release the reservation made above
				// before deciding whether to retry with another port.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}
	return nil
}

// connect connects the endpoint to its peer.
// +checklocks:e.mu
func (e *Endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error {
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrHostUnreachable{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	// Move to Connecting before registering so the demuxer never sees a
	// registered endpoint still in Bound; roll back on failure.
	oldState := e.EndpointState()
	e.setEndpointState(StateConnecting)
	if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil {
		e.setEndpointState(oldState)
		if _, ok := err.(*tcpip.ErrPortInUse); ok {
			return &tcpip.ErrBadLocalAddress{}
		}
		return err
	}

	e.isRegistered = true
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto}
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.ep.AssertLockHeld(e)
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
		return &tcpip.ErrConnectStarted{}
	}

	// Start a new handshake.
	h := e.newHandshake()
	e.setEndpointState(StateSynSent)
	h.start()
	e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()

	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*Endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *Endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState().connecting() {
		// When calling shutdown(2) on a connecting socket, the endpoint must
		// enter the error state. But this logic cannot belong to the shutdownLocked
		// method because that method is called during a close(2) (and closing a
		// connecting socket is not an error).
		e.handshakeFailed(&tcpip.ErrConnectionReset{})
		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		return nil
	}

	return e.shutdownLocked(flags)
}

// shutdownLocked performs the shutdown for a connected or listening endpoint;
// flags accumulate across calls via e.shutdownFlags.
// +checklocks:e.mu
func (e *Endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that maybe waiting for the stream to become
			// readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, write end won't close and
				// we end up with a half-closed connection
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that maybe waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *Endpoint) Listen(backlog int) tcpip.Error {
	if err := e.listen(backlog); err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
		return err
	}
	return nil
}

// listen transitions the endpoint to the listen state, binding it first if
// necessary, and registers it with the demuxer. Calling listen on an
// already-listening endpoint only adjusts the backlog.
func (e *Endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
		}

		// Re-listening clears any prior shutdown state.
		e.shutdownFlags = 0
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{})
	}
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// addr if not-nil will contain the peer address of the returned endpoint.
func (e *Endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueMu.Lock()
	rcvClosed := e.RcvClosed
	e.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *Endpoint
	e.acceptMu.Lock()
	if element := e.acceptQueue.endpoints.Front(); element != nil {
		n = e.acceptQueue.endpoints.Remove(element).(*Endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
func (e *Endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

// bindLocked reserves a local port (and optionally address) for the endpoint
// and moves it to the bound state.
// +checklocks:e.mu
func (e *Endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *Endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.LocalAddress,
		Port: e.TransportEndpointInfo.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

// getRemoteAddress returns the peer address from the endpoint ID without any
// state checks or locking; callers are responsible for both.
func (e *Endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

// HandlePacket implements stack.TransportEndpoint.HandlePacket.
func (*Endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher which then can either deliver using the
	// worker go routine or directly do the invoke the tcp processing inline
	// based on the state of the endpoint.
}

// enqueueSegment queues s for processing, returning false (and incrementing
// drop statistics) if the segment queue is full.
func (e *Endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

// onICMPError records err as the endpoint's last error, optionally queues a
// socket error for SO_ERROR-style delivery, and aborts an in-progress
// connection attempt.
func (e *Endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	var recvErr bool
	switch pkt.NetworkProtocolNumber {
	case header.IPv4ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv4RecvError()
	case header.IPv6ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv6RecvError()
	default:
		panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber))
	}

	if recvErr {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	if e.EndpointState().connecting() {
		// Abort the in-flight handshake: detach from any listener's
		// pending set, record the hard error and move to the error state.
		e.mu.Lock()
		if lEP := e.h.listenEP; lEP != nil {
			// Remove from listening endpoints pending list.
			lEP.acceptMu.Lock()
			delete(lEP.acceptQueue.pendingEndpoints, e)
			lEP.acceptMu.Unlock()
			lEP.stats.FailedConnectionAttempts.Increment()
		}
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.cleanupLocked()
		e.hardError = err
		e.setEndpointState(StateError)
		e.mu.Unlock()
		e.drainClosingSegmentQueue()
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
	}
}

// HandleError implements stack.TransportEndpoint.
2907 func (e *Endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) { 2908 handlePacketTooBig := func(mtu uint32) { 2909 e.sndQueueInfo.sndQueueMu.Lock() 2910 update := false 2911 if v := int(mtu); v < e.sndQueueInfo.SndMTU { 2912 e.sndQueueInfo.SndMTU = v 2913 update = true 2914 } 2915 newMTU := e.sndQueueInfo.SndMTU 2916 e.sndQueueInfo.sndQueueMu.Unlock() 2917 if update { 2918 e.mu.Lock() 2919 defer e.mu.Unlock() 2920 if e.snd != nil { 2921 e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu 2922 } 2923 } 2924 } 2925 2926 // TODO(gvisor.dev/issues/5270): Handle all transport errors. 2927 switch transErr.Kind() { 2928 case stack.PacketTooBigTransportError: 2929 handlePacketTooBig(transErr.Info()) 2930 case stack.DestinationHostUnreachableTransportError: 2931 e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt) 2932 case stack.DestinationNetworkUnreachableTransportError: 2933 e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) 2934 case stack.DestinationPortUnreachableTransportError: 2935 e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) 2936 case stack.DestinationProtoUnreachableTransportError: 2937 e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt) 2938 case stack.SourceRouteFailedTransportError: 2939 e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt) 2940 case stack.SourceHostIsolatedTransportError: 2941 e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt) 2942 case stack.DestinationHostDownTransportError: 2943 e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt) 2944 } 2945 } 2946 2947 // updateSndBufferUsage is called by the protocol goroutine when room opens up 2948 // in the send buffer. The number of newly available bytes is v. 
func (e *Endpoint) updateSndBufferUsage(v int) {
	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	// A "full buffer" event occurred if at least half of the send buffer
	// was in use before these v bytes were released.
	notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1
	e.sndQueueInfo.SndBufUsed -= v

	// Get the new send buffer size with auto tuning, but do not set it
	// unless we decide to notify the writers.
	newSndBufSz := e.computeTCPSendBufferSize()

	// We only notify when there is half the sendBufferSize available after
	// a full buffer event occurs. This ensures that we don't wake up
	// writers to queue just 1-2 segments and go back to sleep.
	notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		// Set the new send buffer size calculated from auto tuning.
		e.ops.SetSendBufferSize(newSndBufSz, false /* notify */)
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
//
// +checklocks:e.mu
func (e *Endpoint) readyToRead(s *segment) {
	e.rcvQueueMu.Lock()
	if s != nil {
		// Account for the payload and take a reference before queuing
		// the segment for the reader.
		e.RcvBufUsed += s.payloadSize()
		s.IncRef()
		e.rcvQueue.PushBack(s)
	} else {
		e.RcvClosed = true
	}
	e.rcvQueueMu.Unlock()
	e.waiterQueue.Notify(waiter.ReadableEvents)
}

// receiveBufferAvailableLocked calculates how many bytes are still available
// in the receive buffer.
// +checklocks:e.rcvQueueMu
func (e *Endpoint) receiveBufferAvailableLocked(rcvBufSize int) int {
	// We may use more bytes than the buffer size when the receive buffer
	// shrinks.
	memUsed := e.receiveMemUsed()
	if memUsed >= rcvBufSize {
		return 0
	}

	return rcvBufSize - memUsed
}

// receiveBufferAvailable calculates how many bytes are still available in the
// receive buffer based on the actual memory used by all segments held in
// receive buffer/pending and segment queue.
func (e *Endpoint) receiveBufferAvailable() int {
	e.rcvQueueMu.Lock()
	available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return available
}

// receiveBufferUsed returns the amount of in-use receive buffer.
func (e *Endpoint) receiveBufferUsed() int {
	e.rcvQueueMu.Lock()
	used := e.RcvBufUsed
	e.rcvQueueMu.Unlock()
	return used
}

// receiveMemUsed returns the total memory in use by segments held by this
// endpoint.
func (e *Endpoint) receiveMemUsed() int {
	return int(e.rcvMemUsed.Load())
}

// updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
func (e *Endpoint) updateReceiveMemUsed(delta int) {
	e.rcvMemUsed.Add(int32(delta))
}

// maxReceiveBufferSize returns the stack wide maximum receive buffer size for
// an endpoint.
func (e *Endpoint) maxReceiveBufferSize() int {
	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
		// As a fallback return the hardcoded max buffer size.
		return MaxBufferSize
	}
	return rs.Max
}

// connDirectionState returns the close state of send and receive part of the endpoint
func (e *Endpoint) connDirectionState() connDirectionState {
	return connDirectionState(e.connectionDirectionState.Load())
}

// updateConnDirectionState updates the close state of send and receive part of the endpoint
func (e *Endpoint) updateConnDirectionState(state connDirectionState) connDirectionState {
	// NOTE(review): the Load (inside connDirectionState()) and the Swap are
	// two separate atomic operations, not a single atomic read-modify-write;
	// a concurrent caller could have its bits overwritten. Confirm callers
	// serialize these updates (e.g. under e.mu).
	return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state)))
}

// rcvWndScaleForHandshake computes the receive window scale to offer to the
// peer when window scaling is enabled (true by default). If auto-tuning is
// disabled then the window scaling factor is based on the size of the
// receiveBuffer otherwise we use the max permissible receive buffer size to
// compute the scale.
func (e *Endpoint) rcvWndScaleForHandshake() int {
	bufSizeForScale := e.ops.GetReceiveBufferSize()

	e.rcvQueueMu.Lock()
	autoTuningDisabled := e.RcvAutoParams.Disabled
	e.rcvQueueMu.Unlock()
	if autoTuningDisabled {
		return FindWndScale(seqnum.Size(bufSizeForScale))
	}

	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
}

// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *Endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
	// Only advance the recent timestamp when timestamps are in use, tsVal
	// moves forward, and the segment does not start beyond the last ACK we
	// sent (per RFC 7323 recency checks).
	if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
		e.setRecentTimestamp(tsVal)
	}
}

// maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
// the SYN options indicate that timestamp option was negotiated.
It also
// initializes the recentTS with the value provided in synOpts.TSval.
func (e *Endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) {
	if synOpts.TS {
		e.SendTSOk = true
		e.setRecentTimestamp(synOpts.TSVal)
	}
}

// tsVal returns the timestamp value (TSVal) to place in outgoing segments for
// the given instant, offset by the endpoint's TSOffset.
func (e *Endpoint) tsVal(now tcpip.MonotonicTime) uint32 {
	return e.TSOffset.TSVal(now)
}

// tsValNow returns the timestamp value for the current monotonic time.
func (e *Endpoint) tsValNow() uint32 {
	return e.tsVal(e.stack.Clock().NowMonotonic())
}

// elapsed returns the time elapsed between now and the instant encoded in the
// echoed timestamp tsEcr.
func (e *Endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration {
	return e.TSOffset.Elapsed(now, tsEcr)
}

// maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
// if the SYN options indicate that the SACK option was negotiated and the TCP
// stack is configured to enable TCP SACK option.
func (e *Endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) {
	var v tcpip.TCPSACKEnabled
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
		// Stack doesn't support SACK. So just return.
		return
	}
	if bool(v) && synOpts.SACKPermitted {
		e.SACKPermitted = true
		// Best-effort read of the configured recovery option; the error is
		// deliberately ignored since SACK has already been enabled above.
		e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)
	}
}

// maxOptionSize return the maximum size of TCP options.
func (e *Endpoint) maxOptionSize() (size int) {
	// Build options with the worst case (maximum number of SACK blocks)
	// and measure the encoded length.
	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
	options := e.makeOptions(maxSackBlocks[:])
	size = len(options)
	putOptions(options)

	return size
}

// completeStateLocked makes a full copy of the endpoint state into s. This is
// used before invoking the probe.
//
// +checklocks:e.mu
func (e *Endpoint) completeStateLocked(s *stack.TCPEndpointState) {
	s.TCPEndpointStateInner = e.TCPEndpointStateInner
	s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID)
	s.SegTime = e.stack.Clock().NowMonotonic()
	s.Receiver = e.rcv.TCPReceiverState
	s.Sender = e.snd.TCPSenderState

	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	e.sndQueueInfo.CloneState(&s.SndBufState)
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueMu.Lock()
	s.RcvBufState = e.TCPRcvBufState
	e.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// RTT state is guarded by its own lock.
	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	s.Sender.RetransmitTS = e.snd.retransmitTS
	s.Sender.SpuriousRecovery = e.snd.spuriousRecovery
}

// initHostGSO configures the endpoint's GSO state for host (hardware) GSO
// based on the route's network protocol.
func (e *Endpoint) initHostGSO() {
	switch e.route.NetProto() {
	case header.IPv4ProtocolNumber:
		e.gso.Type = stack.GSOTCPv4
		e.gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		e.gso.Type = stack.GSOTCPv6
		e.gso.L3HdrLen = header.IPv6MinimumSize
	default:
		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
	}
	e.gso.NeedsCsum = true
	e.gso.CsumOffset = header.TCPChecksumOffset
	e.gso.MaxSize = e.route.GSOMaxSize()
}

// initGSO selects host GSO when the route supports it and otherwise falls
// back to gVisor GSO if available; with neither capability GSO stays unset.
func (e *Endpoint) initGSO() {
	if e.route.HasHostGSOCapability() {
		e.initHostGSO()
	} else if e.route.HasGvisorGSOCapability() {
		e.gso = stack.GSO{
			MaxSize:   e.route.GSOMaxSize(),
			Type:      stack.GSOGvisor,
			NeedsCsum: false,
		}
	}
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *Endpoint) State() uint32 {
	return uint32(e.EndpointState())
}

// Info returns a copy of the endpoint info.
func (e *Endpoint) Info() tcpip.EndpointInfo {
	e.LockUser()
	// Make a copy of the endpoint info.
	ret := e.TransportEndpointInfo
	e.UnlockUser()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *Endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}

// Wait implements stack.TransportEndpoint.Wait.
func (e *Endpoint) Wait() {
	// Register for the hang-up event before checking state so that a state
	// transition between the check and the wait is not missed.
	waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp)
	e.waiterQueue.EventRegister(&waitEntry)
	defer e.waiterQueue.EventUnregister(&waitEntry)
	switch e.EndpointState() {
	case StateClose, StateError:
		return
	}
	<-notifyCh
}

// SocketOptions implements tcpip.Endpoint.SocketOptions.
func (e *Endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}

// GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
func GetTCPSendBufferLimits(sh tcpip.StackHandler) tcpip.SendBufferSizeOption {
	// This type assertion is safe because only the TCP stack calls this
	// function.
	ss := sh.(*stack.Stack).TCPSendBufferLimits()
	return tcpip.SendBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
3247 func (e *Endpoint) allowOutOfWindowAck() bool { 3248 now := e.stack.Clock().NowMonotonic() 3249 3250 if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) { 3251 var limit stack.TCPInvalidRateLimitOption 3252 if err := e.stack.Option(&limit); err != nil { 3253 panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) 3254 } 3255 if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { 3256 return false 3257 } 3258 } 3259 3260 e.lastOutOfWindowAckTime = now 3261 return true 3262 } 3263 3264 // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP. 3265 func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { 3266 var ss tcpip.TCPReceiveBufferSizeRangeOption 3267 if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { 3268 panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) 3269 } 3270 3271 return tcpip.ReceiveBufferSizeOption{ 3272 Min: ss.Min, 3273 Default: ss.Default, 3274 Max: ss.Max, 3275 } 3276 } 3277 3278 // computeTCPSendBufferSize implements auto tuning of send buffer size and 3279 // returns the new send buffer size. 3280 func (e *Endpoint) computeTCPSendBufferSize() int64 { 3281 curSndBufSz := int64(e.getSendBufferSize()) 3282 3283 // Auto tuning is disabled when the user explicitly sets the send 3284 // buffer size with SO_SNDBUF option. 3285 if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 { 3286 return curSndBufSz 3287 } 3288 3289 const packetOverheadFactor = 2 3290 curMSS := e.snd.MaxPayloadSize 3291 numSeg := InitialCwnd 3292 if numSeg < e.snd.SndCwnd { 3293 numSeg = e.snd.SndCwnd 3294 } 3295 3296 // SndCwnd indicates the number of segments that can be sent. This means 3297 // that the sender can send upto #SndCwnd segments and the send buffer 3298 // size should be set to SndCwnd*MSS to accommodate sending of all the 3299 // segments. 
3300 newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor) 3301 if newSndBufSz < curSndBufSz { 3302 return curSndBufSz 3303 } 3304 if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz { 3305 newSndBufSz = int64(ss.Max) 3306 } 3307 3308 return newSndBufSz 3309 } 3310 3311 // GetAcceptConn implements tcpip.SocketOptionsHandler. 3312 func (e *Endpoint) GetAcceptConn() bool { 3313 return EndpointState(e.State()) == StateListen 3314 }