github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/tcpip/transport/tcp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "container/heap" 19 "encoding/binary" 20 "fmt" 21 "io" 22 "math" 23 "runtime" 24 "strings" 25 "time" 26 27 "github.com/MerlinKodo/gvisor/pkg/atomicbitops" 28 "github.com/MerlinKodo/gvisor/pkg/buffer" 29 "github.com/MerlinKodo/gvisor/pkg/sleep" 30 "github.com/MerlinKodo/gvisor/pkg/sync" 31 "github.com/MerlinKodo/gvisor/pkg/tcpip" 32 "github.com/MerlinKodo/gvisor/pkg/tcpip/hash/jenkins" 33 "github.com/MerlinKodo/gvisor/pkg/tcpip/header" 34 "github.com/MerlinKodo/gvisor/pkg/tcpip/ports" 35 "github.com/MerlinKodo/gvisor/pkg/tcpip/seqnum" 36 "github.com/MerlinKodo/gvisor/pkg/tcpip/stack" 37 "github.com/MerlinKodo/gvisor/pkg/waiter" 38 ) 39 40 // EndpointState represents the state of a TCP endpoint. 41 type EndpointState tcpip.EndpointState 42 43 // Endpoint states. Note that are represented in a netstack-specific manner and 44 // may not be meaningful externally. Specifically, they need to be translated to 45 // Linux's representation for these states if presented to userspace. 
46 const ( 47 _ EndpointState = iota 48 // TCP protocol states in sync with the definitions in 49 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 50 StateEstablished 51 StateSynSent 52 StateSynRecv 53 StateFinWait1 54 StateFinWait2 55 StateTimeWait 56 StateClose 57 StateCloseWait 58 StateLastAck 59 StateListen 60 StateClosing 61 62 // Endpoint states internal to netstack. 63 StateInitial 64 StateBound 65 StateConnecting // Connect() called, but the initial SYN hasn't been sent. 66 StateError 67 ) 68 69 const ( 70 // rcvAdvWndScale is used to split the available socket buffer into 71 // application buffer and the window to be advertised to the peer. This is 72 // currently hard coded to split the available space equally. 73 rcvAdvWndScale = 1 74 75 // SegOverheadFactor is used to multiply the value provided by the 76 // user on a SetSockOpt for setting the socket send/receive buffer sizes. 77 SegOverheadFactor = 2 78 ) 79 80 type connDirectionState uint32 81 82 // Connection direction states used for directionState checks in endpoint struct 83 // to detect half-closed connection and deliver POLLRDHUP 84 const ( 85 connDirectionStateOpen connDirectionState = 0 86 connDirectionStateRcvClosed connDirectionState = 1 87 connDirectionStateSndClosed connDirectionState = 2 88 connDirectionStateAll connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed 89 ) 90 91 // connected returns true when s is one of the states representing an 92 // endpoint connected to a peer. 93 func (s EndpointState) connected() bool { 94 switch s { 95 case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: 96 return true 97 default: 98 return false 99 } 100 } 101 102 // connecting returns true when s is one of the states representing a 103 // connection in progress, but not yet fully established. 
104 func (s EndpointState) connecting() bool { 105 switch s { 106 case StateConnecting, StateSynSent, StateSynRecv: 107 return true 108 default: 109 return false 110 } 111 } 112 113 // internal returns true when the state is netstack internal. 114 func (s EndpointState) internal() bool { 115 switch s { 116 case StateInitial, StateBound, StateConnecting, StateError: 117 return true 118 default: 119 return false 120 } 121 } 122 123 // handshake returns true when s is one of the states representing an endpoint 124 // in the middle of a TCP handshake. 125 func (s EndpointState) handshake() bool { 126 switch s { 127 case StateSynSent, StateSynRecv: 128 return true 129 default: 130 return false 131 } 132 } 133 134 // closed returns true when s is one of the states an endpoint transitions to 135 // when closed or when it encounters an error. This is distinct from a newly 136 // initialized endpoint that was never connected. 137 func (s EndpointState) closed() bool { 138 switch s { 139 case StateClose, StateError: 140 return true 141 default: 142 return false 143 } 144 } 145 146 // String implements fmt.Stringer.String. 147 func (s EndpointState) String() string { 148 switch s { 149 case StateInitial: 150 return "INITIAL" 151 case StateBound: 152 return "BOUND" 153 case StateConnecting: 154 return "CONNECTING" 155 case StateError: 156 return "ERROR" 157 case StateEstablished: 158 return "ESTABLISHED" 159 case StateSynSent: 160 return "SYN-SENT" 161 case StateSynRecv: 162 return "SYN-RCVD" 163 case StateFinWait1: 164 return "FIN-WAIT1" 165 case StateFinWait2: 166 return "FIN-WAIT2" 167 case StateTimeWait: 168 return "TIME-WAIT" 169 case StateClose: 170 return "CLOSED" 171 case StateCloseWait: 172 return "CLOSE-WAIT" 173 case StateLastAck: 174 return "LAST-ACK" 175 case StateListen: 176 return "LISTEN" 177 case StateClosing: 178 return "CLOSING" 179 default: 180 panic("unreachable") 181 } 182 } 183 184 // SACKInfo holds TCP SACK related information for a given endpoint. 
185 // 186 // +stateify savable 187 type SACKInfo struct { 188 // Blocks is the maximum number of SACK blocks we track 189 // per endpoint. 190 Blocks [MaxSACKBlocks]header.SACKBlock 191 192 // NumBlocks is the number of valid SACK blocks stored in the 193 // blocks array above. 194 NumBlocks int 195 } 196 197 // ReceiveErrors collect segment receive errors within transport layer. 198 // 199 // +stateify savable 200 type ReceiveErrors struct { 201 tcpip.ReceiveErrors 202 203 // SegmentQueueDropped is the number of segments dropped due to 204 // a full segment queue. 205 SegmentQueueDropped tcpip.StatCounter 206 207 // ChecksumErrors is the number of segments dropped due to bad checksums. 208 ChecksumErrors tcpip.StatCounter 209 210 // ListenOverflowSynDrop is the number of times the listen queue overflowed 211 // and a SYN was dropped. 212 ListenOverflowSynDrop tcpip.StatCounter 213 214 // ListenOverflowAckDrop is the number of times the final ACK 215 // in the handshake was dropped due to overflow. 216 ListenOverflowAckDrop tcpip.StatCounter 217 218 // ZeroRcvWindowState is the number of times we advertised 219 // a zero receive window when rcvQueue is full. 220 ZeroRcvWindowState tcpip.StatCounter 221 222 // WantZeroWindow is the number of times we wanted to advertise a 223 // zero receive window but couldn't because it would have caused 224 // the receive window's right edge to shrink. 225 WantZeroRcvWindow tcpip.StatCounter 226 } 227 228 // SendErrors collect segment send errors within the transport layer. 229 // 230 // +stateify savable 231 type SendErrors struct { 232 tcpip.SendErrors 233 234 // SegmentSendToNetworkFailed is the number of TCP segments failed to be sent 235 // to the network endpoint. 236 SegmentSendToNetworkFailed tcpip.StatCounter 237 238 // SynSendToNetworkFailed is the number of TCP SYNs failed to be sent 239 // to the network endpoint. 
240 SynSendToNetworkFailed tcpip.StatCounter 241 242 // Retransmits is the number of TCP segments retransmitted. 243 Retransmits tcpip.StatCounter 244 245 // FastRetransmit is the number of segments retransmitted in fast 246 // recovery. 247 FastRetransmit tcpip.StatCounter 248 249 // Timeouts is the number of times the RTO expired. 250 Timeouts tcpip.StatCounter 251 } 252 253 // Stats holds statistics about the endpoint. 254 // 255 // +stateify savable 256 type Stats struct { 257 // SegmentsReceived is the number of TCP segments received that 258 // the transport layer successfully parsed. 259 SegmentsReceived tcpip.StatCounter 260 261 // SegmentsSent is the number of TCP segments sent. 262 SegmentsSent tcpip.StatCounter 263 264 // FailedConnectionAttempts is the number of times we saw Connect and 265 // Accept errors. 266 FailedConnectionAttempts tcpip.StatCounter 267 268 // ReceiveErrors collects segment receive errors within the 269 // transport layer. 270 ReceiveErrors ReceiveErrors 271 272 // ReadErrors collects segment read errors from an endpoint read call. 273 ReadErrors tcpip.ReadErrors 274 275 // SendErrors collects segment send errors within the transport layer. 276 SendErrors SendErrors 277 278 // WriteErrors collects segment write errors from an endpoint write call. 279 WriteErrors tcpip.WriteErrors 280 } 281 282 // IsEndpointStats is an empty method to implement the tcpip.EndpointStats 283 // marker interface. 284 func (*Stats) IsEndpointStats() {} 285 286 // sndQueueInfo implements a send queue. 287 // 288 // +stateify savable 289 type sndQueueInfo struct { 290 sndQueueMu sync.Mutex `state:"nosave"` 291 stack.TCPSndBufState 292 293 // sndWaker is used to signal the protocol goroutine when there may be 294 // segments that need to be sent. 295 sndWaker sleep.Waker `state:"manual"` 296 } 297 298 // CloneState clones sq into other. 
It is not thread safe 299 func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) { 300 other.SndBufSize = sq.SndBufSize 301 other.SndBufUsed = sq.SndBufUsed 302 other.SndClosed = sq.SndClosed 303 other.PacketTooBigCount = sq.PacketTooBigCount 304 other.SndMTU = sq.SndMTU 305 other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad()) 306 } 307 308 // endpoint represents a TCP endpoint. This struct serves as the interface 309 // between users of the endpoint and the protocol implementation; it is legal to 310 // have concurrent goroutines make calls into the endpoint, they are properly 311 // synchronized. The protocol implementation, however, runs in a single 312 // goroutine. 313 // 314 // Each endpoint has a few mutexes: 315 // 316 // e.mu -> Primary mutex for an endpoint must be held for all operations except 317 // in e.Readiness where acquiring it will result in a deadlock in epoll 318 // implementation. 319 // 320 // The following three mutexes can be acquired independent of e.mu but if 321 // acquired with e.mu then e.mu must be acquired first. 322 // 323 // e.acceptMu -> Protects e.acceptQueue. 324 // e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue 325 // itself. 326 // e.sndQueueMu -> Protects the e.sndQueue and associated fields. 327 // e.lastErrorMu -> Protects the lastError field. 328 // 329 // LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different 330 // based on the context in which the lock is acquired. In the syscall context 331 // e.LockUser/e.UnlockUser should be used and when doing background processing 332 // e.mu.Lock/e.mu.Unlock should be used. The distinction is described below 333 // in brief. 334 // 335 // The reason for this locking behaviour is to avoid wakeups to handle packets. 
336 // In cases where the endpoint is already locked the background processor can 337 // queue the packet up and go its merry way and the lock owner will eventually 338 // process the backlog when releasing the lock. Similarly when acquiring the 339 // lock from say a syscall goroutine we can implement a bit of spinning if we 340 // know that the lock is not held by another syscall goroutine. Background 341 // processors should never hold the lock for long and we can avoid an expensive 342 // sleep/wakeup by spinning for a shortwhile. 343 // 344 // For more details please see the detailed documentation on 345 // e.LockUser/e.UnlockUser methods. 346 // 347 // +stateify savable 348 type endpoint struct { 349 stack.TCPEndpointStateInner 350 stack.TransportEndpointInfo 351 tcpip.DefaultSocketOptionsHandler 352 353 // endpointEntry is used to queue endpoints for processing to the 354 // a given tcp processor goroutine. 355 // 356 // Precondition: epQueue.mu must be held to read/write this field.. 357 endpointEntry `state:"nosave"` 358 359 // pendingProcessingMu protects pendingProcessing. 360 pendingProcessingMu sync.Mutex `state:"nosave"` 361 362 // pendingProcessing is true if this endpoint is queued for processing 363 // to a TCP processor. 364 // +checklocks:pendingProcessingMu 365 pendingProcessing bool `state:"nosave"` 366 367 // The following fields are initialized at creation time and do not 368 // change throughout the lifetime of the endpoint. 369 stack *stack.Stack `state:"manual"` 370 protocol *protocol `state:"manual"` 371 waiterQueue *waiter.Queue `state:"wait"` 372 uniqueID uint64 373 374 // hardError is meaningful only when state is stateError. It stores the 375 // error to be returned when read/write syscalls are called and the 376 // endpoint is in this state. hardError is protected by endpoint mu. 
377 hardError tcpip.Error 378 379 // lastError represents the last error that the endpoint reported; 380 // access to it is protected by the following mutex. 381 lastErrorMu sync.Mutex `state:"nosave"` 382 lastError tcpip.Error 383 384 rcvQueueMu sync.Mutex `state:"nosave"` 385 386 // +checklocks:rcvQueueMu 387 stack.TCPRcvBufState 388 389 // rcvMemUsed tracks the total amount of memory in use by received segments 390 // held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to 391 // compute the window and the actual available buffer space. This is distinct 392 // from rcvBufUsed above which is the actual number of payload bytes held in 393 // the buffer not including any segment overheads. 394 rcvMemUsed atomicbitops.Int32 395 396 // mu protects all endpoint fields unless documented otherwise. mu must 397 // be acquired before interacting with the endpoint fields. 398 // 399 // During handshake, mu is locked by the protocol listen goroutine and 400 // released by the handshake completion goroutine. 401 mu sync.CrossGoroutineMutex `state:"nosave"` 402 ownedByUser atomicbitops.Uint32 403 404 // rcvQueue is the queue for ready-for-delivery segments. 405 // 406 // +checklocks:mu 407 rcvQueue segmentList `state:"wait"` 408 409 // state must be read/set using the EndpointState()/setEndpointState() 410 // methods. 411 state atomicbitops.Uint32 `state:".(EndpointState)"` 412 413 // connectionDirectionState holds current state of send and receive, 414 // accessed atomically 415 connectionDirectionState atomicbitops.Uint32 416 417 // origEndpointState is only used during a restore phase to save the 418 // endpoint state at restore time as the socket is moved to it's correct 419 // state. 
420 origEndpointState uint32 `state:"nosave"` 421 422 isPortReserved bool `state:"manual"` 423 isRegistered bool `state:"manual"` 424 boundNICID tcpip.NICID 425 route *stack.Route `state:"manual"` 426 ipv4TTL uint8 427 ipv6HopLimit int16 428 isConnectNotified bool 429 430 // h stores a reference to the current handshake state if the endpoint is in 431 // the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep. 432 // nil otherwise. 433 // +checklocks:mu 434 h *handshake 435 436 // portFlags stores the current values of port related flags. 437 portFlags ports.Flags 438 439 // Values used to reserve a port or register a transport endpoint 440 // (which ever happens first). 441 boundBindToDevice tcpip.NICID 442 boundPortFlags ports.Flags 443 boundDest tcpip.FullAddress 444 445 // effectiveNetProtos contains the network protocols actually in use. In 446 // most cases it will only contain "netProto", but in cases like IPv6 447 // endpoints with v6only set to false, this could include multiple 448 // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., 449 // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped 450 // address). 451 effectiveNetProtos []tcpip.NetworkProtocolNumber 452 453 // recentTSTime is the unix time when we last updated 454 // TCPEndpointStateInner.RecentTS. 455 recentTSTime tcpip.MonotonicTime 456 457 // shutdownFlags represent the current shutdown state of the endpoint. 458 shutdownFlags tcpip.ShutdownFlags 459 460 // tcpRecovery is the loss recovery algorithm used by TCP. 461 tcpRecovery tcpip.TCPRecovery 462 463 // sack holds TCP SACK related information for this endpoint. 464 sack SACKInfo 465 466 // delay enables Nagle's algorithm. 467 // 468 // delay is a boolean (0 is false) and must be accessed atomically. 469 delay uint32 470 471 // scoreboard holds TCP SACK Scoreboard information for this endpoint. 
472 scoreboard *SACKScoreboard 473 474 // segmentQueue is used to hand received segments to the protocol 475 // goroutine. Segments are queued as long as the queue is not full, 476 // and dropped when it is. 477 segmentQueue segmentQueue `state:"wait"` 478 479 // userMSS if non-zero is the MSS value explicitly set by the user 480 // for this endpoint using the TCP_MAXSEG setsockopt. 481 userMSS uint16 482 483 // maxSynRetries is the maximum number of SYN retransmits that TCP should 484 // send before aborting the attempt to connect. It cannot exceed 255. 485 // 486 // NOTE: This is currently a no-op and does not change the SYN 487 // retransmissions. 488 maxSynRetries uint8 489 490 // windowClamp is used to bound the size of the advertised window to 491 // this value. 492 windowClamp uint32 493 494 // sndQueueInfo contains the implementation of the endpoint's send queue. 495 sndQueueInfo sndQueueInfo 496 497 // cc stores the name of the Congestion Control algorithm to use for 498 // this endpoint. 499 cc tcpip.CongestionControlOption 500 501 // keepalive manages TCP keepalive state. When the connection is idle 502 // (no data sent or received) for keepaliveIdle, we start sending 503 // keepalives every keepalive.interval. If we send keepalive.count 504 // without hearing a response, the connection is closed. 505 keepalive keepalive 506 507 // userTimeout if non-zero specifies a user specified timeout for 508 // a connection w/ pending data to send. A connection that has pending 509 // unacked data will be forcibily aborted if the timeout is reached 510 // without any data being acked. 511 userTimeout time.Duration 512 513 // deferAccept if non-zero specifies a user specified time during 514 // which the final ACK of a handshake will be dropped provided the 515 // ACK is a bare ACK and carries no data. If the timeout is crossed then 516 // the bare ACK is accepted and the connection is delivered to the 517 // listener. 
518 deferAccept time.Duration 519 520 // acceptMu protects accepQueue 521 acceptMu sync.Mutex `state:"nosave"` 522 523 // acceptQueue is used by a listening endpoint to send newly accepted 524 // connections to the endpoint so that they can be read by Accept() 525 // calls. 526 // 527 // +checklocks:acceptMu 528 acceptQueue acceptQueue 529 530 // The following are only used from the protocol goroutine, and 531 // therefore don't need locks to protect them. 532 rcv *receiver `state:"wait"` 533 snd *sender `state:"wait"` 534 535 // The goroutine drain completion notification channel. 536 drainDone chan struct{} `state:"nosave"` 537 538 // The goroutine undrain notification channel. This is currently used as 539 // a way to block the worker goroutines. Today nothing closes/writes 540 // this channel and this causes any goroutines waiting on this to just 541 // block. This is used during save/restore to prevent worker goroutines 542 // from mutating state as it's being saved. 543 undrain chan struct{} `state:"nosave"` 544 545 // probe if not nil is invoked on every received segment. It is passed 546 // a copy of the current state of the endpoint. 547 probe stack.TCPProbeFunc `state:"nosave"` 548 549 // The following are only used to assist the restore run to re-connect. 550 connectingAddress tcpip.Address 551 552 // amss is the advertised MSS to the peer by this endpoint. 553 amss uint16 554 555 // sendTOS represents IPv4 TOS or IPv6 TrafficClass, 556 // applied while sending packets. Defaults to 0 as on Linux. 557 sendTOS uint8 558 559 gso stack.GSO 560 561 stats Stats 562 563 // tcpLingerTimeout is the maximum amount of a time a socket 564 // a socket stays in TIME_WAIT state before being marked 565 // closed. 566 tcpLingerTimeout time.Duration 567 568 // closed indicates that the user has called closed on the 569 // endpoint and at this point the endpoint is only around 570 // to complete the TCP shutdown. 
571 closed bool 572 573 // txHash is the transport layer hash to be set on outbound packets 574 // emitted by this endpoint. 575 txHash uint32 576 577 // owner is used to get uid and gid of the packet. 578 owner tcpip.PacketOwner 579 580 // ops is used to get socket level options. 581 ops tcpip.SocketOptions 582 583 // lastOutOfWindowAckTime is the time at which the an ACK was sent in response 584 // to an out of window segment being received by this endpoint. 585 lastOutOfWindowAckTime tcpip.MonotonicTime 586 587 // finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer 588 // is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close() 589 // has been called on the socket. This timer is not started for sockets that 590 // are waiting for a peer FIN but are not closed. 591 finWait2Timer tcpip.Timer `state:"nosave"` 592 593 // timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state 594 // for tcp.DefaultTCPTimeWaitTimeout seconds. 595 timeWaitTimer tcpip.Timer `state:"nosave"` 596 597 // listenCtx is used by listening endpoints to store state used while listening for 598 // connections. Nil otherwise. 599 listenCtx *listenContext `state:"nosave"` 600 } 601 602 // UniqueID implements stack.TransportEndpoint.UniqueID. 603 func (e *endpoint) UniqueID() uint64 { 604 return e.uniqueID 605 } 606 607 // calculateAdvertisedMSS calculates the MSS to advertise. 608 // 609 // If userMSS is non-zero and is not greater than the maximum possible MSS for 610 // r, it will be used; otherwise, the maximum possible MSS will be used. 611 func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { 612 // The maximum possible MSS is dependent on the route. 613 // TODO(b/143359391): Respect TCP Min and Max size. 
614 maxMSS := uint16(r.MTU() - header.TCPMinimumSize) 615 616 if userMSS != 0 && userMSS < maxMSS { 617 return userMSS 618 } 619 620 return maxMSS 621 } 622 623 // isOwnedByUser() returns true if the endpoint lock is currently 624 // held by a user(syscall) goroutine. 625 func (e *endpoint) isOwnedByUser() bool { 626 return e.ownedByUser.Load() == 1 627 } 628 629 // LockUser tries to lock e.mu and if it fails it will check if the lock is held 630 // by another syscall goroutine. If yes, then it will goto sleep waiting for the 631 // lock to be released, if not then it will spin till it acquires the lock or 632 // another syscall goroutine acquires it in which case it will goto sleep as 633 // described above. 634 // 635 // The assumption behind spinning here being that background packet processing 636 // should not be holding the lock for long and spinning reduces latency as we 637 // avoid an expensive sleep/wakeup of the syscall goroutine). 638 // +checklocksacquire:e.mu 639 func (e *endpoint) LockUser() { 640 const iterations = 5 641 for i := 0; i < iterations; i++ { 642 // Try first if the sock is locked then check if it's owned 643 // by another user goroutine if not then we spin, otherwise 644 // we just go to sleep on the Lock() and wait. 645 if !e.TryLock() { 646 // If socket is owned by the user then just go to sleep 647 // as the lock could be held for a reasonably long time. 648 if e.ownedByUser.Load() == 1 { 649 e.mu.Lock() 650 e.ownedByUser.Store(1) 651 return 652 } 653 // Spin but don't yield the processor since the lower half 654 // should yield the lock soon. 655 continue 656 } 657 e.ownedByUser.Store(1) 658 return 659 } 660 661 for i := 0; i < iterations; i++ { 662 // Try first if the sock is locked then check if it's owned 663 // by another user goroutine if not then we spin, otherwise 664 // we just go to sleep on the Lock() and wait. 
665 if !e.TryLock() { 666 // If socket is owned by the user then just go to sleep 667 // as the lock could be held for a reasonably long time. 668 if e.ownedByUser.Load() == 1 { 669 e.mu.Lock() 670 e.ownedByUser.Store(1) 671 return 672 } 673 // Spin but yield the processor since the lower half 674 // should yield the lock soon. 675 runtime.Gosched() 676 continue 677 } 678 e.ownedByUser.Store(1) 679 return 680 } 681 682 // Finally just give up and wait for the Lock. 683 e.mu.Lock() 684 e.ownedByUser.Store(1) 685 } 686 687 // UnlockUser will check if there are any segments already queued for processing 688 // and wake up a processor goroutine to process them before unlocking e.mu. 689 // This is required because we when packets arrive and endpoint lock is already 690 // held then such packets are queued up to be processed. 691 // 692 // Precondition: e.LockUser() must have been called before calling e.UnlockUser() 693 // +checklocksrelease:e.mu 694 func (e *endpoint) UnlockUser() { 695 // Lock segment queue before checking so that we avoid a race where 696 // segments can be queued between the time we check if queue is empty 697 // and actually unlock the endpoint mutex. 698 e.segmentQueue.mu.Lock() 699 if e.segmentQueue.emptyLocked() { 700 if e.ownedByUser.Swap(0) != 1 { 701 panic("e.UnlockUser() called without calling e.LockUser()") 702 } 703 e.mu.Unlock() 704 e.segmentQueue.mu.Unlock() 705 return 706 } 707 e.segmentQueue.mu.Unlock() 708 709 // Since we are waking the processor goroutine here just unlock 710 // and let it process the queued segments. 
711 if e.ownedByUser.Swap(0) != 1 { 712 panic("e.UnlockUser() called without calling e.LockUser()") 713 } 714 processor := e.protocol.dispatcher.selectProcessor(e.ID) 715 e.mu.Unlock() 716 717 // Wake up the processor for this endpoint to process any queued 718 // segments after releasing the lock to avoid the case where if the 719 // processor goroutine starts running before we release the lock here 720 // then it will fail to process as TryLock() will fail. 721 processor.queueEndpoint(e) 722 return 723 } 724 725 // StopWork halts packet processing. Only to be used in tests. 726 // +checklocksacquire:e.mu 727 func (e *endpoint) StopWork() { 728 e.mu.Lock() 729 } 730 731 // ResumeWork resumes packet processing. Only to be used in tests. 732 // +checklocksrelease:e.mu 733 func (e *endpoint) ResumeWork() { 734 e.mu.Unlock() 735 } 736 737 // AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is 738 // used in places where we know that e.mu is held, but checklocks does not, 739 // which can happen when creating new locked objects. You must pass the known 740 // locked endpoint to this function and it must be the same as the caller 741 // endpoint. 742 // TODO(b/226403629): Remove this function once checklocks understands local 743 // variable locks. 744 // +checklocks:locked.mu 745 // +checklocksacquire:e.mu 746 func (e *endpoint) AssertLockHeld(locked *endpoint) { 747 if e != locked { 748 panic("AssertLockHeld failed: locked endpoint != asserting endpoint") 749 } 750 } 751 752 // TryLock is a helper that calls TryLock on the endpoint's mutex and 753 // adds the necessary checklocks annotations. 754 // TODO(b/226403629): Remove this once checklocks understands TryLock. 755 // +checklocksacquire:e.mu 756 func (e *endpoint) TryLock() bool { 757 if e.mu.TryLock() { 758 return true // +checklocksforce 759 } 760 return false // +checklocksignore 761 } 762 763 // setEndpointState updates the state of the endpoint to state atomically. 
This 764 // method is unexported as the only place we should update the state is in this 765 // package but we allow the state to be read freely without holding e.mu. 766 // 767 // +checklocks:e.mu 768 func (e *endpoint) setEndpointState(state EndpointState) { 769 oldstate := EndpointState(e.state.Swap(uint32(state))) 770 switch state { 771 case StateEstablished: 772 e.stack.Stats().TCP.CurrentEstablished.Increment() 773 e.stack.Stats().TCP.CurrentConnected.Increment() 774 case StateError: 775 fallthrough 776 case StateClose: 777 if oldstate == StateCloseWait || oldstate == StateEstablished { 778 e.stack.Stats().TCP.EstablishedResets.Increment() 779 } 780 if oldstate.connected() { 781 e.stack.Stats().TCP.CurrentConnected.Decrement() 782 } 783 fallthrough 784 default: 785 if oldstate == StateEstablished { 786 e.stack.Stats().TCP.CurrentEstablished.Decrement() 787 } 788 } 789 } 790 791 // EndpointState returns the current state of the endpoint. 792 func (e *endpoint) EndpointState() EndpointState { 793 return EndpointState(e.state.Load()) 794 } 795 796 // setRecentTimestamp sets the recentTS field to the provided value. 797 func (e *endpoint) setRecentTimestamp(recentTS uint32) { 798 e.RecentTS = recentTS 799 e.recentTSTime = e.stack.Clock().NowMonotonic() 800 } 801 802 // recentTimestamp returns the value of the recentTS field. 803 func (e *endpoint) recentTimestamp() uint32 { 804 return e.RecentTS 805 } 806 807 // TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a 808 // network.Endpoint, which also defines this function. 
809 func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 { 810 switch netProto := route.NetProto(); netProto { 811 case header.IPv4ProtocolNumber: 812 if ipv4TTL == tcpip.UseDefaultIPv4TTL { 813 return route.DefaultTTL() 814 } 815 return ipv4TTL 816 case header.IPv6ProtocolNumber: 817 if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit { 818 return route.DefaultTTL() 819 } 820 return uint8(ipv6HopLimit) 821 default: 822 panic(fmt.Sprintf("invalid protocol number = %d", netProto)) 823 } 824 } 825 826 // keepalive is a synchronization wrapper used to appease stateify. See the 827 // comment in endpoint, where it is used. 828 // 829 // +stateify savable 830 type keepalive struct { 831 sync.Mutex `state:"nosave"` 832 idle time.Duration 833 interval time.Duration 834 count int 835 unacked int 836 // should never be a zero timer if the endpoint is not closed. 837 timer timer `state:"nosave"` 838 waker sleep.Waker `state:"nosave"` 839 } 840 841 func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { 842 e := &endpoint{ 843 stack: s, 844 protocol: protocol, 845 TransportEndpointInfo: stack.TransportEndpointInfo{ 846 NetProto: netProto, 847 TransProto: header.TCPProtocolNumber, 848 }, 849 sndQueueInfo: sndQueueInfo{ 850 TCPSndBufState: stack.TCPSndBufState{ 851 SndMTU: math.MaxInt32, 852 }, 853 }, 854 waiterQueue: waiterQueue, 855 state: atomicbitops.FromUint32(uint32(StateInitial)), 856 keepalive: keepalive{ 857 idle: DefaultKeepaliveIdle, 858 interval: DefaultKeepaliveInterval, 859 count: DefaultKeepaliveCount, 860 }, 861 uniqueID: s.UniqueID(), 862 ipv4TTL: tcpip.UseDefaultIPv4TTL, 863 ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit, 864 txHash: s.Rand().Uint32(), 865 windowClamp: DefaultReceiveBufferSize, 866 maxSynRetries: DefaultSynRetries, 867 } 868 e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits) 869 e.ops.SetMulticastLoop(true) 870 
e.ops.SetQuickAck(true) 871 e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */) 872 e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */) 873 874 var ss tcpip.TCPSendBufferSizeRangeOption 875 if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { 876 e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) 877 } 878 879 var rs tcpip.TCPReceiveBufferSizeRangeOption 880 if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 881 e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) 882 } 883 884 var cs tcpip.CongestionControlOption 885 if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil { 886 e.cc = cs 887 } 888 889 var mrb tcpip.TCPModerateReceiveBufferOption 890 if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { 891 e.RcvAutoParams.Disabled = !bool(mrb) 892 } 893 894 var de tcpip.TCPDelayEnabled 895 if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { 896 e.ops.SetDelayOption(true) 897 } 898 899 var tcpLT tcpip.TCPLingerTimeoutOption 900 if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil { 901 e.tcpLingerTimeout = time.Duration(tcpLT) 902 } 903 904 var synRetries tcpip.TCPSynRetriesOption 905 if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { 906 e.maxSynRetries = uint8(synRetries) 907 } 908 909 if p := s.GetTCPProbe(); p != nil { 910 e.probe = p 911 } 912 913 e.segmentQueue.ep = e 914 915 // TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes 916 // established. 917 e.keepalive.timer.init(e.stack.Clock(), maybeFailTimerHandler(e, e.keepaliveTimerExpired)) 918 919 return e 920 } 921 922 // Readiness returns the current readiness of the endpoint. For example, if 923 // waiter.EventIn is set, the endpoint is immediately readable. 
924 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { 925 result := waiter.EventMask(0) 926 927 switch e.EndpointState() { 928 case StateInitial, StateBound: 929 // This prevents blocking of new sockets which are not 930 // connected when SO_LINGER is set. 931 result |= waiter.EventHUp 932 933 case StateConnecting, StateSynSent, StateSynRecv: 934 // Ready for nothing. 935 936 case StateClose, StateError, StateTimeWait: 937 // Ready for anything. 938 result = mask 939 940 case StateListen: 941 // Check if there's anything in the accepted queue. 942 if (mask & waiter.ReadableEvents) != 0 { 943 e.acceptMu.Lock() 944 if e.acceptQueue.endpoints.Len() != 0 { 945 result |= waiter.ReadableEvents 946 } 947 e.acceptMu.Unlock() 948 } 949 } 950 if e.EndpointState().connected() { 951 // Determine if the endpoint is writable if requested. 952 if (mask & waiter.WritableEvents) != 0 { 953 e.sndQueueInfo.sndQueueMu.Lock() 954 sndBufSize := e.getSendBufferSize() 955 if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize { 956 result |= waiter.WritableEvents 957 } 958 if e.sndQueueInfo.SndClosed { 959 e.updateConnDirectionState(connDirectionStateSndClosed) 960 } 961 e.sndQueueInfo.sndQueueMu.Unlock() 962 } 963 964 // Determine if the endpoint is readable if requested. 965 if (mask & waiter.ReadableEvents) != 0 { 966 e.rcvQueueMu.Lock() 967 if e.RcvBufUsed > 0 || e.RcvClosed { 968 result |= waiter.ReadableEvents 969 } 970 if e.RcvClosed { 971 e.updateConnDirectionState(connDirectionStateRcvClosed) 972 } 973 e.rcvQueueMu.Unlock() 974 } 975 } 976 977 // Determine whether endpoint is half-closed with rcv shutdown 978 if e.connDirectionState() == connDirectionStateRcvClosed { 979 result |= waiter.EventRdHUp 980 } 981 982 return result 983 } 984 985 // Purging pending rcv segments is only necessary on RST. 
986 func (e *endpoint) purgePendingRcvQueue() { 987 if e.rcv != nil { 988 for e.rcv.pendingRcvdSegments.Len() > 0 { 989 s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment) 990 s.DecRef() 991 } 992 } 993 } 994 995 // +checklocks:e.mu 996 func (e *endpoint) purgeReadQueue() { 997 if e.rcv != nil { 998 e.rcvQueueMu.Lock() 999 defer e.rcvQueueMu.Unlock() 1000 for { 1001 s := e.rcvQueue.Front() 1002 if s == nil { 1003 break 1004 } 1005 e.rcvQueue.Remove(s) 1006 s.DecRef() 1007 } 1008 e.RcvBufUsed = 0 1009 } 1010 } 1011 1012 // +checklocks:e.mu 1013 func (e *endpoint) purgeWriteQueue() { 1014 if e.snd != nil { 1015 e.sndQueueInfo.sndQueueMu.Lock() 1016 defer e.sndQueueInfo.sndQueueMu.Unlock() 1017 e.snd.updateWriteNext(nil) 1018 for { 1019 s := e.snd.writeList.Front() 1020 if s == nil { 1021 break 1022 } 1023 e.snd.writeList.Remove(s) 1024 s.DecRef() 1025 } 1026 e.sndQueueInfo.SndBufUsed = 0 1027 e.sndQueueInfo.SndClosed = true 1028 } 1029 } 1030 1031 // Abort implements stack.TransportEndpoint.Abort. 1032 func (e *endpoint) Abort() { 1033 defer e.drainClosingSegmentQueue() 1034 e.LockUser() 1035 defer e.UnlockUser() 1036 defer e.purgeReadQueue() 1037 // Reset all connected endpoints. 1038 switch state := e.EndpointState(); { 1039 case state.connected(): 1040 e.resetConnectionLocked(&tcpip.ErrAborted{}) 1041 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1042 return 1043 } 1044 e.closeLocked() 1045 } 1046 1047 // Close puts the endpoint in a closed state and frees all resources associated 1048 // with it. It must be called only once and with no other concurrent calls to 1049 // the endpoint. 1050 func (e *endpoint) Close() { 1051 e.LockUser() 1052 if e.closed { 1053 e.UnlockUser() 1054 return 1055 } 1056 1057 // We always want to purge the read queue, but do so after the checks in 1058 // shutdownLocked. 
1059 e.closeLocked() 1060 e.purgeReadQueue() 1061 if e.EndpointState() == StateClose || e.EndpointState() == StateError { 1062 // It should be safe to purge the read queue now as the endpoint 1063 // is now closed or in an error state and further reads are not 1064 // permitted. 1065 e.UnlockUser() 1066 e.drainClosingSegmentQueue() 1067 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1068 return 1069 } 1070 e.UnlockUser() 1071 } 1072 1073 // +checklocks:e.mu 1074 func (e *endpoint) closeLocked() { 1075 linger := e.SocketOptions().GetLinger() 1076 if linger.Enabled && linger.Timeout == 0 { 1077 s := e.EndpointState() 1078 isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv 1079 if isResetState { 1080 // Close the endpoint without doing full shutdown and 1081 // send a RST. 1082 e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) 1083 return 1084 } 1085 } 1086 1087 // Issue a shutdown so that the peer knows we won't send any more data 1088 // if we're connected, or stop accepting if we're listening. 1089 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1090 e.closeNoShutdownLocked() 1091 } 1092 1093 // closeNoShutdown closes the endpoint without doing a full shutdown. 1094 // +checklocks:e.mu 1095 func (e *endpoint) closeNoShutdownLocked() { 1096 // For listening sockets, we always release ports inline so that they 1097 // are immediately available for reuse after Close() is called. If also 1098 // registered, we unregister as well otherwise the next user would fail 1099 // in Listen() when trying to register. 
1100 if e.EndpointState() == StateListen && e.isPortReserved { 1101 if e.isRegistered { 1102 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1103 e.isRegistered = false 1104 } 1105 1106 portRes := ports.Reservation{ 1107 Networks: e.effectiveNetProtos, 1108 Transport: ProtocolNumber, 1109 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1110 Port: e.TransportEndpointInfo.ID.LocalPort, 1111 Flags: e.boundPortFlags, 1112 BindToDevice: e.boundBindToDevice, 1113 Dest: e.boundDest, 1114 } 1115 e.stack.ReleasePort(portRes) 1116 e.isPortReserved = false 1117 e.boundBindToDevice = 0 1118 e.boundPortFlags = ports.Flags{} 1119 e.boundDest = tcpip.FullAddress{} 1120 } 1121 1122 // Mark endpoint as closed. 1123 e.closed = true 1124 tcpip.AddDanglingEndpoint(e) 1125 1126 eventMask := waiter.ReadableEvents | waiter.WritableEvents 1127 1128 switch e.EndpointState() { 1129 case StateInitial, StateBound, StateListen: 1130 e.setEndpointState(StateClose) 1131 fallthrough 1132 case StateClose, StateError: 1133 eventMask |= waiter.EventHUp 1134 e.cleanupLocked() 1135 case StateConnecting, StateSynSent, StateSynRecv: 1136 // Abort the handshake and set the error. 1137 // Notify that the endpoint is closed. 1138 eventMask |= waiter.EventHUp 1139 e.handshakeFailed(&tcpip.ErrAborted{}) 1140 // Notify that the endpoint is closed. 1141 eventMask |= waiter.EventHUp 1142 case StateFinWait2: 1143 // The socket has been closed and we are in FIN-WAIT-2 so start 1144 // the FIN-WAIT-2 timer. 1145 if e.finWait2Timer == nil { 1146 e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) 1147 } 1148 } 1149 1150 e.waiterQueue.Notify(eventMask) 1151 } 1152 1153 // closePendingAcceptableConnections closes all connections that have completed 1154 // handshake but not yet been delivered to the application. 
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
	// Snapshot and empty both accept queues under acceptMu, then abort the
	// endpoints outside the lock (Abort acquires each endpoint's own lock).
	e.acceptMu.Lock()

	pendingEndpoints := e.acceptQueue.pendingEndpoints
	e.acceptQueue.pendingEndpoints = nil

	completedEndpoints := make([]*endpoint, 0, e.acceptQueue.endpoints.Len())
	for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() {
		completedEndpoints = append(completedEndpoints, n.Value.(*endpoint))
	}
	e.acceptQueue.endpoints.Init()
	e.acceptQueue.capacity = 0
	e.acceptMu.Unlock()

	// Close any endpoints in SYN-RCVD state.
	for n := range pendingEndpoints {
		n.Abort()
	}

	// Reset all connections that are waiting to be accepted.
	for _, n := range completedEndpoints {
		n.Abort()
	}
}

// cleanupLocked frees all resources associated with the endpoint.
// +checklocks:e.mu
func (e *endpoint) cleanupLocked() {
	// Stop all timers before tearing down state so no callback fires
	// against a partially cleaned-up endpoint.
	if e.snd != nil {
		e.snd.resendTimer.cleanup()
		e.snd.probeTimer.cleanup()
		e.snd.reorderTimer.cleanup()
	}

	if e.finWait2Timer != nil {
		e.finWait2Timer.Stop()
	}

	if e.timeWaitTimer != nil {
		e.timeWaitTimer.Stop()
	}

	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.purgeWriteQueue()
	// Only purge the read queue here if the socket is fully closed by the
	// user.
	if e.closed {
		e.purgeReadQueue()
	}
	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}

// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}

// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
func (e *endpoint) initialReceiveWindow() int {
	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
	// The window field in an unscaled SYN/SYN-ACK is at most 16 bits.
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	rcvWndScale := e.rcvWndScaleForHandshake()

	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
	// window offered in SYN won't be reduced due to the loss of precision if
	// window scaling is enabled after the handshake.
	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)

	// Ensure we can always accept at least 1 byte if the scale specified
	// was too high for the provided rcvWnd.
	if rcvWnd == 0 {
		rcvWnd = 1
	}

	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	sendNonZeroWindowUpdate := false

	e.rcvQueueMu.Lock()
	if e.RcvAutoParams.Disabled {
		e.rcvQueueMu.Unlock()
		return
	}
	// Accumulate copied bytes until at least one RTT has elapsed since the
	// last measurement; adjustments are made once per RTT.
	now := e.stack.Clock().NowMonotonic()
	if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt {
		e.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows it's congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive upto 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				sendNonZeroWindowUpdate = true
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	e.RcvAutoParams.MeasureTime = now
	e.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueMu.Unlock()

	// Send the update after unlocking rcvQueueMu as sending a segment acquires
	// the lock to calculate the window to be sent.
	if e.EndpointState().connected() && sendNonZeroWindowUpdate {
		e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
	}
}

// SetOwner implements tcpip.Endpoint.SetOwner.
func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}

// hardErrorLocked reads and clears the terminal (hard) error, so it is
// reported to the caller at most once.
// +checklocks:e.mu
func (e *endpoint) hardErrorLocked() tcpip.Error {
	err := e.hardError
	e.hardError = nil
	return err
}

// lastErrorLocked reads and clears the last (soft) error, so it is reported
// to the caller at most once.
// +checklocks:e.mu
func (e *endpoint) lastErrorLocked() tcpip.Error {
	e.lastErrorMu.Lock()
	defer e.lastErrorMu.Unlock()
	err := e.lastError
	e.lastError = nil
	return err
}

// LastError implements tcpip.Endpoint.LastError. A hard error, if present,
// takes precedence over the last soft error; both are cleared on read.
func (e *endpoint) LastError() tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	if err := e.hardErrorLocked(); err != nil {
		return err
	}
	return e.lastErrorLocked()
}

// LastErrorLocked reads and clears lastError.
// Only to be used in tests.
// +checklocks:e.mu
func (e *endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}

// UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
func (e *endpoint) UpdateLastError(err tcpip.Error) {
	e.LockUser()
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()
	e.UnlockUser()
}

// Read implements tcpip.Endpoint.Read.
func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if err := e.checkReadLocked(); err != nil {
		if _, ok := err.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, err
	}

	var err error
	done := 0
	// N.B. Here we get the first segment to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold e.mu to ensure we only
	// remove segments from the list through Read() and that new segments
	// cannot be appended.
	s := e.rcvQueue.Front()
	for s != nil {
		var n int
		n, err = s.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// Peeking never removes segments; just walk the list.
			s = s.Next()
		} else {
			sendNonZeroWindowUpdate := false
			memDelta := 0
			// Drop fully-consumed segments from the front of the queue.
			for {
				seg := e.rcvQueue.Front()
				if seg == nil || seg.payloadSize() != 0 {
					break
				}
				e.rcvQueue.Remove(seg)
				// Memory is only considered released when the whole segment has been
				// read.
				memDelta += seg.segMemSize()
				seg.DecRef()
			}
			e.rcvQueueMu.Lock()
			e.RcvBufUsed -= n
			s = e.rcvQueue.Front()

			if memDelta > 0 {
				// If the window was small before this read and if the read freed up
				// enough buffer space, to either fit an aMSS or half a receive buffer
				// (whichever smaller), then notify the protocol goroutine to send a
				// window update.
				if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
					sendNonZeroWindowUpdate = true
				}
			}
			e.rcvQueueMu.Unlock()

			if e.EndpointState().connected() && sendNonZeroWindowUpdate {
				e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu
			}
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// checkReadLocked checks that endpoint is in a readable state.
//
// +checklocks:e.mu
func (e *endpoint) checkReadLocked() tcpip.Error {
	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()
	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	bufUsed := e.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			if err := e.hardErrorLocked(); err != nil {
				return err
			}
			return &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return &tcpip.ErrNotConnected{}
	}

	if e.RcvBufUsed == 0 {
		if e.RcvClosed || !e.EndpointState().connected() {
			return &tcpip.ErrClosedForReceive{}
		}
		return &tcpip.ErrWouldBlock{}
	}

	return nil
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	var payload buffer.Buffer
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return payload, nil
	}
	if _, err := payload.WriteFromReader(p, int64(avail)); err != nil {
		payload.Release()
		return buffer.Buffer{}, &tcpip.ErrBadBuffer{}
	}
	return payload, nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
// +checklocks:e.mu
func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	// NOTE: for non-atomic writes readFromPayloader drops and reacquires
	// both e.mu and sndQueueMu while copying, so state may change before
	// the re-check below.
	buf, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}

	// Do not queue zero length segments.
	if buf.Size() == 0 {
		return nil, 0, nil
	}

	if !opts.Atomic {
		// Since we released locks in between it's possible that the
		// endpoint transitioned to a CLOSED/ERROR states so make
		// sure endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			buf.Release()
			return nil, 0, err
		}

		// A simultaneous call to write on the socket can reduce avail. Discard
		// excess data copied if this is the case.
		if int64(avail) < buf.Size() {
			buf.Truncate(int64(avail))
		}
	}

	// Add data to the send queue.
	size := int(buf.Size())
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf)
	e.sndQueueInfo.SndBufUsed += size
	e.snd.writeList.PushBack(s)

	return s, size, nil
}

// Write writes data to the endpoint's peer.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.LockUser()
	defer e.UnlockUser()

	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out socket buffer space before we can fill the
	// previous window we advertised. In cases where we receive MSS sized or close
	// MSS sized segments we will probably run out of window space before we
	// exhaust receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
// +checklocks:e.mu
func (e *endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large. We also want to ensure a
// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
// receive buffer size. This is chosen arbitrarily.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// +checklocks:e.mu
// +checklocks:e.rcvQueueMu
func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked(rcvBufSize))
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size that
	// is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}

	switch {
	case oldAvail < threshold && newAvail >= threshold:
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
func (e *endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
func (e *endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
1742 func (e *endpoint) OnKeepAliveSet(bool) { 1743 e.LockUser() 1744 e.resetKeepaliveTimer(true /* receivedData */) 1745 e.UnlockUser() 1746 } 1747 1748 // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. 1749 func (e *endpoint) OnDelayOptionSet(v bool) { 1750 if !v { 1751 e.LockUser() 1752 defer e.UnlockUser() 1753 // Handle delayed data. 1754 if e.EndpointState().connected() { 1755 e.sendData(nil /* next */) 1756 } 1757 } 1758 } 1759 1760 // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. 1761 func (e *endpoint) OnCorkOptionSet(v bool) { 1762 if !v { 1763 e.LockUser() 1764 defer e.UnlockUser() 1765 // Handle the corked data. 1766 if e.EndpointState().connected() { 1767 e.sendData(nil /* next */) 1768 } 1769 } 1770 } 1771 1772 func (e *endpoint) getSendBufferSize() int { 1773 return int(e.ops.GetSendBufferSize()) 1774 } 1775 1776 // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize. 1777 func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) { 1778 e.LockUser() 1779 1780 sendNonZeroWindowUpdate := false 1781 e.rcvQueueMu.Lock() 1782 1783 // Make sure the receive buffer size allows us to send a 1784 // non-zero window size. 1785 scale := uint8(0) 1786 if e.rcv != nil { 1787 scale = e.rcv.RcvWndScale 1788 } 1789 if rcvBufSz>>scale == 0 { 1790 rcvBufSz = 1 << scale 1791 } 1792 1793 availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz))) 1794 availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz))) 1795 e.RcvAutoParams.Disabled = true 1796 1797 // Immediately send an ACK to uncork the sender silly window 1798 // syndrome prevetion, when our available space grows above aMSS 1799 // or half receive buffer, whichever smaller. 
1800 if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above { 1801 sendNonZeroWindowUpdate = true 1802 } 1803 1804 e.rcvQueueMu.Unlock() 1805 1806 postSet = func() { 1807 e.LockUser() 1808 defer e.UnlockUser() 1809 if e.EndpointState().connected() && sendNonZeroWindowUpdate { 1810 e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu 1811 } 1812 1813 } 1814 e.UnlockUser() 1815 return rcvBufSz, postSet 1816 } 1817 1818 // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. 1819 func (e *endpoint) OnSetSendBufferSize(sz int64) int64 { 1820 e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1) 1821 return sz 1822 } 1823 1824 // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. 1825 func (e *endpoint) WakeupWriters() { 1826 e.LockUser() 1827 defer e.UnlockUser() 1828 1829 sendBufferSize := e.getSendBufferSize() 1830 e.sndQueueInfo.sndQueueMu.Lock() 1831 notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1 1832 e.sndQueueInfo.sndQueueMu.Unlock() 1833 1834 if notify { 1835 e.waiterQueue.Notify(waiter.WritableEvents) 1836 } 1837 } 1838 1839 // SetSockOptInt sets a socket option. 1840 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { 1841 // Lower 2 bits represents ECN bits. RFC 3168, section 23.1 1842 const inetECNMask = 3 1843 1844 switch opt { 1845 case tcpip.KeepaliveCountOption: 1846 e.LockUser() 1847 e.keepalive.Lock() 1848 e.keepalive.count = v 1849 e.keepalive.Unlock() 1850 e.resetKeepaliveTimer(true /* receivedData */) 1851 e.UnlockUser() 1852 1853 case tcpip.IPv4TOSOption: 1854 e.LockUser() 1855 // TODO(gvisor.dev/issue/995): ECN is not currently supported, 1856 // ignore the bits for now. 
1857 e.sendTOS = uint8(v) & ^uint8(inetECNMask) 1858 e.UnlockUser() 1859 1860 case tcpip.IPv6TrafficClassOption: 1861 e.LockUser() 1862 // TODO(gvisor.dev/issue/995): ECN is not currently supported, 1863 // ignore the bits for now. 1864 e.sendTOS = uint8(v) & ^uint8(inetECNMask) 1865 e.UnlockUser() 1866 1867 case tcpip.MaxSegOption: 1868 userMSS := v 1869 if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { 1870 return &tcpip.ErrInvalidOptionValue{} 1871 } 1872 e.LockUser() 1873 e.userMSS = uint16(userMSS) 1874 e.UnlockUser() 1875 1876 case tcpip.MTUDiscoverOption: 1877 // Return not supported if attempting to set this option to 1878 // anything other than path MTU discovery disabled. 1879 if v != tcpip.PMTUDiscoveryDont { 1880 return &tcpip.ErrNotSupported{} 1881 } 1882 1883 case tcpip.IPv4TTLOption: 1884 e.LockUser() 1885 e.ipv4TTL = uint8(v) 1886 e.UnlockUser() 1887 1888 case tcpip.IPv6HopLimitOption: 1889 e.LockUser() 1890 e.ipv6HopLimit = int16(v) 1891 e.UnlockUser() 1892 1893 case tcpip.TCPSynCountOption: 1894 if v < 1 || v > 255 { 1895 return &tcpip.ErrInvalidOptionValue{} 1896 } 1897 e.LockUser() 1898 e.maxSynRetries = uint8(v) 1899 e.UnlockUser() 1900 1901 case tcpip.TCPWindowClampOption: 1902 if v == 0 { 1903 e.LockUser() 1904 switch e.EndpointState() { 1905 case StateClose, StateInitial: 1906 e.windowClamp = 0 1907 e.UnlockUser() 1908 return nil 1909 default: 1910 e.UnlockUser() 1911 return &tcpip.ErrInvalidOptionValue{} 1912 } 1913 } 1914 var rs tcpip.TCPReceiveBufferSizeRangeOption 1915 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 1916 if v < rs.Min/2 { 1917 v = rs.Min / 2 1918 } 1919 } 1920 e.LockUser() 1921 e.windowClamp = uint32(v) 1922 e.UnlockUser() 1923 } 1924 return nil 1925 } 1926 1927 func (e *endpoint) HasNIC(id int32) bool { 1928 return id == 0 || e.stack.HasNIC(tcpip.NICID(id)) 1929 } 1930 1931 // SetSockOpt sets a socket option. 
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		// Re-arm the keepalive timer so the new idle period takes effect
		// immediately.
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.KeepaliveIntervalOption:
		e.LockUser()
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.resetKeepaliveTimer(true /* receivedData */)
		e.UnlockUser()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				switch state {
				case StateEstablished:
					// Only re-initialize the sender's congestion control
					// machinery on an established connection.
					if e.EndpointState() == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to Stack's default TCP_LINGER2 timeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		// The deferred-accept period is capped at the maximum RTO.
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		// Unrecognized options are silently accepted.
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
func (e *endpoint) readyReceiveSize() (int, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The endpoint cannot be in listen state.
	if e.EndpointState() == StateListen {
		return 0, &tcpip.ErrInvalidEndpointState{}
	}

	e.rcvQueueMu.Lock()
	defer e.rcvQueueMu.Unlock()

	return e.RcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		v := e.keepalive.count
		e.keepalive.Unlock()
		return v, nil

	case tcpip.IPv4TOSOption:
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// The same sendTOS field backs both the IPv4 TOS and the IPv6
		// traffic class (see SetSockOptInt).
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.MaxSegOption:
		// Linux only returns user_mss value if user_mss is set and the socket is
		// unconnected. Otherwise Linux returns the actual current MSS. Netstack
		// mimics the user_mss behavior, but otherwise just returns the defaultMSS
		// for now.
		v := header.TCPDefaultMSS
		e.LockUser()
		if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) {
			v = int(e.userMSS)
		}
		e.UnlockUser()
		return v, nil

	case tcpip.MTUDiscoverOption:
		// Always return the path MTU discovery disabled setting since
		// it's the only one supported.
		return tcpip.PMTUDiscoveryDont, nil

	case tcpip.ReceiveQueueSizeOption:
		return e.readyReceiveSize()

	case tcpip.IPv4TTLOption:
		e.LockUser()
		v := int(e.ipv4TTL)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6HopLimitOption:
		e.LockUser()
		v := int(e.ipv6HopLimit)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPSynCountOption:
		e.LockUser()
		v := int(e.maxSynRetries)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPWindowClampOption:
		e.LockUser()
		v := int(e.windowClamp)
		e.UnlockUser()
		return v, nil

	case tcpip.MulticastTTLOption:
		return 1, nil

	default:
		return -1, &tcpip.ErrUnknownProtocolOption{}
	}
}

// getTCPInfo returns a snapshot of connection information (state, RTT
// estimates, RTO and congestion-control counters) for TCP_INFO, taken under
// the user lock.
func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption {
	info := tcpip.TCPInfoOption{}
	e.LockUser()
	if state := e.EndpointState(); state.internal() {
		// Netstack-internal states have no Linux equivalent; report CLOSE.
		info.State = tcpip.EndpointState(StateClose)
	} else {
		info.State = tcpip.EndpointState(state)
	}
	snd := e.snd
	if snd != nil {
		// We do not calculate RTT before sending the data packets. If
		// the connection did not send and receive data, then RTT will
		// be zero.
		snd.rtt.Lock()
		info.RTT = snd.rtt.TCPRTTState.SRTT
		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
		snd.rtt.Unlock()

		info.RTO = snd.RTO
		info.CcState = snd.state
		info.SndSsthresh = uint32(snd.Ssthresh)
		info.SndCwnd = uint32(snd.SndCwnd)
		info.ReorderSeen = snd.rc.Reord
	}
	e.UnlockUser()
	return info
}

// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
	switch o := opt.(type) {
	case *tcpip.TCPInfoOption:
		*o = e.getTCPInfo()

	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
		e.keepalive.Unlock()

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
		e.keepalive.Unlock()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		e.LockUser()
		*o = e.cc
		e.UnlockUser()

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()
		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
		e.UnlockUser()

	case *tcpip.OriginalDestinationOption:
		// Consult iptables for the pre-NAT destination of this flow.
		e.LockUser()
		ipt := e.stack.IPTables()
		addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber)
		e.UnlockUser()
		if err != nil {
			return err
		}
		*o = tcpip.OriginalDestinationOption{
			Addr: addr,
			Port: port,
		}

	default:
		return &tcpip.ErrUnknownProtocolOption{}
	}
	return nil
}

// checkV4MappedLocked determines the effective network protocol and converts
// addr to its canonical form.
// +checklocks:e.mu
func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
	if err != nil {
		return tcpip.FullAddress{}, 0, err
	}
	return unwrapped, netProto, nil
}

// Disconnect implements tcpip.Endpoint.Disconnect.
func (*endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	err := e.connect(addr, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// registerEndpoint registers the endpoint with the provided address.
//
// +checklocks:e.mu
func (e *endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error {
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		// Calculate a port offset based on the destination IP/port and
		// src IP to ensure that for a given tuple (srcIP, destIP,
		// destPort) the offset used as a starting point is the same to
		// ensure that we can cycle through the port space effectively.
		portBuf := make([]byte, 2)
		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)

		h := jenkins.Sum32(e.protocol.portOffsetSecret)
		for _, s := range [][]byte{
			e.ID.LocalAddress.AsSlice(),
			e.ID.RemoteAddress.AsSlice(),
			portBuf,
		} {
			// Per io.Writer.Write:
			//
			// Write must return a non-nil error if it returns n < len(p).
			if _, err := h.Write(s); err != nil {
				panic(err)
			}
		}
		portOffset := h.Sum32()

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// Decide whether TIME-WAIT endpoints may be reclaimed for this
		// connection (globally, or only for loopback<->loopback flows).
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID)
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.transitionToStateCloseLocked()
				tcpEP.drainClosingSegmentQueue()
				tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed; give the reservation back before
				// deciding whether to keep probing.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}
	return nil
}

// connect connects the endpoint to its peer. When handshake is false (the
// save/restore path) the endpoint is moved straight to Established without a
// SYN exchange. On success it returns *tcpip.ErrConnectStarted.
// +checklocks:e.mu
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error {
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrHostUnreachable{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	oldState := e.EndpointState()
	e.setEndpointState(StateConnecting)
	if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil {
		// Roll the state back so a later Connect can retry.
		e.setEndpointState(oldState)
		if _, ok := err.(*tcpip.ErrPortInUse); ok {
			return &tcpip.ErrBadLocalAddress{}
		}
		return err
	}

	e.isRegistered = true
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto}
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		// Re-stamp queued segments with the (possibly updated) ID.
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.ep.AssertLockHeld(e)
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
		return &tcpip.ErrConnectStarted{}
	}

	// Start a new handshake.
	h := e.newHandshake()
	e.setEndpointState(StateSynSent)
	h.start()
	e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()

	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState().connecting() {
		// When calling shutdown(2) on a connecting socket, the endpoint must
		// enter the error state. But this logic cannot belong to the shutdownLocked
		// method because that method is called during a close(2) (and closing a
		// connecting socket is not an error).
		e.handshakeFailed(&tcpip.ErrConnectionReset{})
		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		return nil
	}

	return e.shutdownLocked(flags)
}

// shutdownLocked closes the side(s) of the connection indicated by flags,
// accumulating them into e.shutdownFlags.
// +checklocks:e.mu
func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			rcvBufUsed := e.RcvBufUsed
			e.rcvQueueMu.Unlock()
			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				return nil
			}
			// Wake up any readers that maybe waiting for the stream to become
			// readable.
			events := waiter.ReadableEvents
			if e.shutdownFlags&tcpip.ShutdownWrite == 0 {
				// If ShutdownWrite is not set, write end won't close and
				// we end up with a half-closed connection
				events |= waiter.EventRdHUp
			}
			e.waiterQueue.Notify(events)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{})
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that maybe waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueMu.Lock()
			e.RcvClosed = true
			e.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *endpoint) Listen(backlog int) tcpip.Error {
	if err := e.listen(backlog); err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
		return err
	}
	return nil
}

// listen performs the actual work of Listen: binding if necessary,
// registering with the demuxer and initializing the accept queue and
// listening context. It also handles re-Listen on an already-listening
// endpoint (backlog adjustment).
func (e *endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
		}

		// Undo any prior shutdown so the listener accepts again.
		e.shutdownFlags = 0
		e.updateConnDirectionState(connDirectionStateOpen)
		e.rcvQueueMu.Lock()
		e.RcvClosed = false
		e.rcvQueueMu.Unlock()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Setting this state after RegisterTransportEndpoint will result in a
	// race where the endpoint is in Bound but reachable via the demuxer. Instead
	// we set it to listen so that incoming packets will just be queued to the
	// inbound segment queue by the TCP processor.
	e.setEndpointState(StateListen)
	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		e.transitionToStateCloseLocked()
		return err
	}

	e.isRegistered = true

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
	}
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	// Initialize the listening context.
	rcvWnd := seqnum.Size(e.receiveBufferAvailable())
	e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto)

	return nil
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// addr if not-nil will contain the peer address of the returned endpoint.
func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueMu.Lock()
	rcvClosed := e.RcvClosed
	e.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *endpoint
	e.acceptMu.Lock()
	if element := e.acceptQueue.endpoints.Front(); element != nil {
		n = e.acceptQueue.endpoints.Remove(element).(*endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

// bindLocked reserves a local port (and optionally address) for the endpoint
// and moves it to StateBound.
// +checklocks:e.mu
func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if addr.Addr.Len() != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.Rand(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.LocalAddress,
		Port: e.TransportEndpointInfo.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

// getRemoteAddress returns the peer address without any locking or state
// validation.
func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

// HandlePacket implements stack.TransportEndpoint as a no-op.
func (*endpoint) HandlePacket(stack.TransportEndpointID, stack.PacketBufferPtr) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher which then can either deliver using the
	// worker go routine or directly do the invoke the tcp processing inline
	// based on the state of the endpoint.
}

// enqueueSegment queues s for processing by the worker goroutine, returning
// false (and accounting the drop) if the segment queue is full.
func (e *endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

// onICMPError records err as the endpoint's last error, queues a SockError if
// the corresponding RecvError socket option is enabled, and aborts the
// endpoint if it is still connecting.
func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt stack.PacketBufferPtr) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	var recvErr bool
	switch pkt.NetworkProtocolNumber {
	case header.IPv4ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv4RecvError()
	case header.IPv6ProtocolNumber:
		recvErr = e.SocketOptions().GetIPv6RecvError()
	default:
		panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber))
	}

	if recvErr {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	if e.EndpointState().connecting() {
		e.mu.Lock()
		if lEP := e.h.listenEP; lEP != nil {
			// Remove from listening endpoints pending list.
			lEP.acceptMu.Lock()
			delete(lEP.acceptQueue.pendingEndpoints, e)
			lEP.acceptMu.Unlock()
			lEP.stats.FailedConnectionAttempts.Increment()
		}
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.cleanupLocked()
		e.hardError = err
		e.setEndpointState(StateError)
		e.mu.Unlock()
		e.drainClosingSegmentQueue()
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
	}
}

// HandleError implements stack.TransportEndpoint.
2923 func (e *endpoint) HandleError(transErr stack.TransportError, pkt stack.PacketBufferPtr) { 2924 handlePacketTooBig := func(mtu uint32) { 2925 e.sndQueueInfo.sndQueueMu.Lock() 2926 update := false 2927 if v := int(mtu); v < e.sndQueueInfo.SndMTU { 2928 e.sndQueueInfo.SndMTU = v 2929 update = true 2930 } 2931 newMTU := e.sndQueueInfo.SndMTU 2932 e.sndQueueInfo.sndQueueMu.Unlock() 2933 if update { 2934 e.mu.Lock() 2935 defer e.mu.Unlock() 2936 if e.snd != nil { 2937 e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu 2938 } 2939 } 2940 } 2941 2942 // TODO(gvisor.dev/issues/5270): Handle all transport errors. 2943 switch transErr.Kind() { 2944 case stack.PacketTooBigTransportError: 2945 handlePacketTooBig(transErr.Info()) 2946 case stack.DestinationHostUnreachableTransportError: 2947 e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt) 2948 case stack.DestinationNetworkUnreachableTransportError: 2949 e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) 2950 case stack.DestinationPortUnreachableTransportError: 2951 e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) 2952 case stack.DestinationProtoUnreachableTransportError: 2953 e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt) 2954 case stack.SourceRouteFailedTransportError: 2955 e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt) 2956 case stack.SourceHostIsolatedTransportError: 2957 e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt) 2958 case stack.DestinationHostDownTransportError: 2959 e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt) 2960 } 2961 } 2962 2963 // updateSndBufferUsage is called by the protocol goroutine when room opens up 2964 // in the send buffer. The number of newly available bytes is v. 
2965 func (e *endpoint) updateSndBufferUsage(v int) { 2966 sendBufferSize := e.getSendBufferSize() 2967 e.sndQueueInfo.sndQueueMu.Lock() 2968 notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1 2969 e.sndQueueInfo.SndBufUsed -= v 2970 2971 // Get the new send buffer size with auto tuning, but do not set it 2972 // unless we decide to notify the writers. 2973 newSndBufSz := e.computeTCPSendBufferSize() 2974 2975 // We only notify when there is half the sendBufferSize available after 2976 // a full buffer event occurs. This ensures that we don't wake up 2977 // writers to queue just 1-2 segments and go back to sleep. 2978 notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1 2979 e.sndQueueInfo.sndQueueMu.Unlock() 2980 2981 if notify { 2982 // Set the new send buffer size calculated from auto tuning. 2983 e.ops.SetSendBufferSize(newSndBufSz, false /* notify */) 2984 e.waiterQueue.Notify(waiter.WritableEvents) 2985 } 2986 } 2987 2988 // readyToRead is called by the protocol goroutine when a new segment is ready 2989 // to be read, or when the connection is closed for receiving (in which case 2990 // s will be nil). 2991 // 2992 // +checklocks:e.mu 2993 func (e *endpoint) readyToRead(s *segment) { 2994 e.rcvQueueMu.Lock() 2995 if s != nil { 2996 e.RcvBufUsed += s.payloadSize() 2997 s.IncRef() 2998 e.rcvQueue.PushBack(s) 2999 } else { 3000 e.RcvClosed = true 3001 } 3002 e.rcvQueueMu.Unlock() 3003 e.waiterQueue.Notify(waiter.ReadableEvents) 3004 } 3005 3006 // receiveBufferAvailableLocked calculates how many bytes are still available 3007 // in the receive buffer. 3008 // +checklocks:e.rcvQueueMu 3009 func (e *endpoint) receiveBufferAvailableLocked(rcvBufSize int) int { 3010 // We may use more bytes than the buffer size when the receive buffer 3011 // shrinks. 
3012 memUsed := e.receiveMemUsed() 3013 if memUsed >= rcvBufSize { 3014 return 0 3015 } 3016 3017 return rcvBufSize - memUsed 3018 } 3019 3020 // receiveBufferAvailable calculates how many bytes are still available in the 3021 // receive buffer based on the actual memory used by all segments held in 3022 // receive buffer/pending and segment queue. 3023 func (e *endpoint) receiveBufferAvailable() int { 3024 e.rcvQueueMu.Lock() 3025 available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize())) 3026 e.rcvQueueMu.Unlock() 3027 return available 3028 } 3029 3030 // receiveBufferUsed returns the amount of in-use receive buffer. 3031 func (e *endpoint) receiveBufferUsed() int { 3032 e.rcvQueueMu.Lock() 3033 used := e.RcvBufUsed 3034 e.rcvQueueMu.Unlock() 3035 return used 3036 } 3037 3038 // receiveMemUsed returns the total memory in use by segments held by this 3039 // endpoint. 3040 func (e *endpoint) receiveMemUsed() int { 3041 return int(e.rcvMemUsed.Load()) 3042 } 3043 3044 // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. 3045 func (e *endpoint) updateReceiveMemUsed(delta int) { 3046 e.rcvMemUsed.Add(int32(delta)) 3047 } 3048 3049 // maxReceiveBufferSize returns the stack wide maximum receive buffer size for 3050 // an endpoint. 3051 func (e *endpoint) maxReceiveBufferSize() int { 3052 var rs tcpip.TCPReceiveBufferSizeRangeOption 3053 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { 3054 // As a fallback return the hardcoded max buffer size. 
3055 return MaxBufferSize 3056 } 3057 return rs.Max 3058 } 3059 3060 // directionState returns the close state of send and receive part of the endpoint 3061 func (e *endpoint) connDirectionState() connDirectionState { 3062 return connDirectionState(e.connectionDirectionState.Load()) 3063 } 3064 3065 // updateDirectionState updates the close state of send and receive part of the endpoint 3066 func (e *endpoint) updateConnDirectionState(state connDirectionState) connDirectionState { 3067 return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state))) 3068 } 3069 3070 // rcvWndScaleForHandshake computes the receive window scale to offer to the 3071 // peer when window scaling is enabled (true by default). If auto-tuning is 3072 // disabled then the window scaling factor is based on the size of the 3073 // receiveBuffer otherwise we use the max permissible receive buffer size to 3074 // compute the scale. 3075 func (e *endpoint) rcvWndScaleForHandshake() int { 3076 bufSizeForScale := e.ops.GetReceiveBufferSize() 3077 3078 e.rcvQueueMu.Lock() 3079 autoTuningDisabled := e.RcvAutoParams.Disabled 3080 e.rcvQueueMu.Unlock() 3081 if autoTuningDisabled { 3082 return FindWndScale(seqnum.Size(bufSizeForScale)) 3083 } 3084 3085 return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) 3086 } 3087 3088 // updateRecentTimestamp updates the recent timestamp using the algorithm 3089 // described in https://tools.ietf.org/html/rfc7323#section-4.3 3090 func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { 3091 if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { 3092 e.setRecentTimestamp(tsVal) 3093 } 3094 } 3095 3096 // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if 3097 // the SYN options indicate that timestamp option was negotiated. 
It also 3098 // initializes the recentTS with the value provided in synOpts.TSval. 3099 func (e *endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) { 3100 if synOpts.TS { 3101 e.SendTSOk = true 3102 e.setRecentTimestamp(synOpts.TSVal) 3103 } 3104 } 3105 3106 func (e *endpoint) tsVal(now tcpip.MonotonicTime) uint32 { 3107 return e.TSOffset.TSVal(now) 3108 } 3109 3110 func (e *endpoint) tsValNow() uint32 { 3111 return e.tsVal(e.stack.Clock().NowMonotonic()) 3112 } 3113 3114 func (e *endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { 3115 return e.TSOffset.Elapsed(now, tsEcr) 3116 } 3117 3118 // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint 3119 // if the SYN options indicate that the SACK option was negotiated and the TCP 3120 // stack is configured to enable TCP SACK option. 3121 func (e *endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) { 3122 var v tcpip.TCPSACKEnabled 3123 if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { 3124 // Stack doesn't support SACK. So just return. 3125 return 3126 } 3127 if bool(v) && synOpts.SACKPermitted { 3128 e.SACKPermitted = true 3129 e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) 3130 } 3131 } 3132 3133 // maxOptionSize return the maximum size of TCP options. 3134 func (e *endpoint) maxOptionSize() (size int) { 3135 var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock 3136 options := e.makeOptions(maxSackBlocks[:]) 3137 size = len(options) 3138 putOptions(options) 3139 3140 return size 3141 } 3142 3143 // completeStateLocked makes a full copy of the endpoint and returns it. This is 3144 // used before invoking the probe. 
//
// +checklocks:e.mu
func (e *endpoint) completeStateLocked(s *stack.TCPEndpointState) {
	s.TCPEndpointStateInner = e.TCPEndpointStateInner
	s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID)
	s.SegTime = e.stack.Clock().NowMonotonic()
	s.Receiver = e.rcv.TCPReceiverState
	s.Sender = e.snd.TCPSenderState

	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	e.sndQueueInfo.CloneState(&s.SndBufState)
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueMu.Lock()
	s.RcvBufState = e.TCPRcvBufState
	e.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// RTT state is guarded by its own lock, taken after the buffer locks
	// above have been released.
	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	s.Sender.RetransmitTS = e.snd.retransmitTS
	s.Sender.SpuriousRecovery = e.snd.spuriousRecovery
}

// initHostGSO configures the endpoint for host (hardware) GSO based on the
// route's network protocol.
func (e *endpoint) initHostGSO() {
	switch e.route.NetProto() {
	case header.IPv4ProtocolNumber:
		e.gso.Type = stack.GSOTCPv4
		e.gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		e.gso.Type = stack.GSOTCPv6
		e.gso.L3HdrLen = header.IPv6MinimumSize
	default:
		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
	}
	e.gso.NeedsCsum = true
	e.gso.CsumOffset = header.TCPChecksumOffset
	e.gso.MaxSize = e.route.GSOMaxSize()
}

// initGSO selects host GSO when the route supports it, falling back to
// gVisor (software) GSO; otherwise leaves GSO disabled.
func (e *endpoint) initGSO() {
	if e.route.HasHostGSOCapability() {
		e.initHostGSO()
	} else if e.route.HasGvisorGSOCapability() {
		e.gso = stack.GSO{
			MaxSize:   e.route.GSOMaxSize(),
			Type:      stack.GSOGvisor,
			NeedsCsum: false,
		}
	}
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
	return uint32(e.EndpointState())
}

// Info returns a copy of the endpoint info.
func (e *endpoint) Info() tcpip.EndpointInfo {
	e.LockUser()
	// Make a copy of the endpoint info.
	ret := e.TransportEndpointInfo
	e.UnlockUser()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}

// Wait implements stack.TransportEndpoint.Wait. It blocks until the endpoint
// reports a hang-up event, returning immediately if the endpoint is already
// closed or errored.
func (e *endpoint) Wait() {
	waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp)
	e.waiterQueue.EventRegister(&waitEntry)
	defer e.waiterQueue.EventUnregister(&waitEntry)
	// Register before checking state so a concurrent transition to
	// Close/Error cannot be missed.
	switch e.EndpointState() {
	case StateClose, StateError:
		return
	}
	<-notifyCh
}

// SocketOptions implements tcpip.Endpoint.SocketOptions.
func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}

// GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption {
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		// The stack must always support the TCP buffer size option.
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.SendBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
func (e *endpoint) allowOutOfWindowAck() bool {
	now := e.stack.Clock().NowMonotonic()

	// Rate-limit out-of-window ACKs using the stack-wide invalid-rate
	// limit; the zero time means no out-of-window ACK has been sent yet.
	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
		var limit stack.TCPInvalidRateLimitOption
		if err := e.stack.Option(&limit); err != nil {
			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
		}
		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
			return false
		}
	}

	e.lastOutOfWindowAckTime = now
	return true
}

// GetTCPReceiveBufferLimits is used to get receive buffer size limits for
// TCP.
func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
	var ss tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		// The stack must always support the TCP buffer size option.
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.ReceiveBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// computeTCPSendBufferSize implements auto tuning of send buffer size and
// returns the new send buffer size.
func (e *endpoint) computeTCPSendBufferSize() int64 {
	curSndBufSz := int64(e.getSendBufferSize())

	// Auto tuning is disabled when the user explicitly sets the send
	// buffer size with SO_SNDBUF option.
	if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 {
		return curSndBufSz
	}

	const packetOverheadFactor = 2
	curMSS := e.snd.MaxPayloadSize
	// Use at least the initial congestion window worth of segments.
	numSeg := InitialCwnd
	if numSeg < e.snd.SndCwnd {
		numSeg = e.snd.SndCwnd
	}

	// SndCwnd indicates the number of segments that can be sent. This means
	// that the sender can send up to #SndCwnd segments and the send buffer
	// size should be set to SndCwnd*MSS to accommodate sending of all the
	// segments.
	newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor)
	// Never shrink the buffer below its current size, and clamp the growth
	// to the stack-wide maximum.
	if newSndBufSz < curSndBufSz {
		return curSndBufSz
	}
	if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz {
		newSndBufSz = int64(ss.Max)
	}

	return newSndBufSz
}