inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"encoding/binary"
    19  	"fmt"
    20  	"io"
    21  	"math"
    22  	"runtime"
    23  	"strings"
    24  	"sync/atomic"
    25  	"time"
    26  
    27  	"inet.af/netstack/sleep"
    28  	"inet.af/netstack/sync"
    29  	"inet.af/netstack/tcpip"
    30  	"inet.af/netstack/tcpip/hash/jenkins"
    31  	"inet.af/netstack/tcpip/header"
    32  	"inet.af/netstack/tcpip/ports"
    33  	"inet.af/netstack/tcpip/seqnum"
    34  	"inet.af/netstack/tcpip/stack"
    35  	"inet.af/netstack/waiter"
    36  )
    37  
// EndpointState represents the state of a TCP endpoint.
//
// It is a distinct named type defined on top of tcpip.EndpointState; the
// values it takes are the State* constants declared in this package.
type EndpointState tcpip.EndpointState
    40  
// Endpoint states. Note that they are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
const (
	// The zero value is skipped, so the first named state has the value 1.
	_ EndpointState = iota
	// TCP protocol states in sync with the definitions in
	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing

	// Endpoint states internal to netstack.
	StateInitial
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError
)
    66  
// Socket buffer sizing constants.
const (
	// rcvAdvWndScale is used to split the available socket buffer into
	// application buffer and the window to be advertised to the peer. This is
	// currently hard coded to split the available space equally.
	rcvAdvWndScale = 1

	// SegOverheadFactor is the factor by which the value provided by the
	// user on a SetSockOpt is multiplied when setting the socket
	// send/receive buffer sizes.
	SegOverheadFactor = 2
)
    77  
    78  // connected returns true when s is one of the states representing an
    79  // endpoint connected to a peer.
    80  func (s EndpointState) connected() bool {
    81  	switch s {
    82  	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
    83  		return true
    84  	default:
    85  		return false
    86  	}
    87  }
    88  
    89  // connecting returns true when s is one of the states representing a
    90  // connection in progress, but not yet fully established.
    91  func (s EndpointState) connecting() bool {
    92  	switch s {
    93  	case StateConnecting, StateSynSent, StateSynRecv:
    94  		return true
    95  	default:
    96  		return false
    97  	}
    98  }
    99  
   100  // internal returns true when the state is netstack internal.
   101  func (s EndpointState) internal() bool {
   102  	switch s {
   103  	case StateInitial, StateBound, StateConnecting, StateError:
   104  		return true
   105  	default:
   106  		return false
   107  	}
   108  }
   109  
   110  // handshake returns true when s is one of the states representing an endpoint
   111  // in the middle of a TCP handshake.
   112  func (s EndpointState) handshake() bool {
   113  	switch s {
   114  	case StateSynSent, StateSynRecv:
   115  		return true
   116  	default:
   117  		return false
   118  	}
   119  }
   120  
   121  // closed returns true when s is one of the states an endpoint transitions to
   122  // when closed or when it encounters an error. This is distinct from a newly
   123  // initialized endpoint that was never connected.
   124  func (s EndpointState) closed() bool {
   125  	switch s {
   126  	case StateClose, StateError:
   127  		return true
   128  	default:
   129  		return false
   130  	}
   131  }
   132  
   133  // String implements fmt.Stringer.String.
   134  func (s EndpointState) String() string {
   135  	switch s {
   136  	case StateInitial:
   137  		return "INITIAL"
   138  	case StateBound:
   139  		return "BOUND"
   140  	case StateConnecting:
   141  		return "CONNECTING"
   142  	case StateError:
   143  		return "ERROR"
   144  	case StateEstablished:
   145  		return "ESTABLISHED"
   146  	case StateSynSent:
   147  		return "SYN-SENT"
   148  	case StateSynRecv:
   149  		return "SYN-RCVD"
   150  	case StateFinWait1:
   151  		return "FIN-WAIT1"
   152  	case StateFinWait2:
   153  		return "FIN-WAIT2"
   154  	case StateTimeWait:
   155  		return "TIME-WAIT"
   156  	case StateClose:
   157  		return "CLOSED"
   158  	case StateCloseWait:
   159  		return "CLOSE-WAIT"
   160  	case StateLastAck:
   161  		return "LAST-ACK"
   162  	case StateListen:
   163  		return "LISTEN"
   164  	case StateClosing:
   165  		return "CLOSING"
   166  	default:
   167  		panic("unreachable")
   168  	}
   169  }
   170  
// Reasons for notifying the protocol goroutine.
//
// Each reason occupies its own bit (1 << iota), so several reasons can be
// combined in a single bitmask.
const (
	notifyNonZeroReceiveWindow = 1 << iota
	notifyClose
	notifyMTUChanged
	notifyDrain
	notifyReset
	notifyResetByPeer
	// notifyAbort is a request for an expedited teardown.
	notifyAbort
	notifyKeepaliveChanged
	notifyMSSChanged
	// notifyTickleWorker is used to tickle the protocol main loop during a
	// restore after we update the endpoint state to the correct one. This
	// ensures the loop terminates if the final state of the endpoint is,
	// say, TIME_WAIT.
	notifyTickleWorker
	notifyError
	// notifyShutdown means that a connecting socket was shutdown.
	notifyShutdown
)
   192  
// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks holds the SACK blocks tracked for the endpoint. Its length,
	// MaxSACKBlocks, is the maximum number of SACK blocks we track per
	// endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}
   205  
// ReceiveErrors collects segment receive errors within the transport layer.
//
// It embeds tcpip.ReceiveErrors and adds TCP-specific counters.
//
// +stateify savable
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}
   236  
// SendErrors collects segment send errors within the transport layer.
//
// It embeds tcpip.SendErrors and adds TCP-specific counters.
//
// +stateify savable
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments that failed
	// to be sent to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs that failed to be
	// sent to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}
   261  
// Stats holds statistics about the endpoint.
//
// *Stats carries the IsEndpointStats marker method of the
// tcpip.EndpointStats interface.
//
// +stateify savable
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}
   290  
   291  // IsEndpointStats is an empty method to implement the tcpip.EndpointStats
   292  // marker interface.
   293  func (*Stats) IsEndpointStats() {}
   294  
// sndQueueInfo implements a send queue. It bundles the send buffer state
// (stack.TCPSndBufState) with the mutex and waker associated with it.
//
// +stateify savable
type sndQueueInfo struct {
	// sndQueueMu is the mutex for the send queue; see the locking notes on
	// endpoint for how it orders relative to the endpoint's other mutexes.
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}
   306  
// rcvQueueInfo contains the endpoint's rcvQueue and associated metadata.
//
// +stateify savable
type rcvQueueInfo struct {
	// rcvQueueMu is the mutex for the receive queue; see the locking notes
	// on endpoint for how it orders relative to the endpoint's other mutexes.
	rcvQueueMu sync.Mutex `state:"nosave"`
	stack.TCPRcvBufState

	// rcvQueue is the queue for ready-for-delivery segments. This struct's
	// mutex must be held in order to append segments to the list.
	rcvQueue segmentList `state:"wait"`
}
   318  
// endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint must be held for all operations except
// in e.Readiness where acquiring it will result in a deadlock in epoll
// implementation.
//
// The following mutexes can be acquired independent of e.mu but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> Protects e.acceptQueue.
// e.rcvQueueMu -> Protects e.rcvQueue and associated fields.
// e.sndQueueMu -> Protects the e.sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a short while.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type endpoint struct {
	stack.TCPEndpointStateInner
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue endpoints for processing to a given
	// tcp processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	endpointEntry `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	protocol    *protocol     `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is StateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	// rcvReadMu synchronizes calls to Read.
	//
	// mu and rcvQueueMu are temporarily released during data copying. rcvReadMu
	// must be held during each read to ensure atomicity, so that multiple reads
	// do not interleave.
	//
	// rcvReadMu should be held before holding mu.
	rcvReadMu sync.Mutex `state:"nosave"`

	// rcvQueueInfo holds the implementation of the endpoint's receive buffer.
	// The data within rcvQueueInfo should only be accessed while rcvReadMu, mu,
	// and rcvQueueMu are held, in that stated order. While processing the segment
	// range, you can determine a range and then temporarily release mu and
	// rcvQueueMu, which allows new segments to be appended to the queue while
	// processing.
	rcvQueueInfo rcvQueueInfo

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	//
	// rcvMemUsed must be accessed atomically.
	rcvMemUsed int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	//
	// ownedByUser is set to 1 by LockUser while a syscall goroutine holds mu
	// and reset to 0 by UnlockUser; it is accessed atomically.
	mu          sync.CrossGoroutineMutex `state:"nosave"`
	ownedByUser uint32

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state uint32 `state:".(EndpointState)"`

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to its correct
	// state.
	origEndpointState uint32 `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ttl               uint8
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	h *handshake `state:"nosave"`

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (whichever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// workerRunning specifies if a worker goroutine is running.
	workerRunning bool

	// workerCleanup specifies if the worker goroutine must perform cleanup
	// before exiting. This can only be set to true when workerRunning is
	// also true, and they're both protected by the mutex.
	workerCleanup bool

	// recentTSTime is the monotonic time when we last updated
	// TCPEndpointStateInner.RecentTS.
	recentTSTime tcpip.MonotonicTime

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss recovery algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// sndQueueInfo contains the implementation of the endpoint's send queue.
	sndQueueInfo sndQueueInfo

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// newSegmentWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and handle new segments queued to it.
	newSegmentWaker sleep.Waker `state:"manual"`

	// notificationWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and check for notifications.
	notificationWaker sleep.Waker `state:"manual"`

	// notifyFlags is a bitmask of flags used to indicate to the protocol
	// goroutine what it was notified; this is only accessed atomically.
	notifyFlags uint32 `state:"nosave"`

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepalive.idle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibly aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// pendingAccepted tracks connections queued to be accepted. It is used to
	// ensure such queued connections are terminated before the accept queue is
	// marked closed (by setting its capacity to zero).
	pendingAccepted sync.WaitGroup `state:"nosave"`

	// acceptMu protects acceptQueue.
	acceptMu sync.Mutex `state:"nosave"`

	// acceptCond is a condition variable that can be used to block on when
	// acceptQueue is full and an endpoint is ready to be delivered.
	//
	// We use this condition variable to block/unblock goroutines which
	// tried to deliver an endpoint but couldn't because accept backlog was
	// full ( See: endpoint.deliverAccepted ).
	acceptCond *sync.Cond `state:"nosave"`

	// acceptQueue is used by a listening endpoint protocol goroutine to
	// send newly accepted connections to the endpoint so that they can be
	// read by Accept() calls.
	// +checklocks:acceptMu
	acceptQueue acceptQueue

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso stack.GSO

	stats Stats

	// tcpLingerTimeout is the maximum amount of time a socket
	// stays in TIME_WAIT state before being marked
	// closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has closed the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime tcpip.MonotonicTime
}
   630  
   631  // UniqueID implements stack.TransportEndpoint.UniqueID.
   632  func (e *endpoint) UniqueID() uint64 {
   633  	return e.uniqueID
   634  }
   635  
   636  // calculateAdvertisedMSS calculates the MSS to advertise.
   637  //
   638  // If userMSS is non-zero and is not greater than the maximum possible MSS for
   639  // r, it will be used; otherwise, the maximum possible MSS will be used.
   640  func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
   641  	// The maximum possible MSS is dependent on the route.
   642  	// TODO(b/143359391): Respect TCP Min and Max size.
   643  	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)
   644  
   645  	if userMSS != 0 && userMSS < maxMSS {
   646  		return userMSS
   647  	}
   648  
   649  	return maxMSS
   650  }
   651  
   652  // LockUser tries to lock e.mu and if it fails it will check if the lock is held
   653  // by another syscall goroutine. If yes, then it will goto sleep waiting for the
   654  // lock to be released, if not then it will spin till it acquires the lock or
   655  // another syscall goroutine acquires it in which case it will goto sleep as
   656  // described above.
   657  //
   658  // The assumption behind spinning here being that background packet processing
   659  // should not be holding the lock for long and spinning reduces latency as we
   660  // avoid an expensive sleep/wakeup of of the syscall goroutine).
   661  // +checklocksacquire:e.mu
   662  func (e *endpoint) LockUser() {
   663  	for {
   664  		// Try first if the sock is locked then check if it's owned
   665  		// by another user goroutine if not then we spin, otherwise
   666  		// we just go to sleep on the Lock() and wait.
   667  		if !e.mu.TryLock() {
   668  			// If socket is owned by the user then just go to sleep
   669  			// as the lock could be held for a reasonably long time.
   670  			if atomic.LoadUint32(&e.ownedByUser) == 1 {
   671  				e.mu.Lock()
   672  				atomic.StoreUint32(&e.ownedByUser, 1)
   673  				return
   674  			}
   675  			// Spin but yield the processor since the lower half
   676  			// should yield the lock soon.
   677  			runtime.Gosched()
   678  			continue
   679  		}
   680  		atomic.StoreUint32(&e.ownedByUser, 1)
   681  		return // +checklocksforce
   682  	}
   683  }
   684  
   685  // UnlockUser will check if there are any segments already queued for processing
   686  // and process any such segments before unlocking e.mu. This is required because
   687  // we when packets arrive and endpoint lock is already held then such packets
   688  // are queued up to be processed. If the lock is held by the endpoint goroutine
   689  // then it will process these packets but if the lock is instead held by the
   690  // syscall goroutine then we can have the syscall goroutine process the backlog
   691  // before unlocking.
   692  //
   693  // This avoids an unnecessary wakeup of the endpoint protocol goroutine for the
   694  // endpoint. It's also required eventually when we get rid of the endpoint
   695  // protocol goroutine altogether.
   696  //
   697  // Precondition: e.LockUser() must have been called before calling e.UnlockUser()
   698  // +checklocksrelease:e.mu
   699  func (e *endpoint) UnlockUser() {
   700  	// Lock segment queue before checking so that we avoid a race where
   701  	// segments can be queued between the time we check if queue is empty
   702  	// and actually unlock the endpoint mutex.
   703  	for {
   704  		e.segmentQueue.mu.Lock()
   705  		if e.segmentQueue.emptyLocked() {
   706  			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
   707  				panic("e.UnlockUser() called without calling e.LockUser()")
   708  			}
   709  			e.mu.Unlock()
   710  			e.segmentQueue.mu.Unlock()
   711  			return
   712  		}
   713  		e.segmentQueue.mu.Unlock()
   714  
   715  		switch e.EndpointState() {
   716  		case StateEstablished:
   717  			if err := e.handleSegmentsLocked(true /* fastPath */); err != nil {
   718  				e.notifyProtocolGoroutine(notifyTickleWorker)
   719  			}
   720  		default:
   721  			// Since we are waking the endpoint goroutine here just unlock
   722  			// and let it process the queued segments.
   723  			e.newSegmentWaker.Assert()
   724  			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
   725  				panic("e.UnlockUser() called without calling e.LockUser()")
   726  			}
   727  			e.mu.Unlock()
   728  			return
   729  		}
   730  	}
   731  }
   732  
   733  // StopWork halts packet processing. Only to be used in tests.
   734  // +checklocksacquire:e.mu
   735  func (e *endpoint) StopWork() {
   736  	e.mu.Lock()
   737  }
   738  
   739  // ResumeWork resumes packet processing. Only to be used in tests.
   740  // +checklocksrelease:e.mu
   741  func (e *endpoint) ResumeWork() {
   742  	e.mu.Unlock()
   743  }
   744  
   745  // setEndpointState updates the state of the endpoint to state atomically. This
   746  // method is unexported as the only place we should update the state is in this
   747  // package but we allow the state to be read freely without holding e.mu.
   748  //
   749  // Precondition: e.mu must be held to call this method.
   750  func (e *endpoint) setEndpointState(state EndpointState) {
   751  	oldstate := EndpointState(atomic.SwapUint32(&e.state, uint32(state)))
   752  	switch state {
   753  	case StateEstablished:
   754  		e.stack.Stats().TCP.CurrentEstablished.Increment()
   755  		e.stack.Stats().TCP.CurrentConnected.Increment()
   756  	case StateError:
   757  		fallthrough
   758  	case StateClose:
   759  		if oldstate == StateCloseWait || oldstate == StateEstablished {
   760  			e.stack.Stats().TCP.EstablishedResets.Increment()
   761  		}
   762  		fallthrough
   763  	default:
   764  		if oldstate == StateEstablished {
   765  			e.stack.Stats().TCP.CurrentEstablished.Decrement()
   766  		}
   767  	}
   768  }
   769  
   770  // EndpointState returns the current state of the endpoint.
   771  func (e *endpoint) EndpointState() EndpointState {
   772  	return EndpointState(atomic.LoadUint32(&e.state))
   773  }
   774  
   775  // setRecentTimestamp sets the recentTS field to the provided value.
   776  func (e *endpoint) setRecentTimestamp(recentTS uint32) {
   777  	e.RecentTS = recentTS
   778  	e.recentTSTime = e.stack.Clock().NowMonotonic()
   779  }
   780  
   781  // recentTimestamp returns the value of the recentTS field.
   782  func (e *endpoint) recentTimestamp() uint32 {
   783  	return e.RecentTS
   784  }
   785  
// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// It groups the keepalive settings and bookkeeping together with an embedded
// mutex. The mutex, timer and waker are tagged `state:"nosave"`.
//
// +stateify savable
type keepalive struct {
	// NOTE(review): presumably this mutex guards the fields below — confirm
	// against the users of this struct.
	sync.Mutex `state:"nosave"`
	// idle is initialized to DefaultKeepaliveIdle by newEndpoint.
	idle       time.Duration
	// interval is initialized to DefaultKeepaliveInterval by newEndpoint.
	interval   time.Duration
	// count is initialized to DefaultKeepaliveCount by newEndpoint.
	count      int
	// NOTE(review): unacked looks like a running count of keepalive probes
	// that have not been acknowledged — confirm where it is updated.
	unacked    int
	// timer is initialized by newEndpoint with the stack clock and waker.
	timer      timer       `state:"nosave"`
	// waker is the waker handed to timer.init by newEndpoint.
	waker      sleep.Waker `state:"nosave"`
}
   799  
   800  func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
   801  	e := &endpoint{
   802  		stack:    s,
   803  		protocol: protocol,
   804  		TransportEndpointInfo: stack.TransportEndpointInfo{
   805  			NetProto:   netProto,
   806  			TransProto: header.TCPProtocolNumber,
   807  		},
   808  		sndQueueInfo: sndQueueInfo{
   809  			TCPSndBufState: stack.TCPSndBufState{
   810  				SndMTU: math.MaxInt32,
   811  			},
   812  		},
   813  		waiterQueue: waiterQueue,
   814  		state:       uint32(StateInitial),
   815  		keepalive: keepalive{
   816  			idle:     DefaultKeepaliveIdle,
   817  			interval: DefaultKeepaliveInterval,
   818  			count:    DefaultKeepaliveCount,
   819  		},
   820  		uniqueID:      s.UniqueID(),
   821  		txHash:        s.Rand().Uint32(),
   822  		windowClamp:   DefaultReceiveBufferSize,
   823  		maxSynRetries: DefaultSynRetries,
   824  	}
   825  	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
   826  	e.ops.SetMulticastLoop(true)
   827  	e.ops.SetQuickAck(true)
   828  	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
   829  	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)
   830  
   831  	var ss tcpip.TCPSendBufferSizeRangeOption
   832  	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
   833  		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
   834  	}
   835  
   836  	var rs tcpip.TCPReceiveBufferSizeRangeOption
   837  	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
   838  		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
   839  	}
   840  
   841  	var cs tcpip.CongestionControlOption
   842  	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
   843  		e.cc = cs
   844  	}
   845  
   846  	var mrb tcpip.TCPModerateReceiveBufferOption
   847  	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
   848  		e.rcvQueueInfo.RcvAutoParams.Disabled = !bool(mrb)
   849  	}
   850  
   851  	var de tcpip.TCPDelayEnabled
   852  	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
   853  		e.ops.SetDelayOption(true)
   854  	}
   855  
   856  	var tcpLT tcpip.TCPLingerTimeoutOption
   857  	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
   858  		e.tcpLingerTimeout = time.Duration(tcpLT)
   859  	}
   860  
   861  	var synRetries tcpip.TCPSynRetriesOption
   862  	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
   863  		e.maxSynRetries = uint8(synRetries)
   864  	}
   865  
   866  	if p := s.GetTCPProbe(); p != nil {
   867  		e.probe = p
   868  	}
   869  
   870  	e.segmentQueue.ep = e
   871  
   872  	e.acceptCond = sync.NewCond(&e.acceptMu)
   873  	e.keepalive.timer.init(e.stack.Clock(), &e.keepalive.waker)
   874  
   875  	return e
   876  }
   877  
   878  // Readiness returns the current readiness of the endpoint. For example, if
   879  // waiter.EventIn is set, the endpoint is immediately readable.
   880  func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
   881  	result := waiter.EventMask(0)
   882  
   883  	switch e.EndpointState() {
   884  	case StateInitial, StateBound:
   885  		// This prevents blocking of new sockets which are not
   886  		// connected when SO_LINGER is set.
   887  		result |= waiter.EventHUp
   888  
   889  	case StateConnecting, StateSynSent, StateSynRecv:
   890  		// Ready for nothing.
   891  
   892  	case StateClose, StateError, StateTimeWait:
   893  		// Ready for anything.
   894  		result = mask
   895  
   896  	case StateListen:
   897  		// Check if there's anything in the accepted queue.
   898  		if (mask & waiter.ReadableEvents) != 0 {
   899  			e.acceptMu.Lock()
   900  			if e.acceptQueue.endpoints.Len() != 0 {
   901  				result |= waiter.ReadableEvents
   902  			}
   903  			e.acceptMu.Unlock()
   904  		}
   905  	}
   906  	if e.EndpointState().connected() {
   907  		// Determine if the endpoint is writable if requested.
   908  		if (mask & waiter.WritableEvents) != 0 {
   909  			e.sndQueueInfo.sndQueueMu.Lock()
   910  			sndBufSize := e.getSendBufferSize()
   911  			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
   912  				result |= waiter.WritableEvents
   913  			}
   914  			e.sndQueueInfo.sndQueueMu.Unlock()
   915  		}
   916  
   917  		// Determine if the endpoint is readable if requested.
   918  		if (mask & waiter.ReadableEvents) != 0 {
   919  			e.rcvQueueInfo.rcvQueueMu.Lock()
   920  			if e.rcvQueueInfo.RcvBufUsed > 0 || e.rcvQueueInfo.RcvClosed {
   921  				result |= waiter.ReadableEvents
   922  			}
   923  			e.rcvQueueInfo.rcvQueueMu.Unlock()
   924  		}
   925  	}
   926  
   927  	return result
   928  }
   929  
   930  func (e *endpoint) fetchNotifications() uint32 {
   931  	return atomic.SwapUint32(&e.notifyFlags, 0)
   932  }
   933  
   934  func (e *endpoint) notifyProtocolGoroutine(n uint32) {
   935  	for {
   936  		v := atomic.LoadUint32(&e.notifyFlags)
   937  		if v&n == n {
   938  			// The flags are already set.
   939  			return
   940  		}
   941  
   942  		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
   943  			if v == 0 {
   944  				// We are causing a transition from no flags to
   945  				// at least one flag set, so we must cause the
   946  				// protocol goroutine to wake up.
   947  				e.notificationWaker.Assert()
   948  			}
   949  			return
   950  		}
   951  	}
   952  }
   953  
   954  // Abort implements stack.TransportEndpoint.Abort.
   955  func (e *endpoint) Abort() {
   956  	// The abort notification is not processed synchronously, so no
   957  	// synchronization is needed.
   958  	//
   959  	// If the endpoint becomes connected after this check, we still close
   960  	// the endpoint. This worst case results in a slower abort.
   961  	//
   962  	// If the endpoint disconnected after the check, nothing needs to be
   963  	// done, so sending a notification which will potentially be ignored is
   964  	// fine.
   965  	//
   966  	// If the endpoint connecting finishes after the check, the endpoint
   967  	// is either in a connected state (where we would notifyAbort anyway),
   968  	// SYN-RECV (where we would also notifyAbort anyway), or in an error
   969  	// state where nothing is required and the notification can be safely
   970  	// ignored.
   971  	//
   972  	// Endpoints where a Close during connecting or SYN-RECV state would be
   973  	// problematic are set to state connecting before being registered (and
   974  	// thus possible to be Aborted). They are never available in initial
   975  	// state.
   976  	//
   977  	// Endpoints transitioning from initial to connecting state may be
   978  	// safely either closed or sent notifyAbort.
   979  	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
   980  		e.notifyProtocolGoroutine(notifyAbort)
   981  		return
   982  	}
   983  	e.Close()
   984  }
   985  
   986  // Close puts the endpoint in a closed state and frees all resources associated
   987  // with it. It must be called only once and with no other concurrent calls to
   988  // the endpoint.
   989  func (e *endpoint) Close() {
   990  	e.LockUser()
   991  	defer e.UnlockUser()
   992  	if e.closed {
   993  		return
   994  	}
   995  
   996  	linger := e.SocketOptions().GetLinger()
   997  	if linger.Enabled && linger.Timeout == 0 {
   998  		s := e.EndpointState()
   999  		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
  1000  		if isResetState {
  1001  			// Close the endpoint without doing full shutdown and
  1002  			// send a RST.
  1003  			e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
  1004  			e.closeNoShutdownLocked()
  1005  
  1006  			// Wake up worker to close the endpoint.
  1007  			switch s {
  1008  			case StateSynRecv:
  1009  				e.notifyProtocolGoroutine(notifyClose)
  1010  			default:
  1011  				e.notifyProtocolGoroutine(notifyTickleWorker)
  1012  			}
  1013  			return
  1014  		}
  1015  	}
  1016  
  1017  	// Issue a shutdown so that the peer knows we won't send any more data
  1018  	// if we're connected, or stop accepting if we're listening.
  1019  	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
  1020  	e.closeNoShutdownLocked()
  1021  }
  1022  
  1023  // closeNoShutdown closes the endpoint without doing a full shutdown.
  1024  func (e *endpoint) closeNoShutdownLocked() {
  1025  	// For listening sockets, we always release ports inline so that they
  1026  	// are immediately available for reuse after Close() is called. If also
  1027  	// registered, we unregister as well otherwise the next user would fail
  1028  	// in Listen() when trying to register.
  1029  	if e.EndpointState() == StateListen && e.isPortReserved {
  1030  		if e.isRegistered {
  1031  			e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
  1032  			e.isRegistered = false
  1033  		}
  1034  
  1035  		portRes := ports.Reservation{
  1036  			Networks:     e.effectiveNetProtos,
  1037  			Transport:    ProtocolNumber,
  1038  			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  1039  			Port:         e.TransportEndpointInfo.ID.LocalPort,
  1040  			Flags:        e.boundPortFlags,
  1041  			BindToDevice: e.boundBindToDevice,
  1042  			Dest:         e.boundDest,
  1043  		}
  1044  		e.stack.ReleasePort(portRes)
  1045  		e.isPortReserved = false
  1046  		e.boundBindToDevice = 0
  1047  		e.boundPortFlags = ports.Flags{}
  1048  		e.boundDest = tcpip.FullAddress{}
  1049  	}
  1050  
  1051  	// Mark endpoint as closed.
  1052  	e.closed = true
  1053  
  1054  	switch e.EndpointState() {
  1055  	case StateClose, StateError:
  1056  		return
  1057  	}
  1058  
  1059  	eventMask := waiter.ReadableEvents | waiter.WritableEvents
  1060  	// Either perform the local cleanup or kick the worker to make sure it
  1061  	// knows it needs to cleanup.
  1062  	if e.workerRunning {
  1063  		e.workerCleanup = true
  1064  		tcpip.AddDanglingEndpoint(e)
  1065  		// Worker will remove the dangling endpoint when the endpoint
  1066  		// goroutine terminates.
  1067  		e.notifyProtocolGoroutine(notifyClose)
  1068  	} else {
  1069  		e.transitionToStateCloseLocked()
  1070  		// Notify that the endpoint is closed.
  1071  		eventMask |= waiter.EventHUp
  1072  	}
  1073  
  1074  	// The TCP closing state-machine would eventually notify EventHUp, but we
  1075  	// notify EventIn|EventOut immediately to unblock any blocked waiters.
  1076  	e.waiterQueue.Notify(eventMask)
  1077  }
  1078  
  1079  // closePendingAcceptableConnections closes all connections that have completed
  1080  // handshake but not yet been delivered to the application.
  1081  func (e *endpoint) closePendingAcceptableConnectionsLocked() {
  1082  	e.acceptMu.Lock()
  1083  	// Close any endpoints in SYN-RCVD state.
  1084  	for n := range e.acceptQueue.pendingEndpoints {
  1085  		n.notifyProtocolGoroutine(notifyClose)
  1086  	}
  1087  	e.acceptQueue.pendingEndpoints = nil
  1088  	// Reset all connections that are waiting to be accepted.
  1089  	for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() {
  1090  		n.Value.(*endpoint).notifyProtocolGoroutine(notifyReset)
  1091  	}
  1092  	e.acceptQueue.endpoints.Init()
  1093  	e.acceptMu.Unlock()
  1094  
  1095  	e.acceptCond.Broadcast()
  1096  
  1097  	// Wait for reset of all endpoints that are still waiting to be delivered to
  1098  	// the now closed accepted.
  1099  	e.pendingAccepted.Wait()
  1100  }
  1101  
  1102  // cleanupLocked frees all resources associated with the endpoint. It is called
  1103  // after Close() is called and the worker goroutine (if any) is done with its
  1104  // work.
  1105  func (e *endpoint) cleanupLocked() {
  1106  	// Close all endpoints that might have been accepted by TCP but not by
  1107  	// the client.
  1108  	e.closePendingAcceptableConnectionsLocked()
  1109  	e.keepalive.timer.cleanup()
  1110  
  1111  	e.workerCleanup = false
  1112  
  1113  	if e.isRegistered {
  1114  		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
  1115  		e.isRegistered = false
  1116  	}
  1117  
  1118  	if e.isPortReserved {
  1119  		portRes := ports.Reservation{
  1120  			Networks:     e.effectiveNetProtos,
  1121  			Transport:    ProtocolNumber,
  1122  			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  1123  			Port:         e.TransportEndpointInfo.ID.LocalPort,
  1124  			Flags:        e.boundPortFlags,
  1125  			BindToDevice: e.boundBindToDevice,
  1126  			Dest:         e.boundDest,
  1127  		}
  1128  		e.stack.ReleasePort(portRes)
  1129  		e.isPortReserved = false
  1130  	}
  1131  	e.boundBindToDevice = 0
  1132  	e.boundPortFlags = ports.Flags{}
  1133  	e.boundDest = tcpip.FullAddress{}
  1134  
  1135  	if e.route != nil {
  1136  		e.route.Release()
  1137  		e.route = nil
  1138  	}
  1139  
  1140  	e.stack.CompleteTransportEndpointCleanup(e)
  1141  	tcpip.DeleteDanglingEndpoint(e)
  1142  }
  1143  
  1144  // wndFromSpace returns the window that we can advertise based on the available
  1145  // receive buffer space.
  1146  func wndFromSpace(space int) int {
  1147  	return space >> rcvAdvWndScale
  1148  }
  1149  
  1150  // initialReceiveWindow returns the initial receive window to advertise in the
  1151  // SYN/SYN-ACK.
  1152  func (e *endpoint) initialReceiveWindow() int {
  1153  	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
  1154  	if rcvWnd > math.MaxUint16 {
  1155  		rcvWnd = math.MaxUint16
  1156  	}
  1157  
  1158  	// Use the user supplied MSS, if available.
  1159  	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
  1160  	if rcvWnd > routeWnd {
  1161  		rcvWnd = routeWnd
  1162  	}
  1163  	rcvWndScale := e.rcvWndScaleForHandshake()
  1164  
  1165  	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
  1166  	// window offered in SYN won't be reduced due to the loss of precision if
  1167  	// window scaling is enabled after the handshake.
  1168  	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
  1169  
  1170  	// Ensure we can always accept at least 1 byte if the scale specified
  1171  	// was too high for the provided rcvWnd.
  1172  	if rcvWnd == 0 {
  1173  		rcvWnd = 1
  1174  	}
  1175  
  1176  	return rcvWnd
  1177  }
  1178  
  1179  // ModerateRecvBuf adjusts the receive buffer and the advertised window
  1180  // based on the number of bytes copied to userspace.
  1181  func (e *endpoint) ModerateRecvBuf(copied int) {
  1182  	e.LockUser()
  1183  	defer e.UnlockUser()
  1184  
  1185  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1186  	if e.rcvQueueInfo.RcvAutoParams.Disabled {
  1187  		e.rcvQueueInfo.rcvQueueMu.Unlock()
  1188  		return
  1189  	}
  1190  	now := e.stack.Clock().NowMonotonic()
  1191  	if rtt := e.rcvQueueInfo.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.rcvQueueInfo.RcvAutoParams.MeasureTime) < rtt {
  1192  		e.rcvQueueInfo.RcvAutoParams.CopiedBytes += copied
  1193  		e.rcvQueueInfo.rcvQueueMu.Unlock()
  1194  		return
  1195  	}
  1196  	prevRTTCopied := e.rcvQueueInfo.RcvAutoParams.CopiedBytes + copied
  1197  	prevCopied := e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes
  1198  	rcvWnd := 0
  1199  	if prevRTTCopied > prevCopied {
  1200  		// The minimal receive window based on what was copied by the app
  1201  		// in the immediate preceding RTT and some extra buffer for 16
  1202  		// segments to account for variations.
  1203  		// We multiply by 2 to account for packet losses.
  1204  		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)
  1205  
  1206  		// Scale for slow start based on bytes copied in this RTT vs previous.
  1207  		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied
  1208  
  1209  		// Multiply growth factor by 2 again to account for sender being
  1210  		// in slow-start where the sender grows it's congestion window
  1211  		// by 100% per RTT.
  1212  		rcvWnd += grow * 2
  1213  
  1214  		// Make sure auto tuned buffer size can always receive upto 2x
  1215  		// the initial window of 10 segments.
  1216  		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
  1217  			rcvWnd = minRcvWnd
  1218  		}
  1219  
  1220  		// Cap the auto tuned buffer size by the maximum permissible
  1221  		// receive buffer size.
  1222  		if max := e.maxReceiveBufferSize(); rcvWnd > max {
  1223  			rcvWnd = max
  1224  		}
  1225  
  1226  		// We do not adjust downwards as that can cause the receiver to
  1227  		// reject valid data that might already be in flight as the
  1228  		// acceptable window will shrink.
  1229  		rcvBufSize := int(e.ops.GetReceiveBufferSize())
  1230  		if rcvWnd > rcvBufSize {
  1231  			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
  1232  			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
  1233  			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
  1234  			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
  1235  				e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
  1236  			}
  1237  		}
  1238  
  1239  		// We only update PrevCopiedBytes when we grow the buffer because in cases
  1240  		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
  1241  		// enough to handle the current rate and we don't need to do any
  1242  		// adjustments.
  1243  		e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
  1244  	}
  1245  	e.rcvQueueInfo.RcvAutoParams.MeasureTime = now
  1246  	e.rcvQueueInfo.RcvAutoParams.CopiedBytes = 0
  1247  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  1248  }
  1249  
// SetOwner implements tcpip.Endpoint.SetOwner.
//
// It stores owner in e.owner. No lock is taken here.
func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}
  1254  
  1255  // Preconditions: e.mu must be held to call this function.
  1256  func (e *endpoint) hardErrorLocked() tcpip.Error {
  1257  	err := e.hardError
  1258  	e.hardError = nil
  1259  	return err
  1260  }
  1261  
  1262  // Preconditions: e.mu must be held to call this function.
  1263  func (e *endpoint) lastErrorLocked() tcpip.Error {
  1264  	e.lastErrorMu.Lock()
  1265  	defer e.lastErrorMu.Unlock()
  1266  	err := e.lastError
  1267  	e.lastError = nil
  1268  	return err
  1269  }
  1270  
  1271  // LastError implements tcpip.Endpoint.LastError.
  1272  func (e *endpoint) LastError() tcpip.Error {
  1273  	e.LockUser()
  1274  	defer e.UnlockUser()
  1275  	if err := e.hardErrorLocked(); err != nil {
  1276  		return err
  1277  	}
  1278  	return e.lastErrorLocked()
  1279  }
  1280  
  1281  // LastErrorLocked reads and clears lastError with e.mu held.
  1282  // Only to be used in tests.
  1283  func (e *endpoint) LastErrorLocked() tcpip.Error {
  1284  	return e.lastErrorLocked()
  1285  }
  1286  
  1287  // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
  1288  func (e *endpoint) UpdateLastError(err tcpip.Error) {
  1289  	e.LockUser()
  1290  	e.lastErrorMu.Lock()
  1291  	e.lastError = err
  1292  	e.lastErrorMu.Unlock()
  1293  	e.UnlockUser()
  1294  }
  1295  
  1296  // Read implements tcpip.Endpoint.Read.
  1297  func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
  1298  	e.rcvReadMu.Lock()
  1299  	defer e.rcvReadMu.Unlock()
  1300  
  1301  	// N.B. Here we get a range of segments to be processed. It is safe to not
  1302  	// hold rcvQueueMu when processing, since we hold rcvReadMu to ensure only we
  1303  	// can remove segments from the list through commitRead().
  1304  	first, last, serr := e.startRead()
  1305  	if serr != nil {
  1306  		if _, ok := serr.(*tcpip.ErrClosedForReceive); ok {
  1307  			e.stats.ReadErrors.ReadClosed.Increment()
  1308  		}
  1309  		return tcpip.ReadResult{}, serr
  1310  	}
  1311  
  1312  	var err error
  1313  	done := 0
  1314  	s := first
  1315  	for s != nil {
  1316  		var n int
  1317  		n, err = s.data.ReadTo(dst, opts.Peek)
  1318  		// Book keeping first then error handling.
  1319  
  1320  		done += n
  1321  
  1322  		if opts.Peek {
  1323  			// For peek, we use the (first, last) range of segment returned from
  1324  			// startRead. We don't consume the receive buffer, so commitRead should
  1325  			// not be called.
  1326  			//
  1327  			// N.B. It is important to use `last` to determine the last segment, since
  1328  			// appending can happen while we process, and will lead to data race.
  1329  			if s == last {
  1330  				break
  1331  			}
  1332  			s = s.Next()
  1333  		} else {
  1334  			// N.B. commitRead() conveniently returns the next segment to read, after
  1335  			// removing the data/segment that is read.
  1336  			s = e.commitRead(n)
  1337  		}
  1338  
  1339  		if err != nil {
  1340  			break
  1341  		}
  1342  	}
  1343  
  1344  	// If something is read, we must report it. Report error when nothing is read.
  1345  	if done == 0 && err != nil {
  1346  		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
  1347  	}
  1348  	return tcpip.ReadResult{
  1349  		Count: done,
  1350  		Total: done,
  1351  	}, nil
  1352  }
  1353  
  1354  // startRead checks that endpoint is in a readable state, and return the
  1355  // inclusive range of segments that can be read.
  1356  //
  1357  // Precondition: e.rcvReadMu must be held.
  1358  func (e *endpoint) startRead() (first, last *segment, err tcpip.Error) {
  1359  	e.LockUser()
  1360  	defer e.UnlockUser()
  1361  
  1362  	// When in SYN-SENT state, let the caller block on the receive.
  1363  	// An application can initiate a non-blocking connect and then block
  1364  	// on a receive. It can expect to read any data after the handshake
  1365  	// is complete. RFC793, section 3.9, p58.
  1366  	if e.EndpointState() == StateSynSent {
  1367  		return nil, nil, &tcpip.ErrWouldBlock{}
  1368  	}
  1369  
  1370  	// The endpoint can be read if it's connected, or if it's already closed
  1371  	// but has some pending unread data. Also note that a RST being received
  1372  	// would cause the state to become StateError so we should allow the
  1373  	// reads to proceed before returning a ECONNRESET.
  1374  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1375  	defer e.rcvQueueInfo.rcvQueueMu.Unlock()
  1376  
  1377  	bufUsed := e.rcvQueueInfo.RcvBufUsed
  1378  	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
  1379  		if s == StateError {
  1380  			if err := e.hardErrorLocked(); err != nil {
  1381  				return nil, nil, err
  1382  			}
  1383  			return nil, nil, &tcpip.ErrClosedForReceive{}
  1384  		}
  1385  		e.stats.ReadErrors.NotConnected.Increment()
  1386  		return nil, nil, &tcpip.ErrNotConnected{}
  1387  	}
  1388  
  1389  	if e.rcvQueueInfo.RcvBufUsed == 0 {
  1390  		if e.rcvQueueInfo.RcvClosed || !e.EndpointState().connected() {
  1391  			return nil, nil, &tcpip.ErrClosedForReceive{}
  1392  		}
  1393  		return nil, nil, &tcpip.ErrWouldBlock{}
  1394  	}
  1395  
  1396  	return e.rcvQueueInfo.rcvQueue.Front(), e.rcvQueueInfo.rcvQueue.Back(), nil
  1397  }
  1398  
  1399  // commitRead commits a read of done bytes and returns the next non-empty
  1400  // segment to read. Data read from the segment must have also been removed from
  1401  // the segment in order for this method to work correctly.
  1402  //
  1403  // It is performance critical to call commitRead frequently when servicing a big
  1404  // Read request, so TCP can make progress timely. Right now, it is designed to
  1405  // do this per segment read, hence this method conveniently returns the next
  1406  // segment to read while holding the lock.
  1407  //
  1408  // Precondition: e.rcvReadMu must be held.
  1409  func (e *endpoint) commitRead(done int) *segment {
  1410  	e.LockUser()
  1411  	defer e.UnlockUser()
  1412  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1413  	defer e.rcvQueueInfo.rcvQueueMu.Unlock()
  1414  
  1415  	memDelta := 0
  1416  	s := e.rcvQueueInfo.rcvQueue.Front()
  1417  	for s != nil && s.data.Size() == 0 {
  1418  		e.rcvQueueInfo.rcvQueue.Remove(s)
  1419  		// Memory is only considered released when the whole segment has been
  1420  		// read.
  1421  		memDelta += s.segMemSize()
  1422  		s.decRef()
  1423  		s = e.rcvQueueInfo.rcvQueue.Front()
  1424  	}
  1425  	e.rcvQueueInfo.RcvBufUsed -= done
  1426  
  1427  	if memDelta > 0 {
  1428  		// If the window was small before this read and if the read freed up
  1429  		// enough buffer space, to either fit an aMSS or half a receive buffer
  1430  		// (whichever smaller), then notify the protocol goroutine to send a
  1431  		// window update.
  1432  		if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
  1433  			e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
  1434  		}
  1435  	}
  1436  
  1437  	return e.rcvQueueInfo.rcvQueue.Front()
  1438  }
  1439  
  1440  // isEndpointWritableLocked checks if a given endpoint is writable
  1441  // and also returns the number of bytes that can be written at this
  1442  // moment. If the endpoint is not writable then it returns an error
  1443  // indicating the reason why it's not writable.
  1444  // Caller must hold e.mu and e.sndQueueMu
  1445  func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
  1446  	// The endpoint cannot be written to if it's not connected.
  1447  	switch s := e.EndpointState(); {
  1448  	case s == StateError:
  1449  		if err := e.hardErrorLocked(); err != nil {
  1450  			return 0, err
  1451  		}
  1452  		return 0, &tcpip.ErrClosedForSend{}
  1453  	case !s.connecting() && !s.connected():
  1454  		return 0, &tcpip.ErrClosedForSend{}
  1455  	case s.connecting():
  1456  		// As per RFC793, page 56, a send request arriving when in connecting
  1457  		// state, can be queued to be completed after the state becomes
  1458  		// connected. Return an error code for the caller of endpoint Write to
  1459  		// try again, until the connection handshake is complete.
  1460  		return 0, &tcpip.ErrWouldBlock{}
  1461  	}
  1462  
  1463  	// Check if the connection has already been closed for sends.
  1464  	if e.sndQueueInfo.SndClosed {
  1465  		return 0, &tcpip.ErrClosedForSend{}
  1466  	}
  1467  
  1468  	sndBufSize := e.getSendBufferSize()
  1469  	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
  1470  	if avail <= 0 {
  1471  		return 0, &tcpip.ErrWouldBlock{}
  1472  	}
  1473  	return avail, nil
  1474  }
  1475  
  1476  // readFromPayloader reads a slice from the Payloader.
  1477  // +checklocks:e.mu
  1478  // +checklocks:e.sndQueueInfo.sndQueueMu
  1479  func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) {
  1480  	// We can release locks while copying data.
  1481  	//
  1482  	// This is not possible if atomic is set, because we can't allow the
  1483  	// available buffer space to be consumed by some other caller while we
  1484  	// are copying data in.
  1485  	if !opts.Atomic {
  1486  		e.sndQueueInfo.sndQueueMu.Unlock()
  1487  		defer e.sndQueueInfo.sndQueueMu.Lock()
  1488  
  1489  		e.UnlockUser()
  1490  		defer e.LockUser()
  1491  	}
  1492  
  1493  	// Fetch data.
  1494  	if l := p.Len(); l < avail {
  1495  		avail = l
  1496  	}
  1497  	if avail == 0 {
  1498  		return nil, nil
  1499  	}
  1500  	v := make([]byte, avail)
  1501  	n, err := p.Read(v)
  1502  	if err != nil && err != io.EOF {
  1503  		return nil, &tcpip.ErrBadBuffer{}
  1504  	}
  1505  	return v[:n], nil
  1506  }
  1507  
  1508  // queueSegment reads data from the payloader and returns a segment to be sent.
  1509  // +checklocks:e.mu
  1510  func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
  1511  	e.sndQueueInfo.sndQueueMu.Lock()
  1512  	defer e.sndQueueInfo.sndQueueMu.Unlock()
  1513  
  1514  	avail, err := e.isEndpointWritableLocked()
  1515  	if err != nil {
  1516  		e.stats.WriteErrors.WriteClosed.Increment()
  1517  		return nil, 0, err
  1518  	}
  1519  
  1520  	v, err := e.readFromPayloader(p, opts, avail)
  1521  	if err != nil {
  1522  		return nil, 0, err
  1523  	}
  1524  
  1525  	// Do not queue zero length segments.
  1526  	if len(v) == 0 {
  1527  		return nil, 0, nil
  1528  	}
  1529  
  1530  	if !opts.Atomic {
  1531  		// Since we released locks in between it's possible that the
  1532  		// endpoint transitioned to a CLOSED/ERROR states so make
  1533  		// sure endpoint is still writable before trying to write.
  1534  		avail, err := e.isEndpointWritableLocked()
  1535  		if err != nil {
  1536  			e.stats.WriteErrors.WriteClosed.Increment()
  1537  			return nil, 0, err
  1538  		}
  1539  
  1540  		// Discard any excess data copied in due to avail being reduced due
  1541  		// to a simultaneous write call to the socket.
  1542  		if avail < len(v) {
  1543  			v = v[:avail]
  1544  		}
  1545  	}
  1546  
  1547  	// Add data to the send queue.
  1548  	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v)
  1549  	e.sndQueueInfo.SndBufUsed += len(v)
  1550  	e.snd.writeList.PushBack(s)
  1551  
  1552  	return s, len(v), nil
  1553  }
  1554  
  1555  // Write writes data to the endpoint's peer.
  1556  func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
  1557  	// Linux completely ignores any address passed to sendto(2) for TCP sockets
  1558  	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
  1559  	// and opts.EndOfRecord are also ignored.
  1560  
  1561  	e.LockUser()
  1562  	defer e.UnlockUser()
  1563  
  1564  	// Return if either we didn't queue anything or if an error occurred while
  1565  	// attempting to queue data.
  1566  	nextSeg, n, err := e.queueSegment(p, opts)
  1567  	if n == 0 || err != nil {
  1568  		return 0, err
  1569  	}
  1570  
  1571  	e.sendData(nextSeg)
  1572  	return int64(n), nil
  1573  }
  1574  
  1575  // selectWindowLocked returns the new window without checking for shrinking or scaling
  1576  // applied.
  1577  // Precondition: e.mu and e.rcvQueueMu must be held.
  1578  func (e *endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
  1579  	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
  1580  	maxWindow := wndFromSpace(rcvBufSize)
  1581  	wndFromUsedBytes := maxWindow - e.rcvQueueInfo.RcvBufUsed
  1582  
  1583  	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
  1584  	// cases where we receive a lot of small segments the segment overhead is a
  1585  	// lot higher and we can run out socket buffer space before we can fill the
  1586  	// previous window we advertised. In cases where we receive MSS sized or close
  1587  	// MSS sized segments we will probably run out of window space before we
  1588  	// exhaust receive buffer.
  1589  	newWnd := wndFromAvailable
  1590  	if newWnd > wndFromUsedBytes {
  1591  		newWnd = wndFromUsedBytes
  1592  	}
  1593  	if newWnd < 0 {
  1594  		newWnd = 0
  1595  	}
  1596  	return seqnum.Size(newWnd)
  1597  }
  1598  
  1599  // selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
  1600  func (e *endpoint) selectWindow() (wnd seqnum.Size) {
  1601  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1602  	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
  1603  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  1604  	return wnd
  1605  }
  1606  
  1607  // windowCrossedACKThresholdLocked checks if the receive window to be announced
  1608  // would be under aMSS or under the window derived from half receive buffer,
  1609  // whichever smaller. This is useful as a receive side silly window syndrome
  1610  // prevention mechanism. If window grows to reasonable value, we should send ACK
  1611  // to the sender to inform the rx space is now large. We also want ensure a
  1612  // series of small read()'s won't trigger a flood of spurious tiny ACK's.
  1613  //
  1614  // For large receive buffers, the threshold is aMSS - once reader reads more
  1615  // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
  1616  // receive buffer size. This is chosen arbitrarily.
  1617  // crossed will be true if the window size crossed the ACK threshold.
  1618  // above will be true if the new window is >= ACK threshold and false
  1619  // otherwise.
  1620  //
  1621  // Precondition: e.mu and e.rcvQueueMu must be held.
  1622  func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
  1623  	newAvail := int(e.selectWindowLocked(rcvBufSize))
  1624  	oldAvail := newAvail - deltaBefore
  1625  	if oldAvail < 0 {
  1626  		oldAvail = 0
  1627  	}
  1628  	threshold := int(e.amss)
  1629  	// rcvBufFraction is the inverse of the fraction of receive buffer size that
  1630  	// is used to decide if the available buffer space is now above it.
  1631  	const rcvBufFraction = 2
  1632  	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
  1633  		threshold = wndThreshold
  1634  	}
  1635  	switch {
  1636  	case oldAvail < threshold && newAvail >= threshold:
  1637  		return true, true
  1638  	case oldAvail >= threshold && newAvail < threshold:
  1639  		return true, false
  1640  	}
  1641  	return false, false
  1642  }
  1643  
  1644  // OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
  1645  func (e *endpoint) OnReuseAddressSet(v bool) {
  1646  	e.LockUser()
  1647  	e.portFlags.TupleOnly = v
  1648  	e.UnlockUser()
  1649  }
  1650  
  1651  // OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
  1652  func (e *endpoint) OnReusePortSet(v bool) {
  1653  	e.LockUser()
  1654  	e.portFlags.LoadBalanced = v
  1655  	e.UnlockUser()
  1656  }
  1657  
  1658  // OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
  1659  func (e *endpoint) OnKeepAliveSet(bool) {
  1660  	e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1661  }
  1662  
  1663  // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
  1664  func (e *endpoint) OnDelayOptionSet(v bool) {
  1665  	if !v {
  1666  		// Handle delayed data.
  1667  		e.sndQueueInfo.sndWaker.Assert()
  1668  	}
  1669  }
  1670  
  1671  // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
  1672  func (e *endpoint) OnCorkOptionSet(v bool) {
  1673  	if !v {
  1674  		// Handle the corked data.
  1675  		e.sndQueueInfo.sndWaker.Assert()
  1676  	}
  1677  }
  1678  
  1679  func (e *endpoint) getSendBufferSize() int {
  1680  	return int(e.ops.GetSendBufferSize())
  1681  }
  1682  
  1683  // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
  1684  func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64) {
  1685  	e.LockUser()
  1686  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1687  
  1688  	// Make sure the receive buffer size allows us to send a
  1689  	// non-zero window size.
  1690  	scale := uint8(0)
  1691  	if e.rcv != nil {
  1692  		scale = e.rcv.RcvWndScale
  1693  	}
  1694  	if rcvBufSz>>scale == 0 {
  1695  		rcvBufSz = 1 << scale
  1696  	}
  1697  
  1698  	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
  1699  	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
  1700  	e.rcvQueueInfo.RcvAutoParams.Disabled = true
  1701  
  1702  	// Immediately send an ACK to uncork the sender silly window
  1703  	// syndrome prevetion, when our available space grows above aMSS
  1704  	// or half receive buffer, whichever smaller.
  1705  	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
  1706  		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
  1707  	}
  1708  
  1709  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  1710  	e.UnlockUser()
  1711  	return rcvBufSz
  1712  }
  1713  
  1714  // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize.
  1715  func (e *endpoint) OnSetSendBufferSize(sz int64) int64 {
  1716  	atomic.StoreUint32(&e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled, 1)
  1717  	return sz
  1718  }
  1719  
  1720  // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters.
  1721  func (e *endpoint) WakeupWriters() {
  1722  	e.LockUser()
  1723  	defer e.UnlockUser()
  1724  
  1725  	sendBufferSize := e.getSendBufferSize()
  1726  	e.sndQueueInfo.sndQueueMu.Lock()
  1727  	notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1
  1728  	e.sndQueueInfo.sndQueueMu.Unlock()
  1729  
  1730  	if notify {
  1731  		e.waiterQueue.Notify(waiter.WritableEvents)
  1732  	}
  1733  }
  1734  
  1735  // SetSockOptInt sets a socket option.
  1736  func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
  1737  	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
  1738  	const inetECNMask = 3
  1739  
  1740  	switch opt {
  1741  	case tcpip.KeepaliveCountOption:
  1742  		e.keepalive.Lock()
  1743  		e.keepalive.count = v
  1744  		e.keepalive.Unlock()
  1745  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1746  
  1747  	case tcpip.IPv4TOSOption:
  1748  		e.LockUser()
  1749  		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
  1750  		// ignore the bits for now.
  1751  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1752  		e.UnlockUser()
  1753  
  1754  	case tcpip.IPv6TrafficClassOption:
  1755  		e.LockUser()
  1756  		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
  1757  		// ignore the bits for now.
  1758  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1759  		e.UnlockUser()
  1760  
  1761  	case tcpip.MaxSegOption:
  1762  		userMSS := v
  1763  		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
  1764  			return &tcpip.ErrInvalidOptionValue{}
  1765  		}
  1766  		e.LockUser()
  1767  		e.userMSS = uint16(userMSS)
  1768  		e.UnlockUser()
  1769  		e.notifyProtocolGoroutine(notifyMSSChanged)
  1770  
  1771  	case tcpip.MTUDiscoverOption:
  1772  		// Return not supported if attempting to set this option to
  1773  		// anything other than path MTU discovery disabled.
  1774  		if v != tcpip.PMTUDiscoveryDont {
  1775  			return &tcpip.ErrNotSupported{}
  1776  		}
  1777  
  1778  	case tcpip.TTLOption:
  1779  		e.LockUser()
  1780  		e.ttl = uint8(v)
  1781  		e.UnlockUser()
  1782  
  1783  	case tcpip.TCPSynCountOption:
  1784  		if v < 1 || v > 255 {
  1785  			return &tcpip.ErrInvalidOptionValue{}
  1786  		}
  1787  		e.LockUser()
  1788  		e.maxSynRetries = uint8(v)
  1789  		e.UnlockUser()
  1790  
  1791  	case tcpip.TCPWindowClampOption:
  1792  		if v == 0 {
  1793  			e.LockUser()
  1794  			switch e.EndpointState() {
  1795  			case StateClose, StateInitial:
  1796  				e.windowClamp = 0
  1797  				e.UnlockUser()
  1798  				return nil
  1799  			default:
  1800  				e.UnlockUser()
  1801  				return &tcpip.ErrInvalidOptionValue{}
  1802  			}
  1803  		}
  1804  		var rs tcpip.TCPReceiveBufferSizeRangeOption
  1805  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
  1806  			if v < rs.Min/2 {
  1807  				v = rs.Min / 2
  1808  			}
  1809  		}
  1810  		e.LockUser()
  1811  		e.windowClamp = uint32(v)
  1812  		e.UnlockUser()
  1813  	}
  1814  	return nil
  1815  }
  1816  
  1817  func (e *endpoint) HasNIC(id int32) bool {
  1818  	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
  1819  }
  1820  
  1821  // SetSockOpt sets a socket option.
  1822  func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
  1823  	switch v := opt.(type) {
  1824  	case *tcpip.KeepaliveIdleOption:
  1825  		e.keepalive.Lock()
  1826  		e.keepalive.idle = time.Duration(*v)
  1827  		e.keepalive.Unlock()
  1828  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1829  
  1830  	case *tcpip.KeepaliveIntervalOption:
  1831  		e.keepalive.Lock()
  1832  		e.keepalive.interval = time.Duration(*v)
  1833  		e.keepalive.Unlock()
  1834  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1835  
  1836  	case *tcpip.TCPUserTimeoutOption:
  1837  		e.LockUser()
  1838  		e.userTimeout = time.Duration(*v)
  1839  		e.UnlockUser()
  1840  
  1841  	case *tcpip.CongestionControlOption:
  1842  		// Query the available cc algorithms in the stack and
  1843  		// validate that the specified algorithm is actually
  1844  		// supported in the stack.
  1845  		var avail tcpip.TCPAvailableCongestionControlOption
  1846  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
  1847  			return err
  1848  		}
  1849  		availCC := strings.Split(string(avail), " ")
  1850  		for _, cc := range availCC {
  1851  			if *v == tcpip.CongestionControlOption(cc) {
  1852  				e.LockUser()
  1853  				state := e.EndpointState()
  1854  				e.cc = *v
  1855  				switch state {
  1856  				case StateEstablished:
  1857  					if e.EndpointState() == state {
  1858  						e.snd.cc = e.snd.initCongestionControl(e.cc)
  1859  					}
  1860  				}
  1861  				e.UnlockUser()
  1862  				return nil
  1863  			}
  1864  		}
  1865  
  1866  		// Linux returns ENOENT when an invalid congestion
  1867  		// control algorithm is specified.
  1868  		return &tcpip.ErrNoSuchFile{}
  1869  
  1870  	case *tcpip.TCPLingerTimeoutOption:
  1871  		e.LockUser()
  1872  
  1873  		switch {
  1874  		case *v < 0:
  1875  			// Same as effectively disabling TCPLinger timeout.
  1876  			*v = -1
  1877  		case *v == 0:
  1878  			// Same as the stack default.
  1879  			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
  1880  			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
  1881  				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
  1882  			}
  1883  			*v = stackLingerTimeout
  1884  		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
  1885  			// Cap it to Stack's default TCP_LINGER2 timeout.
  1886  			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
  1887  		default:
  1888  		}
  1889  
  1890  		e.tcpLingerTimeout = time.Duration(*v)
  1891  		e.UnlockUser()
  1892  
  1893  	case *tcpip.TCPDeferAcceptOption:
  1894  		e.LockUser()
  1895  		if time.Duration(*v) > MaxRTO {
  1896  			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
  1897  		}
  1898  		e.deferAccept = time.Duration(*v)
  1899  		e.UnlockUser()
  1900  
  1901  	case *tcpip.SocketDetachFilterOption:
  1902  		return nil
  1903  
  1904  	default:
  1905  		return nil
  1906  	}
  1907  	return nil
  1908  }
  1909  
  1910  // readyReceiveSize returns the number of bytes ready to be received.
  1911  func (e *endpoint) readyReceiveSize() (int, tcpip.Error) {
  1912  	e.LockUser()
  1913  	defer e.UnlockUser()
  1914  
  1915  	// The endpoint cannot be in listen state.
  1916  	if e.EndpointState() == StateListen {
  1917  		return 0, &tcpip.ErrInvalidEndpointState{}
  1918  	}
  1919  
  1920  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1921  	defer e.rcvQueueInfo.rcvQueueMu.Unlock()
  1922  
  1923  	return e.rcvQueueInfo.RcvBufUsed, nil
  1924  }
  1925  
  1926  // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
  1927  func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
  1928  	switch opt {
  1929  	case tcpip.KeepaliveCountOption:
  1930  		e.keepalive.Lock()
  1931  		v := e.keepalive.count
  1932  		e.keepalive.Unlock()
  1933  		return v, nil
  1934  
  1935  	case tcpip.IPv4TOSOption:
  1936  		e.LockUser()
  1937  		v := int(e.sendTOS)
  1938  		e.UnlockUser()
  1939  		return v, nil
  1940  
  1941  	case tcpip.IPv6TrafficClassOption:
  1942  		e.LockUser()
  1943  		v := int(e.sendTOS)
  1944  		e.UnlockUser()
  1945  		return v, nil
  1946  
  1947  	case tcpip.MaxSegOption:
  1948  		// This is just stubbed out. Linux never returns the user_mss
  1949  		// value as it either returns the defaultMSS or returns the
  1950  		// actual current MSS. Netstack just returns the defaultMSS
  1951  		// always for now.
  1952  		v := header.TCPDefaultMSS
  1953  		return v, nil
  1954  
  1955  	case tcpip.MTUDiscoverOption:
  1956  		// Always return the path MTU discovery disabled setting since
  1957  		// it's the only one supported.
  1958  		return tcpip.PMTUDiscoveryDont, nil
  1959  
  1960  	case tcpip.ReceiveQueueSizeOption:
  1961  		return e.readyReceiveSize()
  1962  
  1963  	case tcpip.TTLOption:
  1964  		e.LockUser()
  1965  		v := int(e.ttl)
  1966  		e.UnlockUser()
  1967  		return v, nil
  1968  
  1969  	case tcpip.TCPSynCountOption:
  1970  		e.LockUser()
  1971  		v := int(e.maxSynRetries)
  1972  		e.UnlockUser()
  1973  		return v, nil
  1974  
  1975  	case tcpip.TCPWindowClampOption:
  1976  		e.LockUser()
  1977  		v := int(e.windowClamp)
  1978  		e.UnlockUser()
  1979  		return v, nil
  1980  
  1981  	case tcpip.MulticastTTLOption:
  1982  		return 1, nil
  1983  
  1984  	default:
  1985  		return -1, &tcpip.ErrUnknownProtocolOption{}
  1986  	}
  1987  }
  1988  
  1989  func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption {
  1990  	info := tcpip.TCPInfoOption{}
  1991  	e.LockUser()
  1992  	if state := e.EndpointState(); state.internal() {
  1993  		info.State = tcpip.EndpointState(StateClose)
  1994  	} else {
  1995  		info.State = tcpip.EndpointState(state)
  1996  	}
  1997  	snd := e.snd
  1998  	if snd != nil {
  1999  		// We do not calculate RTT before sending the data packets. If
  2000  		// the connection did not send and receive data, then RTT will
  2001  		// be zero.
  2002  		snd.rtt.Lock()
  2003  		info.RTT = snd.rtt.TCPRTTState.SRTT
  2004  		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
  2005  		snd.rtt.Unlock()
  2006  
  2007  		info.RTO = snd.RTO
  2008  		info.CcState = snd.state
  2009  		info.SndSsthresh = uint32(snd.Ssthresh)
  2010  		info.SndCwnd = uint32(snd.SndCwnd)
  2011  		info.ReorderSeen = snd.rc.Reord
  2012  	}
  2013  	e.UnlockUser()
  2014  	return info
  2015  }
  2016  
  2017  // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
  2018  func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
  2019  	switch o := opt.(type) {
  2020  	case *tcpip.TCPInfoOption:
  2021  		*o = e.getTCPInfo()
  2022  
  2023  	case *tcpip.KeepaliveIdleOption:
  2024  		e.keepalive.Lock()
  2025  		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
  2026  		e.keepalive.Unlock()
  2027  
  2028  	case *tcpip.KeepaliveIntervalOption:
  2029  		e.keepalive.Lock()
  2030  		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
  2031  		e.keepalive.Unlock()
  2032  
  2033  	case *tcpip.TCPUserTimeoutOption:
  2034  		e.LockUser()
  2035  		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
  2036  		e.UnlockUser()
  2037  
  2038  	case *tcpip.CongestionControlOption:
  2039  		e.LockUser()
  2040  		*o = e.cc
  2041  		e.UnlockUser()
  2042  
  2043  	case *tcpip.TCPLingerTimeoutOption:
  2044  		e.LockUser()
  2045  		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
  2046  		e.UnlockUser()
  2047  
  2048  	case *tcpip.TCPDeferAcceptOption:
  2049  		e.LockUser()
  2050  		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
  2051  		e.UnlockUser()
  2052  
  2053  	case *tcpip.OriginalDestinationOption:
  2054  		e.LockUser()
  2055  		ipt := e.stack.IPTables()
  2056  		addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber)
  2057  		e.UnlockUser()
  2058  		if err != nil {
  2059  			return err
  2060  		}
  2061  		*o = tcpip.OriginalDestinationOption{
  2062  			Addr: addr,
  2063  			Port: port,
  2064  		}
  2065  
  2066  	default:
  2067  		return &tcpip.ErrUnknownProtocolOption{}
  2068  	}
  2069  	return nil
  2070  }
  2071  
  2072  // checkV4MappedLocked determines the effective network protocol and converts
  2073  // addr to its canonical form.
  2074  func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
  2075  	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
  2076  	if err != nil {
  2077  		return tcpip.FullAddress{}, 0, err
  2078  	}
  2079  	return unwrapped, netProto, nil
  2080  }
  2081  
  2082  // Disconnect implements tcpip.Endpoint.Disconnect.
  2083  func (*endpoint) Disconnect() tcpip.Error {
  2084  	return &tcpip.ErrNotSupported{}
  2085  }
  2086  
  2087  // Connect connects the endpoint to its peer.
  2088  func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
  2089  	err := e.connect(addr, true, true)
  2090  	if err != nil {
  2091  		if !err.IgnoreStats() {
  2092  			// Connect failed. Let's wake up any waiters.
  2093  			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  2094  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  2095  			e.stats.FailedConnectionAttempts.Increment()
  2096  		}
  2097  	}
  2098  	return err
  2099  }
  2100  
// connect connects the endpoint to its peer. In the normal non-S/R case, the
// new connection is expected to run the main goroutine and perform handshake.
// In restore of previously connected endpoints, both ends will be passively
// created (so no new handshaking is done); for stack-accepted connections not
// yet accepted by the app, they are restored without running the main goroutine
// here.
//
// addr is the peer address. When handshake is false the endpoint is moved
// directly to StateEstablished and its queued segments are re-stamped with the
// endpoint ID. When run is true the protocol main loop goroutine is started
// (after starting a handshake if handshake is also true).
//
// On the path where nothing fails, the returned error is
// *tcpip.ErrConnectStarted; every other return is a failure or an
// already-connected/connecting indication.
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	// Remember the address as supplied by the caller; addr itself is
	// replaced by its canonical form just below.
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrNoRoute{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		// Report the recorded hard error if there is one, otherwise a
		// generic aborted-connection error.
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	// Fill in the endpoint ID from the route and the requested remote port.
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		// Calculate a port offset based on the destination IP/port and
		// src IP to ensure that for a given tuple (srcIP, destIP,
		// destPort) the offset used as a starting point is the same to
		// ensure that we can cycle through the port space effectively.
		portBuf := make([]byte, 2)
		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)

		// The hash is seeded with the protocol's port offset secret.
		h := jenkins.Sum32(e.protocol.portOffsetSecret)
		for _, s := range [][]byte{
			[]byte(e.ID.LocalAddress),
			[]byte(e.ID.RemoteAddress),
			portBuf,
		} {
			// Per io.Writer.Write:
			//
			// Write must return a non-nil error if it returns n < len(p).
			if _, err := h.Write(s); err != nil {
				panic(err)
			}
		}
		portOffset := h.Sum32()

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// reuse says whether a port held by a TIME-WAIT endpoint may be
		// taken over: always under the global setting, and only when both
		// ends are loopback addresses under the loopback-only setting.
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		// The callback is invoked with candidate ports. It returns
		// (true, nil) once a port has been reserved and the endpoint
		// registered on it, (false, nil) to skip the candidate, and
		// (false, err) on a registration error other than port-in-use.
		// NOTE(review): how PickEphemeralPortStable reacts to each result
		// is not visible here — confirm against its documentation.
		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) {
			// Never pick a port that would make the connection's local and
			// remote address/port identical.
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
				// Only a port-in-use failure with TIME-WAIT reuse enabled
				// is worth a closer look; anything else skips the port.
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, r.NICID())
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				// NOTE(review): unchecked type assertion — this panics if
				// the registered endpoint is not a *endpoint.
				tcpEP := transEP.(*endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.workerCleanup = false
				tcpEP.cleanupLocked()
				tcpEP.notifyProtocolGoroutine(notifyAbort)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			// The port is reserved; now register the endpoint under the
			// ID that uses it.
			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed: give the reservation back before
				// reporting the outcome.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}

	// The endpoint is now registered with the stack; record the connection
	// parameters and move to the connecting state.
	e.isRegistered = true
	e.setEndpointState(StateConnecting)
	// The endpoint keeps the route past this call, so acquire it here; the
	// deferred r.Release() above still runs when connect returns.
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = netProtos
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		// Stamp every segment in both lists with the endpoint ID; the send
		// waker is asserted once per segment visited.
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
	}

	if run {
		if handshake {
			// Create the handshake, move to SYN-SENT and start it before
			// the worker goroutine is launched.
			h := e.newHandshake()
			e.setEndpointState(StateSynSent)
			h.start()
		}
		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
		e.workerRunning = true
		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
	}

	return &tcpip.ErrConnectStarted{}
}
  2365  
  2366  // ConnectEndpoint is not supported.
  2367  func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
  2368  	return &tcpip.ErrInvalidEndpointState{}
  2369  }
  2370  
  2371  // Shutdown closes the read and/or write end of the endpoint connection to its
  2372  // peer.
  2373  func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
  2374  	e.LockUser()
  2375  	defer e.UnlockUser()
  2376  
  2377  	if e.EndpointState().connecting() {
  2378  		// When calling shutdown(2) on a connecting socket, the endpoint must
  2379  		// enter the error state. But this logic cannot belong to the shutdownLocked
  2380  		// method because that method is called during a close(2) (and closing a
  2381  		// connecting socket is not an error).
  2382  		e.resetConnectionLocked(&tcpip.ErrConnectionReset{})
  2383  		e.notifyProtocolGoroutine(notifyShutdown)
  2384  		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
  2385  		return nil
  2386  	}
  2387  
  2388  	return e.shutdownLocked(flags)
  2389  }
  2390  
// shutdownLocked adds flags to e.shutdownFlags and then applies the
// accumulated flags according to the current endpoint state:
//
//   - Connected: the read side is marked closed and readers are woken, and/or
//     a FIN segment is queued, the send side is marked closed and writers are
//     woken. If both directions are shut down while unread data remains, the
//     connection is reset with ErrConnectionAborted instead.
//   - Listening: on ShutdownRead the receive side is marked closed, pending
//     acceptable connections are closed and waiters are notified.
//   - Any other state: ErrNotConnected is returned.
//
// Shutdown invokes this method with the endpoint locked via LockUser.
func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	// Flags accumulate across calls; the checks below use the union of all
	// flags seen so far, not just the ones passed to this call.
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = true
			rcvBufUsed := e.rcvQueueInfo.RcvBufUsed
			e.rcvQueueInfo.rcvQueueMu.Unlock()

			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				// Wake up worker to terminate loop.
				e.notifyProtocolGoroutine(notifyTickleWorker)
				return nil
			}
			// Wake up any readers that may be waiting for the stream to
			// become readable.
			e.waiterQueue.Notify(waiter.ReadableEvents)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed. A repeated write shutdown is only an
				// error once the endpoint has reached TIME-WAIT.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), nil)
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that may be waiting for the stream to
			// become writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = true
			e.rcvQueueInfo.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}
  2467  
  2468  // Listen puts the endpoint in "listen" mode, which allows it to accept
  2469  // new connections.
  2470  func (e *endpoint) Listen(backlog int) tcpip.Error {
  2471  	err := e.listen(backlog)
  2472  	if err != nil {
  2473  		if !err.IgnoreStats() {
  2474  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  2475  			e.stats.FailedConnectionAttempts.Increment()
  2476  		}
  2477  	}
  2478  	return err
  2479  }
  2480  
// listen implements Listen without the failure accounting.
//
// If the endpoint is already listening and not closed, only the backlog is
// adjusted (failing with ErrInvalidEndpointState when more connections are
// already queued than the new backlog allows), the shutdown state is cleared
// and goroutines blocked on acceptCond are woken.
//
// Otherwise an endpoint in the initial state is first bound via bindLocked
// with an empty address. The endpoint must then be in StateBound; it is
// registered with the stack, moved to StateListen, and a protocolListenLoop
// goroutine is started for it.
func (e *endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
		}

		// Undo any earlier shutdown so that the endpoint accepts again.
		e.shutdownFlags = 0
		e.rcvQueueInfo.rcvQueueMu.Lock()
		e.rcvQueueInfo.RcvClosed = false
		e.rcvQueueInfo.rcvQueueMu.Unlock()

		// Notify any blocked goroutines that they can attempt to
		// deliver endpoints again.
		e.acceptCond.Broadcast()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		// NOTE(review): this failure is counted under ReadErrors even though
		// it happens on the listen path - confirm that this is intended.
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.setEndpointState(StateListen)

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints. For that reason an existing non-zero capacity is kept.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
	}
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	e.workerRunning = true
	go e.protocolListenLoop( // S/R-SAFE: drained on save.
		seqnum.Size(e.receiveBufferAvailable()))
	return nil
}
  2552  
  2553  // startAcceptedLoop sets up required state and starts a goroutine with the
  2554  // main loop for accepted connections.
  2555  // +checklocksrelease:e.mu
  2556  func (e *endpoint) startAcceptedLoop() {
  2557  	e.workerRunning = true
  2558  	e.mu.Unlock()
  2559  	wakerInitDone := make(chan struct{})
  2560  	go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
  2561  	<-wakerInitDone
  2562  }
  2563  
  2564  // Accept returns a new endpoint if a peer has established a connection
  2565  // to an endpoint previously set to listen mode.
  2566  //
  2567  // addr if not-nil will contain the peer address of the returned endpoint.
  2568  func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
  2569  	e.LockUser()
  2570  	defer e.UnlockUser()
  2571  
  2572  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2573  	rcvClosed := e.rcvQueueInfo.RcvClosed
  2574  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2575  	// Endpoint must be in listen state before it can accept connections.
  2576  	if rcvClosed || e.EndpointState() != StateListen {
  2577  		return nil, nil, &tcpip.ErrInvalidEndpointState{}
  2578  	}
  2579  
  2580  	// Get the new accepted endpoint.
  2581  	var n *endpoint
  2582  	e.acceptMu.Lock()
  2583  	if element := e.acceptQueue.endpoints.Front(); element != nil {
  2584  		n = e.acceptQueue.endpoints.Remove(element).(*endpoint)
  2585  	}
  2586  	e.acceptMu.Unlock()
  2587  	if n == nil {
  2588  		return nil, nil, &tcpip.ErrWouldBlock{}
  2589  	}
  2590  	e.acceptCond.Signal()
  2591  	if peerAddr != nil {
  2592  		*peerAddr = n.getRemoteAddress()
  2593  	}
  2594  	return n, n.waiterQueue, nil
  2595  }
  2596  
  2597  // Bind binds the endpoint to a specific local port and optionally address.
  2598  func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
  2599  	e.LockUser()
  2600  	defer e.UnlockUser()
  2601  
  2602  	return e.bindLocked(addr)
  2603  }
  2604  
// bindLocked binds the endpoint to addr. It fails with ErrAlreadyBound unless
// the endpoint is in StateInitial, validates a non-empty address against the
// stack's local addresses, and reserves a port through the stack. On success
// it records the bound NIC, port flags, bind-to-device setting, effective
// network protocols and local port on the endpoint and moves it to
// StateBound.
//
// Bind and listen call this method with the endpoint locked via LockUser.
func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	// Remember the address as supplied by the caller; addr itself is
	// replaced below by the value returned from checkV4MappedLocked.
	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if len(addr.Addr) != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.Rand(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			// The conflict is reported as "port not usable" rather than
			// as an error (presumably so that ReservePort can move on to
			// another candidate port - confirm against ReservePort).
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}
  2687  
  2688  // GetLocalAddress returns the address to which the endpoint is bound.
  2689  func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
  2690  	e.LockUser()
  2691  	defer e.UnlockUser()
  2692  
  2693  	return tcpip.FullAddress{
  2694  		Addr: e.TransportEndpointInfo.ID.LocalAddress,
  2695  		Port: e.TransportEndpointInfo.ID.LocalPort,
  2696  		NIC:  e.boundNICID,
  2697  	}, nil
  2698  }
  2699  
  2700  // GetRemoteAddress returns the address to which the endpoint is connected.
  2701  func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
  2702  	e.LockUser()
  2703  	defer e.UnlockUser()
  2704  
  2705  	if !e.EndpointState().connected() {
  2706  		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
  2707  	}
  2708  
  2709  	return e.getRemoteAddress(), nil
  2710  }
  2711  
  2712  func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
  2713  	return tcpip.FullAddress{
  2714  		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
  2715  		Port: e.TransportEndpointInfo.ID.RemotePort,
  2716  		NIC:  e.boundNICID,
  2717  	}
  2718  }
  2719  
// HandlePacket is intentionally a no-op for TCP endpoints.
func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore: inbound packets first land
	// at the Dispatcher, which then either delivers them using the worker
	// goroutine or invokes the TCP processing inline, based on the state of
	// the endpoint.
}
  2726  
  2727  func (e *endpoint) enqueueSegment(s *segment) bool {
  2728  	// Send packet to worker goroutine.
  2729  	if !e.segmentQueue.enqueue(s) {
  2730  		// The queue is full, so we drop the segment.
  2731  		e.stack.Stats().DroppedPackets.Increment()
  2732  		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
  2733  		return false
  2734  	}
  2735  	return true
  2736  }
  2737  
  2738  func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
  2739  	// Update last error first.
  2740  	e.lastErrorMu.Lock()
  2741  	e.lastError = err
  2742  	e.lastErrorMu.Unlock()
  2743  
  2744  	// Update the error queue if IP_RECVERR is enabled.
  2745  	if e.SocketOptions().GetRecvError() {
  2746  		e.SocketOptions().QueueErr(&tcpip.SockError{
  2747  			Err:   err,
  2748  			Cause: transErr,
  2749  			// Linux passes the payload with the TCP header. We don't know if the TCP
  2750  			// header even exists, it may not for fragmented packets.
  2751  			Payload: pkt.Data().AsRange().ToOwnedView(),
  2752  			Dst: tcpip.FullAddress{
  2753  				NIC:  pkt.NICID,
  2754  				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
  2755  				Port: e.TransportEndpointInfo.ID.RemotePort,
  2756  			},
  2757  			Offender: tcpip.FullAddress{
  2758  				NIC:  pkt.NICID,
  2759  				Addr: e.TransportEndpointInfo.ID.LocalAddress,
  2760  				Port: e.TransportEndpointInfo.ID.LocalPort,
  2761  			},
  2762  			NetProto: pkt.NetworkProtocolNumber,
  2763  		})
  2764  	}
  2765  
  2766  	// Notify of the error.
  2767  	e.notifyProtocolGoroutine(notifyError)
  2768  }
  2769  
  2770  // HandleError implements stack.TransportEndpoint.
  2771  func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) {
  2772  	handlePacketTooBig := func(mtu uint32) {
  2773  		e.sndQueueInfo.sndQueueMu.Lock()
  2774  		e.sndQueueInfo.PacketTooBigCount++
  2775  		if v := int(mtu); v < e.sndQueueInfo.SndMTU {
  2776  			e.sndQueueInfo.SndMTU = v
  2777  		}
  2778  		e.sndQueueInfo.sndQueueMu.Unlock()
  2779  		e.notifyProtocolGoroutine(notifyMTUChanged)
  2780  	}
  2781  
  2782  	// TODO(gvisor.dev/issues/5270): Handle all transport errors.
  2783  	switch transErr.Kind() {
  2784  	case stack.PacketTooBigTransportError:
  2785  		handlePacketTooBig(transErr.Info())
  2786  	case stack.DestinationHostUnreachableTransportError:
  2787  		e.onICMPError(&tcpip.ErrNoRoute{}, transErr, pkt)
  2788  	case stack.DestinationNetworkUnreachableTransportError:
  2789  		e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt)
  2790  	}
  2791  }
  2792  
  2793  // updateSndBufferUsage is called by the protocol goroutine when room opens up
  2794  // in the send buffer. The number of newly available bytes is v.
  2795  func (e *endpoint) updateSndBufferUsage(v int) {
  2796  	sendBufferSize := e.getSendBufferSize()
  2797  	e.sndQueueInfo.sndQueueMu.Lock()
  2798  	notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1
  2799  	e.sndQueueInfo.SndBufUsed -= v
  2800  
  2801  	// Get the new send buffer size with auto tuning, but do not set it
  2802  	// unless we decide to notify the writers.
  2803  	newSndBufSz := e.computeTCPSendBufferSize()
  2804  
  2805  	// We only notify when there is half the sendBufferSize available after
  2806  	// a full buffer event occurs. This ensures that we don't wake up
  2807  	// writers to queue just 1-2 segments and go back to sleep.
  2808  	notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1
  2809  	e.sndQueueInfo.sndQueueMu.Unlock()
  2810  
  2811  	if notify {
  2812  		// Set the new send buffer size calculated from auto tuning.
  2813  		e.ops.SetSendBufferSize(newSndBufSz, false /* notify */)
  2814  		e.waiterQueue.Notify(waiter.WritableEvents)
  2815  	}
  2816  }
  2817  
  2818  // readyToRead is called by the protocol goroutine when a new segment is ready
  2819  // to be read, or when the connection is closed for receiving (in which case
  2820  // s will be nil).
  2821  func (e *endpoint) readyToRead(s *segment) {
  2822  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2823  	if s != nil {
  2824  		e.rcvQueueInfo.RcvBufUsed += s.payloadSize()
  2825  		s.incRef()
  2826  		e.rcvQueueInfo.rcvQueue.PushBack(s)
  2827  	} else {
  2828  		e.rcvQueueInfo.RcvClosed = true
  2829  	}
  2830  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2831  	e.waiterQueue.Notify(waiter.ReadableEvents)
  2832  }
  2833  
  2834  // receiveBufferAvailableLocked calculates how many bytes are still available
  2835  // in the receive buffer.
  2836  // rcvQueueMu must be held when this function is called.
  2837  func (e *endpoint) receiveBufferAvailableLocked(rcvBufSize int) int {
  2838  	// We may use more bytes than the buffer size when the receive buffer
  2839  	// shrinks.
  2840  	memUsed := e.receiveMemUsed()
  2841  	if memUsed >= rcvBufSize {
  2842  		return 0
  2843  	}
  2844  
  2845  	return rcvBufSize - memUsed
  2846  }
  2847  
  2848  // receiveBufferAvailable calculates how many bytes are still available in the
  2849  // receive buffer based on the actual memory used by all segments held in
  2850  // receive buffer/pending and segment queue.
  2851  func (e *endpoint) receiveBufferAvailable() int {
  2852  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2853  	available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize()))
  2854  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2855  	return available
  2856  }
  2857  
  2858  // receiveBufferUsed returns the amount of in-use receive buffer.
  2859  func (e *endpoint) receiveBufferUsed() int {
  2860  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2861  	used := e.rcvQueueInfo.RcvBufUsed
  2862  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2863  	return used
  2864  }
  2865  
  2866  // receiveMemUsed returns the total memory in use by segments held by this
  2867  // endpoint.
  2868  func (e *endpoint) receiveMemUsed() int {
  2869  	return int(atomic.LoadInt32(&e.rcvMemUsed))
  2870  }
  2871  
  2872  // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
  2873  func (e *endpoint) updateReceiveMemUsed(delta int) {
  2874  	atomic.AddInt32(&e.rcvMemUsed, int32(delta))
  2875  }
  2876  
  2877  // maxReceiveBufferSize returns the stack wide maximum receive buffer size for
  2878  // an endpoint.
  2879  func (e *endpoint) maxReceiveBufferSize() int {
  2880  	var rs tcpip.TCPReceiveBufferSizeRangeOption
  2881  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
  2882  		// As a fallback return the hardcoded max buffer size.
  2883  		return MaxBufferSize
  2884  	}
  2885  	return rs.Max
  2886  }
  2887  
  2888  // rcvWndScaleForHandshake computes the receive window scale to offer to the
  2889  // peer when window scaling is enabled (true by default). If auto-tuning is
  2890  // disabled then the window scaling factor is based on the size of the
  2891  // receiveBuffer otherwise we use the max permissible receive buffer size to
  2892  // compute the scale.
  2893  func (e *endpoint) rcvWndScaleForHandshake() int {
  2894  	bufSizeForScale := e.ops.GetReceiveBufferSize()
  2895  
  2896  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2897  	autoTuningDisabled := e.rcvQueueInfo.RcvAutoParams.Disabled
  2898  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2899  	if autoTuningDisabled {
  2900  		return FindWndScale(seqnum.Size(bufSizeForScale))
  2901  	}
  2902  
  2903  	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
  2904  }
  2905  
  2906  // updateRecentTimestamp updates the recent timestamp using the algorithm
  2907  // described in https://tools.ietf.org/html/rfc7323#section-4.3
  2908  func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
  2909  	if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
  2910  		e.setRecentTimestamp(tsVal)
  2911  	}
  2912  }
  2913  
  2914  // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
  2915  // the SYN options indicate that timestamp option was negotiated. It also
  2916  // initializes the recentTS with the value provided in synOpts.TSval.
  2917  func (e *endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) {
  2918  	if synOpts.TS {
  2919  		e.SendTSOk = true
  2920  		e.setRecentTimestamp(synOpts.TSVal)
  2921  	}
  2922  }
  2923  
  2924  func (e *endpoint) tsVal(now tcpip.MonotonicTime) uint32 {
  2925  	return e.TSOffset.TSVal(now)
  2926  }
  2927  
  2928  func (e *endpoint) tsValNow() uint32 {
  2929  	return e.tsVal(e.stack.Clock().NowMonotonic())
  2930  }
  2931  
  2932  func (e *endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration {
  2933  	return e.TSOffset.Elapsed(now, tsEcr)
  2934  }
  2935  
  2936  // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
  2937  // if the SYN options indicate that the SACK option was negotiated and the TCP
  2938  // stack is configured to enable TCP SACK option.
  2939  func (e *endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) {
  2940  	var v tcpip.TCPSACKEnabled
  2941  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
  2942  		// Stack doesn't support SACK. So just return.
  2943  		return
  2944  	}
  2945  	if bool(v) && synOpts.SACKPermitted {
  2946  		e.SACKPermitted = true
  2947  		e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)
  2948  	}
  2949  }
  2950  
  2951  // maxOptionSize return the maximum size of TCP options.
  2952  func (e *endpoint) maxOptionSize() (size int) {
  2953  	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
  2954  	options := e.makeOptions(maxSackBlocks[:])
  2955  	size = len(options)
  2956  	putOptions(options)
  2957  
  2958  	return size
  2959  }
  2960  
  2961  // completeStateLocked makes a full copy of the endpoint and returns it. This is
  2962  // used before invoking the probe.
  2963  //
  2964  // Precondition: e.mu must be held.
  2965  func (e *endpoint) completeStateLocked() stack.TCPEndpointState {
  2966  	s := stack.TCPEndpointState{
  2967  		TCPEndpointStateInner: e.TCPEndpointStateInner,
  2968  		ID:                    stack.TCPEndpointID(e.TransportEndpointInfo.ID),
  2969  		SegTime:               e.stack.Clock().NowMonotonic(),
  2970  		Receiver:              e.rcv.TCPReceiverState,
  2971  		Sender:                e.snd.TCPSenderState,
  2972  	}
  2973  
  2974  	sndBufSize := e.getSendBufferSize()
  2975  	// Copy the send buffer atomically.
  2976  	e.sndQueueInfo.sndQueueMu.Lock()
  2977  	s.SndBufState = e.sndQueueInfo.TCPSndBufState
  2978  	s.SndBufState.SndBufSize = sndBufSize
  2979  	e.sndQueueInfo.sndQueueMu.Unlock()
  2980  
  2981  	// Copy the receive buffer atomically.
  2982  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2983  	s.RcvBufState = e.rcvQueueInfo.TCPRcvBufState
  2984  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2985  
  2986  	// Copy the endpoint TCP Option state.
  2987  	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
  2988  	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
  2989  	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()
  2990  
  2991  	e.snd.rtt.Lock()
  2992  	s.Sender.RTTState = e.snd.rtt.TCPRTTState
  2993  	e.snd.rtt.Unlock()
  2994  
  2995  	if cubic, ok := e.snd.cc.(*cubicState); ok {
  2996  		s.Sender.Cubic = cubic.TCPCubicState
  2997  		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
  2998  	}
  2999  
  3000  	s.Sender.RACKState = e.snd.rc.TCPRACKState
  3001  	s.Sender.RetransmitTS = e.snd.retransmitTS
  3002  	s.Sender.SpuriousRecovery = e.snd.spuriousRecovery
  3003  	return s
  3004  }
  3005  
  3006  func (e *endpoint) initHardwareGSO() {
  3007  	switch e.route.NetProto() {
  3008  	case header.IPv4ProtocolNumber:
  3009  		e.gso.Type = stack.GSOTCPv4
  3010  		e.gso.L3HdrLen = header.IPv4MinimumSize
  3011  	case header.IPv6ProtocolNumber:
  3012  		e.gso.Type = stack.GSOTCPv6
  3013  		e.gso.L3HdrLen = header.IPv6MinimumSize
  3014  	default:
  3015  		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
  3016  	}
  3017  	e.gso.NeedsCsum = true
  3018  	e.gso.CsumOffset = header.TCPChecksumOffset
  3019  	e.gso.MaxSize = e.route.GSOMaxSize()
  3020  }
  3021  
  3022  func (e *endpoint) initGSO() {
  3023  	if e.route.HasHardwareGSOCapability() {
  3024  		e.initHardwareGSO()
  3025  	} else if e.route.HasSoftwareGSOCapability() {
  3026  		e.gso = stack.GSO{
  3027  			MaxSize:   e.route.GSOMaxSize(),
  3028  			Type:      stack.GSOSW,
  3029  			NeedsCsum: false,
  3030  		}
  3031  	}
  3032  }
  3033  
  3034  // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
  3035  // state for diagnostics.
  3036  func (e *endpoint) State() uint32 {
  3037  	return uint32(e.EndpointState())
  3038  }
  3039  
  3040  // Info returns a copy of the endpoint info.
  3041  func (e *endpoint) Info() tcpip.EndpointInfo {
  3042  	e.LockUser()
  3043  	// Make a copy of the endpoint info.
  3044  	ret := e.TransportEndpointInfo
  3045  	e.UnlockUser()
  3046  	return &ret
  3047  }
  3048  
  3049  // Stats returns a pointer to the endpoint stats.
  3050  func (e *endpoint) Stats() tcpip.EndpointStats {
  3051  	return &e.stats
  3052  }
  3053  
  3054  // Wait implements stack.TransportEndpoint.Wait.
  3055  func (e *endpoint) Wait() {
  3056  	waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp)
  3057  	e.waiterQueue.EventRegister(&waitEntry)
  3058  	defer e.waiterQueue.EventUnregister(&waitEntry)
  3059  	for {
  3060  		e.LockUser()
  3061  		running := e.workerRunning
  3062  		e.UnlockUser()
  3063  		if !running {
  3064  			break
  3065  		}
  3066  		<-notifyCh
  3067  	}
  3068  }
  3069  
  3070  // SocketOptions implements tcpip.Endpoint.SocketOptions.
  3071  func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
  3072  	return &e.ops
  3073  }
  3074  
  3075  // GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
  3076  func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption {
  3077  	var ss tcpip.TCPSendBufferSizeRangeOption
  3078  	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
  3079  		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
  3080  	}
  3081  
  3082  	return tcpip.SendBufferSizeOption{
  3083  		Min:     ss.Min,
  3084  		Default: ss.Default,
  3085  		Max:     ss.Max,
  3086  	}
  3087  }
  3088  
  3089  // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
  3090  func (e *endpoint) allowOutOfWindowAck() bool {
  3091  	now := e.stack.Clock().NowMonotonic()
  3092  
  3093  	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
  3094  		var limit stack.TCPInvalidRateLimitOption
  3095  		if err := e.stack.Option(&limit); err != nil {
  3096  			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
  3097  		}
  3098  		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
  3099  			return false
  3100  		}
  3101  	}
  3102  
  3103  	e.lastOutOfWindowAckTime = now
  3104  	return true
  3105  }
  3106  
  3107  // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP.
  3108  func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
  3109  	var ss tcpip.TCPReceiveBufferSizeRangeOption
  3110  	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
  3111  		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
  3112  	}
  3113  
  3114  	return tcpip.ReceiveBufferSizeOption{
  3115  		Min:     ss.Min,
  3116  		Default: ss.Default,
  3117  		Max:     ss.Max,
  3118  	}
  3119  }
  3120  
  3121  // computeTCPSendBufferSize implements auto tuning of send buffer size and
  3122  // returns the new send buffer size.
  3123  func (e *endpoint) computeTCPSendBufferSize() int64 {
  3124  	curSndBufSz := int64(e.getSendBufferSize())
  3125  
  3126  	// Auto tuning is disabled when the user explicitly sets the send
  3127  	// buffer size with SO_SNDBUF option.
  3128  	if disabled := atomic.LoadUint32(&e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled); disabled == 1 {
  3129  		return curSndBufSz
  3130  	}
  3131  
  3132  	const packetOverheadFactor = 2
  3133  	curMSS := e.snd.MaxPayloadSize
  3134  	numSeg := InitialCwnd
  3135  	if numSeg < e.snd.SndCwnd {
  3136  		numSeg = e.snd.SndCwnd
  3137  	}
  3138  
  3139  	// SndCwnd indicates the number of segments that can be sent. This means
  3140  	// that the sender can send upto #SndCwnd segments and the send buffer
  3141  	// size should be set to SndCwnd*MSS to accommodate sending of all the
  3142  	// segments.
  3143  	newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor)
  3144  	if newSndBufSz < curSndBufSz {
  3145  		return curSndBufSz
  3146  	}
  3147  	if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz {
  3148  		newSndBufSz = int64(ss.Max)
  3149  	}
  3150  
  3151  	return newSndBufSz
  3152  }