github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/endpoint.go

github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"container/list"
    19  	"encoding/binary"
    20  	"fmt"
    21  	"io"
    22  	"math"
    23  	"math/rand"
    24  	"runtime"
    25  	"strings"
    26  	"sync/atomic"
    27  	"time"
    28  
    29  	"github.com/SagerNet/gvisor/pkg/sleep"
    30  	"github.com/SagerNet/gvisor/pkg/sync"
    31  	"github.com/SagerNet/gvisor/pkg/tcpip"
    32  	"github.com/SagerNet/gvisor/pkg/tcpip/hash/jenkins"
    33  	"github.com/SagerNet/gvisor/pkg/tcpip/header"
    34  	"github.com/SagerNet/gvisor/pkg/tcpip/ports"
    35  	"github.com/SagerNet/gvisor/pkg/tcpip/seqnum"
    36  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    37  	"github.com/SagerNet/gvisor/pkg/waiter"
    38  )
    39  
// EndpointState represents the state of a TCP endpoint. It is a distinct
// named type defined on top of tcpip.EndpointState so that the TCP-specific
// methods declared in this file can be attached to it.
type EndpointState tcpip.EndpointState
    42  
// Endpoint states. Note that they are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
const (
	// The value 0 is discarded, so the first named state below has the
	// value 1.
	_ EndpointState = iota
	// TCP protocol states in sync with the definitions in
	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing

	// Endpoint states internal to netstack.
	StateInitial
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError
)
    68  
// Receive-window and socket-buffer sizing constants.
const (
	// rcvAdvWndScale is used to split the available socket buffer into
	// application buffer and the window to be advertised to the peer. This is
	// currently hard coded to split the available space equally.
	rcvAdvWndScale = 1

	// SegOverheadFactor is used to multiply the value provided by the
	// user on a SetSockOpt for setting the socket send/receive buffer sizes.
	SegOverheadFactor = 2
)
    79  
    80  // connected returns true when s is one of the states representing an
    81  // endpoint connected to a peer.
    82  func (s EndpointState) connected() bool {
    83  	switch s {
    84  	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
    85  		return true
    86  	default:
    87  		return false
    88  	}
    89  }
    90  
    91  // connecting returns true when s is one of the states representing a
    92  // connection in progress, but not yet fully established.
    93  func (s EndpointState) connecting() bool {
    94  	switch s {
    95  	case StateConnecting, StateSynSent, StateSynRecv:
    96  		return true
    97  	default:
    98  		return false
    99  	}
   100  }
   101  
   102  // internal returns true when the state is netstack internal.
   103  func (s EndpointState) internal() bool {
   104  	switch s {
   105  	case StateInitial, StateBound, StateConnecting, StateError:
   106  		return true
   107  	default:
   108  		return false
   109  	}
   110  }
   111  
   112  // handshake returns true when s is one of the states representing an endpoint
   113  // in the middle of a TCP handshake.
   114  func (s EndpointState) handshake() bool {
   115  	switch s {
   116  	case StateSynSent, StateSynRecv:
   117  		return true
   118  	default:
   119  		return false
   120  	}
   121  }
   122  
   123  // closed returns true when s is one of the states an endpoint transitions to
   124  // when closed or when it encounters an error. This is distinct from a newly
   125  // initialized endpoint that was never connected.
   126  func (s EndpointState) closed() bool {
   127  	switch s {
   128  	case StateClose, StateError:
   129  		return true
   130  	default:
   131  		return false
   132  	}
   133  }
   134  
   135  // String implements fmt.Stringer.String.
   136  func (s EndpointState) String() string {
   137  	switch s {
   138  	case StateInitial:
   139  		return "INITIAL"
   140  	case StateBound:
   141  		return "BOUND"
   142  	case StateConnecting:
   143  		return "CONNECTING"
   144  	case StateError:
   145  		return "ERROR"
   146  	case StateEstablished:
   147  		return "ESTABLISHED"
   148  	case StateSynSent:
   149  		return "SYN-SENT"
   150  	case StateSynRecv:
   151  		return "SYN-RCVD"
   152  	case StateFinWait1:
   153  		return "FIN-WAIT1"
   154  	case StateFinWait2:
   155  		return "FIN-WAIT2"
   156  	case StateTimeWait:
   157  		return "TIME-WAIT"
   158  	case StateClose:
   159  		return "CLOSED"
   160  	case StateCloseWait:
   161  		return "CLOSE-WAIT"
   162  	case StateLastAck:
   163  		return "LAST-ACK"
   164  	case StateListen:
   165  		return "LISTEN"
   166  	case StateClosing:
   167  		return "CLOSING"
   168  	default:
   169  		panic("unreachable")
   170  	}
   171  }
   172  
// Reasons for notifying the protocol goroutine. Each constant is a distinct
// single bit (1 << iota), so several reasons can be combined in one bitmask.
const (
	notifyNonZeroReceiveWindow = 1 << iota
	notifyClose
	notifyMTUChanged
	notifyDrain
	notifyReset
	notifyResetByPeer
	// notifyAbort is a request for an expedited teardown.
	notifyAbort
	notifyKeepaliveChanged
	notifyMSSChanged
	// notifyTickleWorker is used to tickle the protocol main loop during a
	// restore after we update the endpoint state to the correct one. This
	// ensures the loop terminates if the final state of the endpoint is
	// say TIME_WAIT.
	notifyTickleWorker
	notifyError
)
   192  
// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks holds the SACK blocks tracked for this endpoint. At most
	// MaxSACKBlocks blocks are tracked per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}
   205  
// ReceiveErrors collects segment receive errors within the transport layer.
// It embeds tcpip.ReceiveErrors and adds the TCP-specific counters below.
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}
   234  
// SendErrors collects segment send errors within the transport layer. It
// embeds tcpip.SendErrors and adds the TCP-specific counters below.
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments that failed to
	// be sent to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs that failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}
   257  
// Stats holds statistics about the endpoint. It groups plain segment counters
// with the per-category error collections declared in this file and in the
// tcpip package.
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}
   284  
// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface. It has no behavior of its own.
func (*Stats) IsEndpointStats() {}
   288  
// sndQueueInfo implements a send queue. It bundles the embedded
// stack.TCPSndBufState with the mutex declared alongside it and the waker
// used to signal the protocol goroutine.
//
// +stateify savable
type sndQueueInfo struct {
	// sndQueueMu is not saved (state:"nosave").
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}
   300  
// rcvQueueInfo contains the endpoint's rcvQueue and associated metadata.
//
// +stateify savable
type rcvQueueInfo struct {
	// rcvQueueMu is not saved (state:"nosave").
	rcvQueueMu sync.Mutex `state:"nosave"`
	stack.TCPRcvBufState

	// rcvQueue is the queue for ready-for-delivery segments. This struct's
	// mutex must be held in order to append segments to the list.
	rcvQueue segmentList `state:"wait"`
}
   312  
// accepted pairs a list of endpoints with a capacity value.
//
// NOTE(review): cap presumably bounds how many endpoints may be queued in
// endpoints at once -- confirm against the code that appends to the list.
//
// +stateify savable
type accepted struct {
	// NB: this could be an endpointList, but ilist only permits endpoints to
	// belong to one list at a time, and endpoints are already stored in the
	// dispatcher's list.
	endpoints list.List `state:".([]*endpoint)"`
	cap       int
}
   321  
// endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint must be held for all operations except
// in e.Readiness where acquiring it will result in a deadlock in epoll
// implementation.
//
// The following mutexes can be acquired independent of e.mu but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> Protects accepted.
// e.rcvQueueMu -> Protects e.rcvQueue and associated fields.
// e.sndQueueMu -> Protects the e.sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint.  The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a short while.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type endpoint struct {
	stack.TCPEndpointStateInner
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue endpoints for processing to a given
	// tcp processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	endpointEntry `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is StateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	// rcvReadMu synchronizes calls to Read.
	//
	// mu and rcvQueueMu are temporarily released during data copying. rcvReadMu
	// must be held during each read to ensure atomicity, so that multiple reads
	// do not interleave.
	//
	// rcvReadMu should be held before holding mu.
	rcvReadMu sync.Mutex `state:"nosave"`

	// rcvQueueInfo holds the implementation of the endpoint's receive buffer.
	// The data within rcvQueueInfo should only be accessed while rcvReadMu, mu,
	// and rcvQueueMu are held, in that stated order. While processing the segment
	// range, you can determine a range and then temporarily release mu and
	// rcvQueueMu, which allows new segments to be appended to the queue while
	// processing.
	rcvQueueInfo rcvQueueInfo

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	//
	// rcvMemUsed must be accessed atomically.
	rcvMemUsed int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	mu          sync.CrossGoroutineMutex `state:"nosave"`
	ownedByUser uint32

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state uint32 `state:".(EndpointState)"`

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to its correct
	// state.
	origEndpointState uint32 `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ttl               uint8
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	h *handshake `state:"nosave"`

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (whichever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// workerRunning specifies if a worker goroutine is running.
	workerRunning bool

	// workerCleanup specifies if the worker goroutine must perform cleanup
	// before exiting. This can only be set to true when workerRunning is
	// also true, and they're both protected by the mutex.
	workerCleanup bool

	// recentTSTime is the monotonic time at which we last updated
	// TCPEndpointStateInner.RecentTS.
	recentTSTime tcpip.MonotonicTime

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss detection algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// synRcvdCount is the number of connections for this endpoint that are
	// in SYN-RCVD state; this is only accessed atomically.
	synRcvdCount int32

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// sndQueueInfo contains the implementation of the endpoint's send queue.
	sndQueueInfo sndQueueInfo

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// newSegmentWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and handle new segments queued to it.
	newSegmentWaker sleep.Waker `state:"manual"`

	// notificationWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and check for notifications.
	notificationWaker sleep.Waker `state:"manual"`

	// notifyFlags is a bitmask of flags used to indicate to the protocol
	// goroutine what it was notified; this is only accessed atomically.
	notifyFlags uint32 `state:"nosave"`

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepalive.idle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibly aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// pendingAccepted tracks connections queued to be accepted. It is used to
	// ensure such queued connections are terminated before the accepted queue is
	// marked closed (by setting its capacity to zero).
	pendingAccepted sync.WaitGroup `state:"nosave"`

	// acceptMu protects accepted.
	acceptMu sync.Mutex `state:"nosave"`

	// acceptCond is a condition variable that can be used to block on when
	// accepted is full and an endpoint is ready to be delivered.
	//
	// We use this condition variable to block/unblock goroutines which
	// tried to deliver an endpoint but couldn't because accept backlog was
	// full ( See: endpoint.deliverAccepted ).
	acceptCond *sync.Cond `state:"nosave"`

	// accepted is used by a listening endpoint protocol goroutine to
	// send newly accepted connections to the endpoint so that they can be
	// read by Accept() calls.
	accepted accepted

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso stack.GSO

	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
	stats Stats `state:"nosave"`

	// tcpLingerTimeout is the maximum amount of time a socket stays in
	// TIME_WAIT state before being marked closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called close on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime tcpip.MonotonicTime
}
   636  
// UniqueID implements stack.TransportEndpoint.UniqueID. It returns the
// endpoint's uniqueID field.
func (e *endpoint) UniqueID() uint64 {
	return e.uniqueID
}
   641  
   642  // calculateAdvertisedMSS calculates the MSS to advertise.
   643  //
   644  // If userMSS is non-zero and is not greater than the maximum possible MSS for
   645  // r, it will be used; otherwise, the maximum possible MSS will be used.
   646  func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
   647  	// The maximum possible MSS is dependent on the route.
   648  	// TODO(b/143359391): Respect TCP Min and Max size.
   649  	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)
   650  
   651  	if userMSS != 0 && userMSS < maxMSS {
   652  		return userMSS
   653  	}
   654  
   655  	return maxMSS
   656  }
   657  
// LockUser tries to lock e.mu and if it fails it will check if the lock is held
// by another syscall goroutine. If yes, then it will go to sleep waiting for
// the lock to be released, if not then it will spin till it acquires the lock
// or another syscall goroutine acquires it in which case it will go to sleep
// as described above.
//
// The assumption behind spinning here is that background packet processing
// should not be holding the lock for long and spinning reduces latency as we
// avoid an expensive sleep/wakeup of the syscall goroutine.
//
// On return e.mu is held and e.ownedByUser is set to 1.
// +checklocksacquire:e.mu
func (e *endpoint) LockUser() {
	for {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.mu.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if atomic.LoadUint32(&e.ownedByUser) == 1 {
				e.mu.Lock()
				// We now hold the lock; record user ownership.
				atomic.StoreUint32(&e.ownedByUser, 1)
				return
			}
			// Spin but yield the processor since the lower half
			// should yield the lock soon.
			runtime.Gosched()
			continue
		}
		// TryLock succeeded; record user ownership.
		atomic.StoreUint32(&e.ownedByUser, 1)
		return // +checklocksforce
	}
}
   690  
// UnlockUser will check if there are any segments already queued for processing
// and process any such segments before unlocking e.mu. This is required because
// when packets arrive and the endpoint lock is already held then such packets
// are queued up to be processed. If the lock is held by the endpoint goroutine
// then it will process these packets but if the lock is instead held by the
// syscall goroutine then we can have the syscall goroutine process the backlog
// before unlocking.
//
// This avoids an unnecessary wakeup of the endpoint protocol goroutine for the
// endpoint. It's also required eventually when we get rid of the endpoint
// protocol goroutine altogether.
//
// It panics if e.ownedByUser is not 1 at the point the lock is released.
//
// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
// +checklocksrelease:e.mu
func (e *endpoint) UnlockUser() {
	// Lock segment queue before checking so that we avoid a race where
	// segments can be queued between the time we check if queue is empty
	// and actually unlock the endpoint mutex.
	for {
		e.segmentQueue.mu.Lock()
		if e.segmentQueue.emptyLocked() {
			// Nothing is queued: clear user ownership and release e.mu
			// while still holding the segment queue lock.
			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
				panic("e.UnlockUser() called without calling e.LockUser()")
			}
			e.mu.Unlock()
			e.segmentQueue.mu.Unlock()
			return
		}
		e.segmentQueue.mu.Unlock()

		switch e.EndpointState() {
		case StateEstablished:
			// Process the queued segments here, then loop to re-check the
			// queue. On error, tickle the protocol goroutine.
			if err := e.handleSegmentsLocked(true /* fastPath */); err != nil {
				e.notifyProtocolGoroutine(notifyTickleWorker)
			}
		default:
			// Since we are waking the endpoint goroutine here just unlock
			// and let it process the queued segments.
			e.newSegmentWaker.Assert()
			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
				panic("e.UnlockUser() called without calling e.LockUser()")
			}
			e.mu.Unlock()
			return
		}
	}
}
   738  
// StopWork halts packet processing by acquiring e.mu. Only to be used in
// tests.
// +checklocksacquire:e.mu
func (e *endpoint) StopWork() {
	e.mu.Lock()
}
   744  
// ResumeWork resumes packet processing by releasing e.mu. Only to be used in
// tests.
// +checklocksrelease:e.mu
func (e *endpoint) ResumeWork() {
	e.mu.Unlock()
}
   750  
   751  // setEndpointState updates the state of the endpoint to state atomically. This
   752  // method is unexported as the only place we should update the state is in this
   753  // package but we allow the state to be read freely without holding e.mu.
   754  //
   755  // Precondition: e.mu must be held to call this method.
   756  func (e *endpoint) setEndpointState(state EndpointState) {
   757  	oldstate := EndpointState(atomic.LoadUint32(&e.state))
   758  	switch state {
   759  	case StateEstablished:
   760  		e.stack.Stats().TCP.CurrentEstablished.Increment()
   761  		e.stack.Stats().TCP.CurrentConnected.Increment()
   762  	case StateError:
   763  		fallthrough
   764  	case StateClose:
   765  		if oldstate == StateCloseWait || oldstate == StateEstablished {
   766  			e.stack.Stats().TCP.EstablishedResets.Increment()
   767  		}
   768  		fallthrough
   769  	default:
   770  		if oldstate == StateEstablished {
   771  			e.stack.Stats().TCP.CurrentEstablished.Decrement()
   772  		}
   773  	}
   774  	atomic.StoreUint32(&e.state, uint32(state))
   775  }
   776  
   777  // EndpointState returns the current state of the endpoint.
   778  func (e *endpoint) EndpointState() EndpointState {
   779  	return EndpointState(atomic.LoadUint32(&e.state))
   780  }
   781  
   782  // setRecentTimestamp sets the recentTS field to the provided value.
   783  func (e *endpoint) setRecentTimestamp(recentTS uint32) {
   784  	e.RecentTS = recentTS
   785  	e.recentTSTime = e.stack.Clock().NowMonotonic()
   786  }
   787  
// recentTimestamp returns the value of the RecentTS field, i.e. the value
// most recently stored by setRecentTimestamp.
func (e *endpoint) recentTimestamp() uint32 {
	return e.RecentTS
}
   792  
// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// The embedded mutex, the timer and the waker carry the `state:"nosave"` tag.
// newEndpoint initializes idle, interval and count to 2 hours, 75 seconds and
// 9 respectively (labelled there as the Linux defaults) and binds timer to
// waker.
//
// NOTE(review): idle, interval and count presumably hold the idle time before
// the first probe, the spacing between probes and the maximum number of
// probes, with unacked counting probes that have not been acknowledged;
// confirm against the keepalive timer code.
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	idle       time.Duration
	interval   time.Duration
	count      int
	unacked    int
	timer      timer       `state:"nosave"`
	waker      sleep.Waker `state:"nosave"`
}
   806  
// newEndpoint allocates a TCP endpoint in StateInitial for stack s and network
// protocol netProto, using waiterQueue for event notification.
//
// The endpoint starts out with built-in defaults. Several of them are then
// replaced by the stack's TCP protocol options wherever the corresponding
// option lookup succeeds.
func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
	e := &endpoint{
		stack: s,
		TransportEndpointInfo: stack.TransportEndpointInfo{
			NetProto:   netProto,
			TransProto: header.TCPProtocolNumber,
		},
		sndQueueInfo: sndQueueInfo{
			TCPSndBufState: stack.TCPSndBufState{
				SndMTU: math.MaxInt32,
			},
		},
		waiterQueue: waiterQueue,
		state:       uint32(StateInitial),
		keepalive: keepalive{
			// Linux defaults.
			idle:     2 * time.Hour,
			interval: 75 * time.Second,
			count:    9,
		},
		uniqueID:      s.UniqueID(),
		txHash:        s.Rand().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
	}
	// Socket-option defaults. The buffer sizes set here may be overridden
	// just below by the stack-level defaults.
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)

	// Each block below reads one stack-level TCP option and applies it only
	// when the lookup returns no error; otherwise the default stays.
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		// The option expresses "enabled"; the endpoint stores "disabled".
		e.rcvQueueInfo.RcvAutoParams.Disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	// The option is read straight into the endpoint field; the returned
	// error is not checked here.
	s.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)

	// Only install a probe when the stack has one configured.
	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e
	e.TSOffset = timeStampOffset(e.stack.Rand())
	e.acceptCond = sync.NewCond(&e.acceptMu)
	e.keepalive.timer.init(e.stack.Clock(), &e.keepalive.waker)

	return e
}
   886  
   887  // Readiness returns the current readiness of the endpoint. For example, if
   888  // waiter.EventIn is set, the endpoint is immediately readable.
   889  func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
   890  	result := waiter.EventMask(0)
   891  
   892  	switch e.EndpointState() {
   893  	case StateInitial, StateBound:
   894  		// This prevents blocking of new sockets which are not
   895  		// connected when SO_LINGER is set.
   896  		result |= waiter.EventHUp
   897  
   898  	case StateConnecting, StateSynSent, StateSynRecv:
   899  		// Ready for nothing.
   900  
   901  	case StateClose, StateError, StateTimeWait:
   902  		// Ready for anything.
   903  		result = mask
   904  
   905  	case StateListen:
   906  		// Check if there's anything in the accepted queue.
   907  		if (mask & waiter.ReadableEvents) != 0 {
   908  			e.acceptMu.Lock()
   909  			if e.accepted.endpoints.Len() != 0 {
   910  				result |= waiter.ReadableEvents
   911  			}
   912  			e.acceptMu.Unlock()
   913  		}
   914  	}
   915  	if e.EndpointState().connected() {
   916  		// Determine if the endpoint is writable if requested.
   917  		if (mask & waiter.WritableEvents) != 0 {
   918  			e.sndQueueInfo.sndQueueMu.Lock()
   919  			sndBufSize := e.getSendBufferSize()
   920  			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
   921  				result |= waiter.WritableEvents
   922  			}
   923  			e.sndQueueInfo.sndQueueMu.Unlock()
   924  		}
   925  
   926  		// Determine if the endpoint is readable if requested.
   927  		if (mask & waiter.ReadableEvents) != 0 {
   928  			e.rcvQueueInfo.rcvQueueMu.Lock()
   929  			if e.rcvQueueInfo.RcvBufUsed > 0 || e.rcvQueueInfo.RcvClosed {
   930  				result |= waiter.ReadableEvents
   931  			}
   932  			e.rcvQueueInfo.rcvQueueMu.Unlock()
   933  		}
   934  	}
   935  
   936  	return result
   937  }
   938  
   939  func (e *endpoint) fetchNotifications() uint32 {
   940  	return atomic.SwapUint32(&e.notifyFlags, 0)
   941  }
   942  
   943  func (e *endpoint) notifyProtocolGoroutine(n uint32) {
   944  	for {
   945  		v := atomic.LoadUint32(&e.notifyFlags)
   946  		if v&n == n {
   947  			// The flags are already set.
   948  			return
   949  		}
   950  
   951  		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
   952  			if v == 0 {
   953  				// We are causing a transition from no flags to
   954  				// at least one flag set, so we must cause the
   955  				// protocol goroutine to wake up.
   956  				e.notificationWaker.Assert()
   957  			}
   958  			return
   959  		}
   960  	}
   961  }
   962  
   963  // Abort implements stack.TransportEndpoint.Abort.
   964  func (e *endpoint) Abort() {
   965  	// The abort notification is not processed synchronously, so no
   966  	// synchronization is needed.
   967  	//
   968  	// If the endpoint becomes connected after this check, we still close
   969  	// the endpoint. This worst case results in a slower abort.
   970  	//
   971  	// If the endpoint disconnected after the check, nothing needs to be
   972  	// done, so sending a notification which will potentially be ignored is
   973  	// fine.
   974  	//
   975  	// If the endpoint connecting finishes after the check, the endpoint
   976  	// is either in a connected state (where we would notifyAbort anyway),
   977  	// SYN-RECV (where we would also notifyAbort anyway), or in an error
   978  	// state where nothing is required and the notification can be safely
   979  	// ignored.
   980  	//
   981  	// Endpoints where a Close during connecting or SYN-RECV state would be
   982  	// problematic are set to state connecting before being registered (and
   983  	// thus possible to be Aborted). They are never available in initial
   984  	// state.
   985  	//
   986  	// Endpoints transitioning from initial to connecting state may be
   987  	// safely either closed or sent notifyAbort.
   988  	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
   989  		e.notifyProtocolGoroutine(notifyAbort)
   990  		return
   991  	}
   992  	e.Close()
   993  }
   994  
   995  // Close puts the endpoint in a closed state and frees all resources associated
   996  // with it. It must be called only once and with no other concurrent calls to
   997  // the endpoint.
   998  func (e *endpoint) Close() {
   999  	e.LockUser()
  1000  	defer e.UnlockUser()
  1001  	if e.closed {
  1002  		return
  1003  	}
  1004  
  1005  	linger := e.SocketOptions().GetLinger()
  1006  	if linger.Enabled && linger.Timeout == 0 {
  1007  		s := e.EndpointState()
  1008  		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
  1009  		if isResetState {
  1010  			// Close the endpoint without doing full shutdown and
  1011  			// send a RST.
  1012  			e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
  1013  			e.closeNoShutdownLocked()
  1014  
  1015  			// Wake up worker to close the endpoint.
  1016  			switch s {
  1017  			case StateSynRecv:
  1018  				e.notifyProtocolGoroutine(notifyClose)
  1019  			default:
  1020  				e.notifyProtocolGoroutine(notifyTickleWorker)
  1021  			}
  1022  			return
  1023  		}
  1024  	}
  1025  
  1026  	// Issue a shutdown so that the peer knows we won't send any more data
  1027  	// if we're connected, or stop accepting if we're listening.
  1028  	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
  1029  	e.closeNoShutdownLocked()
  1030  }
  1031  
  1032  // closeNoShutdown closes the endpoint without doing a full shutdown.
  1033  func (e *endpoint) closeNoShutdownLocked() {
  1034  	// For listening sockets, we always release ports inline so that they
  1035  	// are immediately available for reuse after Close() is called. If also
  1036  	// registered, we unregister as well otherwise the next user would fail
  1037  	// in Listen() when trying to register.
  1038  	if e.EndpointState() == StateListen && e.isPortReserved {
  1039  		if e.isRegistered {
  1040  			e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
  1041  			e.isRegistered = false
  1042  		}
  1043  
  1044  		portRes := ports.Reservation{
  1045  			Networks:     e.effectiveNetProtos,
  1046  			Transport:    ProtocolNumber,
  1047  			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  1048  			Port:         e.TransportEndpointInfo.ID.LocalPort,
  1049  			Flags:        e.boundPortFlags,
  1050  			BindToDevice: e.boundBindToDevice,
  1051  			Dest:         e.boundDest,
  1052  		}
  1053  		e.stack.ReleasePort(portRes)
  1054  		e.isPortReserved = false
  1055  		e.boundBindToDevice = 0
  1056  		e.boundPortFlags = ports.Flags{}
  1057  		e.boundDest = tcpip.FullAddress{}
  1058  	}
  1059  
  1060  	// Mark endpoint as closed.
  1061  	e.closed = true
  1062  
  1063  	switch e.EndpointState() {
  1064  	case StateClose, StateError:
  1065  		return
  1066  	}
  1067  
  1068  	eventMask := waiter.ReadableEvents | waiter.WritableEvents
  1069  	// Either perform the local cleanup or kick the worker to make sure it
  1070  	// knows it needs to cleanup.
  1071  	if e.workerRunning {
  1072  		e.workerCleanup = true
  1073  		tcpip.AddDanglingEndpoint(e)
  1074  		// Worker will remove the dangling endpoint when the endpoint
  1075  		// goroutine terminates.
  1076  		e.notifyProtocolGoroutine(notifyClose)
  1077  	} else {
  1078  		e.transitionToStateCloseLocked()
  1079  		// Notify that the endpoint is closed.
  1080  		eventMask |= waiter.EventHUp
  1081  	}
  1082  
  1083  	// The TCP closing state-machine would eventually notify EventHUp, but we
  1084  	// notify EventIn|EventOut immediately to unblock any blocked waiters.
  1085  	e.waiterQueue.Notify(eventMask)
  1086  }
  1087  
  1088  // closePendingAcceptableConnections closes all connections that have completed
  1089  // handshake but not yet been delivered to the application.
  1090  func (e *endpoint) closePendingAcceptableConnectionsLocked() {
  1091  	e.acceptMu.Lock()
  1092  	acceptedCopy := e.accepted
  1093  	e.accepted = accepted{}
  1094  	e.acceptMu.Unlock()
  1095  
  1096  	if acceptedCopy == (accepted{}) {
  1097  		return
  1098  	}
  1099  
  1100  	e.acceptCond.Broadcast()
  1101  
  1102  	// Reset all connections that are waiting to be accepted.
  1103  	for n := acceptedCopy.endpoints.Front(); n != nil; n = n.Next() {
  1104  		n.Value.(*endpoint).notifyProtocolGoroutine(notifyReset)
  1105  	}
  1106  	// Wait for reset of all endpoints that are still waiting to be delivered to
  1107  	// the now closed accepted.
  1108  	e.pendingAccepted.Wait()
  1109  }
  1110  
// cleanupLocked frees all resources associated with the endpoint. It is called
// after Close() is called and the worker goroutine (if any) is done with its
// work.
//
// In order, it resets connections still pending acceptance, cleans up the
// keepalive timer, unregisters the endpoint from the stack, releases its port
// reservation and route, and finally completes the stack-level cleanup and
// removes the endpoint from the dangling-endpoint set.
func (e *endpoint) cleanupLocked() {
	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	e.workerCleanup = false

	// Unregister first: the call still needs the bound flags and device,
	// which are cleared further down.
	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	// The binding details are cleared whether or not a port was reserved.
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}
  1152  
// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space. The window is space shifted right by rcvAdvWndScale
// bits.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}
  1158  
  1159  // initialReceiveWindow returns the initial receive window to advertise in the
  1160  // SYN/SYN-ACK.
  1161  func (e *endpoint) initialReceiveWindow() int {
  1162  	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
  1163  	if rcvWnd > math.MaxUint16 {
  1164  		rcvWnd = math.MaxUint16
  1165  	}
  1166  
  1167  	// Use the user supplied MSS, if available.
  1168  	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
  1169  	if rcvWnd > routeWnd {
  1170  		rcvWnd = routeWnd
  1171  	}
  1172  	rcvWndScale := e.rcvWndScaleForHandshake()
  1173  
  1174  	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
  1175  	// window offered in SYN won't be reduced due to the loss of precision if
  1176  	// window scaling is enabled after the handshake.
  1177  	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
  1178  
  1179  	// Ensure we can always accept at least 1 byte if the scale specified
  1180  	// was too high for the provided rcvWnd.
  1181  	if rcvWnd == 0 {
  1182  		rcvWnd = 1
  1183  	}
  1184  
  1185  	return rcvWnd
  1186  }
  1187  
// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
//
// It does nothing when receive buffer auto-tuning is disabled. While no RTT
// is known, or less than one RTT has elapsed since the last measurement, it
// only adds copied to the running total. Otherwise it compares the bytes
// copied during the elapsed period with the previous sample, grows the
// receive buffer if more was copied, and starts a new measurement period.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	// rcvQueueMu is released by hand on every return path below.
	e.rcvQueueInfo.rcvQueueMu.Lock()
	if e.rcvQueueInfo.RcvAutoParams.Disabled {
		e.rcvQueueInfo.rcvQueueMu.Unlock()
		return
	}
	now := e.stack.Clock().NowMonotonic()
	if rtt := e.rcvQueueInfo.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.rcvQueueInfo.RcvAutoParams.MeasureTime) < rtt {
		// Still inside the current measurement period: just accumulate.
		e.rcvQueueInfo.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueInfo.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.rcvQueueInfo.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		// NOTE(review): prevCopied is the divisor here; this assumes
		// PrevCopiedBytes is non-zero by the time RTT is set. Confirm
		// against where RcvAutoParams is initialized.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			// Compare the advertisable window before and after the resize;
			// if the increase crosses the ACK threshold upwards, ask the
			// protocol goroutine to announce the larger window.
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	// Start a new measurement period.
	e.rcvQueueInfo.RcvAutoParams.MeasureTime = now
	e.rcvQueueInfo.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueInfo.rcvQueueMu.Unlock()
}
  1258  
// SetOwner implements tcpip.Endpoint.SetOwner. It stores owner in the
// endpoint's owner field.
func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}
  1263  
  1264  // Preconditions: e.mu must be held to call this function.
  1265  func (e *endpoint) hardErrorLocked() tcpip.Error {
  1266  	err := e.hardError
  1267  	e.hardError = nil
  1268  	return err
  1269  }
  1270  
  1271  // Preconditions: e.mu must be held to call this function.
  1272  func (e *endpoint) lastErrorLocked() tcpip.Error {
  1273  	e.lastErrorMu.Lock()
  1274  	defer e.lastErrorMu.Unlock()
  1275  	err := e.lastError
  1276  	e.lastError = nil
  1277  	return err
  1278  }
  1279  
  1280  // LastError implements tcpip.Endpoint.LastError.
  1281  func (e *endpoint) LastError() tcpip.Error {
  1282  	e.LockUser()
  1283  	defer e.UnlockUser()
  1284  	if err := e.hardErrorLocked(); err != nil {
  1285  		return err
  1286  	}
  1287  	return e.lastErrorLocked()
  1288  }
  1289  
// LastErrorLocked reads and clears lastError with e.mu held.
// Only to be used in tests.
//
// It delegates to lastErrorLocked and does not look at the hard error.
func (e *endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}
  1295  
  1296  // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
  1297  func (e *endpoint) UpdateLastError(err tcpip.Error) {
  1298  	e.LockUser()
  1299  	e.lastErrorMu.Lock()
  1300  	e.lastError = err
  1301  	e.lastErrorMu.Unlock()
  1302  	e.UnlockUser()
  1303  }
  1304  
  1305  // Read implements tcpip.Endpoint.Read.
  1306  func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
  1307  	e.rcvReadMu.Lock()
  1308  	defer e.rcvReadMu.Unlock()
  1309  
  1310  	// N.B. Here we get a range of segments to be processed. It is safe to not
  1311  	// hold rcvQueueMu when processing, since we hold rcvReadMu to ensure only we
  1312  	// can remove segments from the list through commitRead().
  1313  	first, last, serr := e.startRead()
  1314  	if serr != nil {
  1315  		if _, ok := serr.(*tcpip.ErrClosedForReceive); ok {
  1316  			e.stats.ReadErrors.ReadClosed.Increment()
  1317  		}
  1318  		return tcpip.ReadResult{}, serr
  1319  	}
  1320  
  1321  	var err error
  1322  	done := 0
  1323  	s := first
  1324  	for s != nil {
  1325  		var n int
  1326  		n, err = s.data.ReadTo(dst, opts.Peek)
  1327  		// Book keeping first then error handling.
  1328  
  1329  		done += n
  1330  
  1331  		if opts.Peek {
  1332  			// For peek, we use the (first, last) range of segment returned from
  1333  			// startRead. We don't consume the receive buffer, so commitRead should
  1334  			// not be called.
  1335  			//
  1336  			// N.B. It is important to use `last` to determine the last segment, since
  1337  			// appending can happen while we process, and will lead to data race.
  1338  			if s == last {
  1339  				break
  1340  			}
  1341  			s = s.Next()
  1342  		} else {
  1343  			// N.B. commitRead() conveniently returns the next segment to read, after
  1344  			// removing the data/segment that is read.
  1345  			s = e.commitRead(n)
  1346  		}
  1347  
  1348  		if err != nil {
  1349  			break
  1350  		}
  1351  	}
  1352  
  1353  	// If something is read, we must report it. Report error when nothing is read.
  1354  	if done == 0 && err != nil {
  1355  		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
  1356  	}
  1357  	return tcpip.ReadResult{
  1358  		Count: done,
  1359  		Total: done,
  1360  	}, nil
  1361  }
  1362  
  1363  // startRead checks that endpoint is in a readable state, and return the
  1364  // inclusive range of segments that can be read.
  1365  //
  1366  // Precondition: e.rcvReadMu must be held.
  1367  func (e *endpoint) startRead() (first, last *segment, err tcpip.Error) {
  1368  	e.LockUser()
  1369  	defer e.UnlockUser()
  1370  
  1371  	// When in SYN-SENT state, let the caller block on the receive.
  1372  	// An application can initiate a non-blocking connect and then block
  1373  	// on a receive. It can expect to read any data after the handshake
  1374  	// is complete. RFC793, section 3.9, p58.
  1375  	if e.EndpointState() == StateSynSent {
  1376  		return nil, nil, &tcpip.ErrWouldBlock{}
  1377  	}
  1378  
  1379  	// The endpoint can be read if it's connected, or if it's already closed
  1380  	// but has some pending unread data. Also note that a RST being received
  1381  	// would cause the state to become StateError so we should allow the
  1382  	// reads to proceed before returning a ECONNRESET.
  1383  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1384  	defer e.rcvQueueInfo.rcvQueueMu.Unlock()
  1385  
  1386  	bufUsed := e.rcvQueueInfo.RcvBufUsed
  1387  	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
  1388  		if s == StateError {
  1389  			if err := e.hardErrorLocked(); err != nil {
  1390  				return nil, nil, err
  1391  			}
  1392  			return nil, nil, &tcpip.ErrClosedForReceive{}
  1393  		}
  1394  		e.stats.ReadErrors.NotConnected.Increment()
  1395  		return nil, nil, &tcpip.ErrNotConnected{}
  1396  	}
  1397  
  1398  	if e.rcvQueueInfo.RcvBufUsed == 0 {
  1399  		if e.rcvQueueInfo.RcvClosed || !e.EndpointState().connected() {
  1400  			return nil, nil, &tcpip.ErrClosedForReceive{}
  1401  		}
  1402  		return nil, nil, &tcpip.ErrWouldBlock{}
  1403  	}
  1404  
  1405  	return e.rcvQueueInfo.rcvQueue.Front(), e.rcvQueueInfo.rcvQueue.Back(), nil
  1406  }
  1407  
  1408  // commitRead commits a read of done bytes and returns the next non-empty
  1409  // segment to read. Data read from the segment must have also been removed from
  1410  // the segment in order for this method to work correctly.
  1411  //
  1412  // It is performance critical to call commitRead frequently when servicing a big
  1413  // Read request, so TCP can make progress timely. Right now, it is designed to
  1414  // do this per segment read, hence this method conveniently returns the next
  1415  // segment to read while holding the lock.
  1416  //
  1417  // Precondition: e.rcvReadMu must be held.
  1418  func (e *endpoint) commitRead(done int) *segment {
  1419  	e.LockUser()
  1420  	defer e.UnlockUser()
  1421  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1422  	defer e.rcvQueueInfo.rcvQueueMu.Unlock()
  1423  
  1424  	memDelta := 0
  1425  	s := e.rcvQueueInfo.rcvQueue.Front()
  1426  	for s != nil && s.data.Size() == 0 {
  1427  		e.rcvQueueInfo.rcvQueue.Remove(s)
  1428  		// Memory is only considered released when the whole segment has been
  1429  		// read.
  1430  		memDelta += s.segMemSize()
  1431  		s.decRef()
  1432  		s = e.rcvQueueInfo.rcvQueue.Front()
  1433  	}
  1434  	e.rcvQueueInfo.RcvBufUsed -= done
  1435  
  1436  	if memDelta > 0 {
  1437  		// If the window was small before this read and if the read freed up
  1438  		// enough buffer space, to either fit an aMSS or half a receive buffer
  1439  		// (whichever smaller), then notify the protocol goroutine to send a
  1440  		// window update.
  1441  		if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
  1442  			e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
  1443  		}
  1444  	}
  1445  
  1446  	return e.rcvQueueInfo.rcvQueue.Front()
  1447  }
  1448  
  1449  // isEndpointWritableLocked checks if a given endpoint is writable
  1450  // and also returns the number of bytes that can be written at this
  1451  // moment. If the endpoint is not writable then it returns an error
  1452  // indicating the reason why it's not writable.
  1453  // Caller must hold e.mu and e.sndQueueMu
  1454  func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
  1455  	// The endpoint cannot be written to if it's not connected.
  1456  	switch s := e.EndpointState(); {
  1457  	case s == StateError:
  1458  		if err := e.hardErrorLocked(); err != nil {
  1459  			return 0, err
  1460  		}
  1461  		return 0, &tcpip.ErrClosedForSend{}
  1462  	case !s.connecting() && !s.connected():
  1463  		return 0, &tcpip.ErrClosedForSend{}
  1464  	case s.connecting():
  1465  		// As per RFC793, page 56, a send request arriving when in connecting
  1466  		// state, can be queued to be completed after the state becomes
  1467  		// connected. Return an error code for the caller of endpoint Write to
  1468  		// try again, until the connection handshake is complete.
  1469  		return 0, &tcpip.ErrWouldBlock{}
  1470  	}
  1471  
  1472  	// Check if the connection has already been closed for sends.
  1473  	if e.sndQueueInfo.SndClosed {
  1474  		return 0, &tcpip.ErrClosedForSend{}
  1475  	}
  1476  
  1477  	sndBufSize := e.getSendBufferSize()
  1478  	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
  1479  	if avail <= 0 {
  1480  		return 0, &tcpip.ErrWouldBlock{}
  1481  	}
  1482  	return avail, nil
  1483  }
  1484  
// readFromPayloader reads a slice from the Payloader.
//
// At most avail bytes are read, and fewer if p holds less. A nil slice with a
// nil error is returned when there is nothing to read. Any read error other
// than io.EOF is reported as tcpip.ErrBadBuffer.
//
// Unless opts.Atomic is set, both locks are dropped while the data is copied
// and are held again by the time the function returns.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		// Deferred calls run in reverse order, so on return e.mu is
		// re-acquired (LockUser) before sndQueueMu.
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return nil, nil
	}
	v := make([]byte, avail)
	n, err := p.Read(v)
	if err != nil && err != io.EOF {
		return nil, &tcpip.ErrBadBuffer{}
	}
	// Only the bytes actually read are returned.
	return v[:n], nil
}
  1516  
  1517  // queueSegment reads data from the payloader and returns a segment to be sent.
  1518  // +checklocks:e.mu
  1519  func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
  1520  	e.sndQueueInfo.sndQueueMu.Lock()
  1521  	defer e.sndQueueInfo.sndQueueMu.Unlock()
  1522  
  1523  	avail, err := e.isEndpointWritableLocked()
  1524  	if err != nil {
  1525  		e.stats.WriteErrors.WriteClosed.Increment()
  1526  		return nil, 0, err
  1527  	}
  1528  
  1529  	v, err := e.readFromPayloader(p, opts, avail)
  1530  	if err != nil {
  1531  		return nil, 0, err
  1532  	}
  1533  	if !opts.Atomic {
  1534  		// Since we released locks in between it's possible that the
  1535  		// endpoint transitioned to a CLOSED/ERROR states so make
  1536  		// sure endpoint is still writable before trying to write.
  1537  		avail, err := e.isEndpointWritableLocked()
  1538  		if err != nil {
  1539  			e.stats.WriteErrors.WriteClosed.Increment()
  1540  			return nil, 0, err
  1541  		}
  1542  
  1543  		// Discard any excess data copied in due to avail being reduced due
  1544  		// to a simultaneous write call to the socket.
  1545  		if avail < len(v) {
  1546  			v = v[:avail]
  1547  		}
  1548  	}
  1549  
  1550  	// Add data to the send queue.
  1551  	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v)
  1552  	e.sndQueueInfo.SndBufUsed += len(v)
  1553  	e.snd.writeList.PushBack(s)
  1554  
  1555  	return s, len(v), nil
  1556  }
  1557  
  1558  // Write writes data to the endpoint's peer.
  1559  func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
  1560  	// Linux completely ignores any address passed to sendto(2) for TCP sockets
  1561  	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
  1562  	// and opts.EndOfRecord are also ignored.
  1563  
  1564  	e.LockUser()
  1565  	defer e.UnlockUser()
  1566  
  1567  	// Return if either we didn't queue anything or if an error occurred while
  1568  	// attempting to queue data.
  1569  	nextSeg, n, err := e.queueSegment(p, opts)
  1570  	if n == 0 || err != nil {
  1571  		return 0, err
  1572  	}
  1573  
  1574  	e.sendData(nextSeg)
  1575  	return int64(n), nil
  1576  }
  1577  
  1578  // selectWindowLocked returns the new window without checking for shrinking or scaling
  1579  // applied.
  1580  // Precondition: e.mu and e.rcvQueueMu must be held.
  1581  func (e *endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
  1582  	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
  1583  	maxWindow := wndFromSpace(rcvBufSize)
  1584  	wndFromUsedBytes := maxWindow - e.rcvQueueInfo.RcvBufUsed
  1585  
  1586  	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
  1587  	// cases where we receive a lot of small segments the segment overhead is a
  1588  	// lot higher and we can run out socket buffer space before we can fill the
  1589  	// previous window we advertised. In cases where we receive MSS sized or close
  1590  	// MSS sized segments we will probably run out of window space before we
  1591  	// exhaust receive buffer.
  1592  	newWnd := wndFromAvailable
  1593  	if newWnd > wndFromUsedBytes {
  1594  		newWnd = wndFromUsedBytes
  1595  	}
  1596  	if newWnd < 0 {
  1597  		newWnd = 0
  1598  	}
  1599  	return seqnum.Size(newWnd)
  1600  }
  1601  
  1602  // selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
  1603  func (e *endpoint) selectWindow() (wnd seqnum.Size) {
  1604  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1605  	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
  1606  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  1607  	return wnd
  1608  }
  1609  
  1610  // windowCrossedACKThresholdLocked checks if the receive window to be announced
  1611  // would be under aMSS or under the window derived from half receive buffer,
  1612  // whichever smaller. This is useful as a receive side silly window syndrome
  1613  // prevention mechanism. If window grows to reasonable value, we should send ACK
  1614  // to the sender to inform the rx space is now large. We also want ensure a
  1615  // series of small read()'s won't trigger a flood of spurious tiny ACK's.
  1616  //
  1617  // For large receive buffers, the threshold is aMSS - once reader reads more
  1618  // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
  1619  // receive buffer size. This is chosen arbitrarily.
  1620  // crossed will be true if the window size crossed the ACK threshold.
  1621  // above will be true if the new window is >= ACK threshold and false
  1622  // otherwise.
  1623  //
  1624  // Precondition: e.mu and e.rcvQueueMu must be held.
  1625  func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
  1626  	newAvail := int(e.selectWindowLocked(rcvBufSize))
  1627  	oldAvail := newAvail - deltaBefore
  1628  	if oldAvail < 0 {
  1629  		oldAvail = 0
  1630  	}
  1631  	threshold := int(e.amss)
  1632  	// rcvBufFraction is the inverse of the fraction of receive buffer size that
  1633  	// is used to decide if the available buffer space is now above it.
  1634  	const rcvBufFraction = 2
  1635  	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
  1636  		threshold = wndThreshold
  1637  	}
  1638  	switch {
  1639  	case oldAvail < threshold && newAvail >= threshold:
  1640  		return true, true
  1641  	case oldAvail >= threshold && newAvail < threshold:
  1642  		return true, false
  1643  	}
  1644  	return false, false
  1645  }
  1646  
  1647  // OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
  1648  func (e *endpoint) OnReuseAddressSet(v bool) {
  1649  	e.LockUser()
  1650  	e.portFlags.TupleOnly = v
  1651  	e.UnlockUser()
  1652  }
  1653  
  1654  // OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
  1655  func (e *endpoint) OnReusePortSet(v bool) {
  1656  	e.LockUser()
  1657  	e.portFlags.LoadBalanced = v
  1658  	e.UnlockUser()
  1659  }
  1660  
  1661  // OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
  1662  func (e *endpoint) OnKeepAliveSet(bool) {
  1663  	e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1664  }
  1665  
  1666  // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
  1667  func (e *endpoint) OnDelayOptionSet(v bool) {
  1668  	if !v {
  1669  		// Handle delayed data.
  1670  		e.sndQueueInfo.sndWaker.Assert()
  1671  	}
  1672  }
  1673  
  1674  // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
  1675  func (e *endpoint) OnCorkOptionSet(v bool) {
  1676  	if !v {
  1677  		// Handle the corked data.
  1678  		e.sndQueueInfo.sndWaker.Assert()
  1679  	}
  1680  }
  1681  
  1682  func (e *endpoint) getSendBufferSize() int {
  1683  	return int(e.ops.GetSendBufferSize())
  1684  }
  1685  
  1686  // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
  1687  func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64) {
  1688  	e.LockUser()
  1689  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1690  
  1691  	// Make sure the receive buffer size allows us to send a
  1692  	// non-zero window size.
  1693  	scale := uint8(0)
  1694  	if e.rcv != nil {
  1695  		scale = e.rcv.RcvWndScale
  1696  	}
  1697  	if rcvBufSz>>scale == 0 {
  1698  		rcvBufSz = 1 << scale
  1699  	}
  1700  
  1701  	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
  1702  	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
  1703  	e.rcvQueueInfo.RcvAutoParams.Disabled = true
  1704  
  1705  	// Immediately send an ACK to uncork the sender silly window
  1706  	// syndrome prevetion, when our available space grows above aMSS
  1707  	// or half receive buffer, whichever smaller.
  1708  	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
  1709  		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
  1710  	}
  1711  
  1712  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  1713  	e.UnlockUser()
  1714  	return rcvBufSz
  1715  }
  1716  
  1717  // SetSockOptInt sets a socket option.
  1718  func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
  1719  	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
  1720  	const inetECNMask = 3
  1721  
  1722  	switch opt {
  1723  	case tcpip.KeepaliveCountOption:
  1724  		e.keepalive.Lock()
  1725  		e.keepalive.count = v
  1726  		e.keepalive.Unlock()
  1727  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1728  
  1729  	case tcpip.IPv4TOSOption:
  1730  		e.LockUser()
  1731  		// TODO(github.com/SagerNet/issue/995): ECN is not currently supported,
  1732  		// ignore the bits for now.
  1733  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1734  		e.UnlockUser()
  1735  
  1736  	case tcpip.IPv6TrafficClassOption:
  1737  		e.LockUser()
  1738  		// TODO(github.com/SagerNet/issue/995): ECN is not currently supported,
  1739  		// ignore the bits for now.
  1740  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1741  		e.UnlockUser()
  1742  
  1743  	case tcpip.MaxSegOption:
  1744  		userMSS := v
  1745  		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
  1746  			return &tcpip.ErrInvalidOptionValue{}
  1747  		}
  1748  		e.LockUser()
  1749  		e.userMSS = uint16(userMSS)
  1750  		e.UnlockUser()
  1751  		e.notifyProtocolGoroutine(notifyMSSChanged)
  1752  
  1753  	case tcpip.MTUDiscoverOption:
  1754  		// Return not supported if attempting to set this option to
  1755  		// anything other than path MTU discovery disabled.
  1756  		if v != tcpip.PMTUDiscoveryDont {
  1757  			return &tcpip.ErrNotSupported{}
  1758  		}
  1759  
  1760  	case tcpip.TTLOption:
  1761  		e.LockUser()
  1762  		e.ttl = uint8(v)
  1763  		e.UnlockUser()
  1764  
  1765  	case tcpip.TCPSynCountOption:
  1766  		if v < 1 || v > 255 {
  1767  			return &tcpip.ErrInvalidOptionValue{}
  1768  		}
  1769  		e.LockUser()
  1770  		e.maxSynRetries = uint8(v)
  1771  		e.UnlockUser()
  1772  
  1773  	case tcpip.TCPWindowClampOption:
  1774  		if v == 0 {
  1775  			e.LockUser()
  1776  			switch e.EndpointState() {
  1777  			case StateClose, StateInitial:
  1778  				e.windowClamp = 0
  1779  				e.UnlockUser()
  1780  				return nil
  1781  			default:
  1782  				e.UnlockUser()
  1783  				return &tcpip.ErrInvalidOptionValue{}
  1784  			}
  1785  		}
  1786  		var rs tcpip.TCPReceiveBufferSizeRangeOption
  1787  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
  1788  			if v < rs.Min/2 {
  1789  				v = rs.Min / 2
  1790  			}
  1791  		}
  1792  		e.LockUser()
  1793  		e.windowClamp = uint32(v)
  1794  		e.UnlockUser()
  1795  	}
  1796  	return nil
  1797  }
  1798  
  1799  func (e *endpoint) HasNIC(id int32) bool {
  1800  	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
  1801  }
  1802  
  1803  // SetSockOpt sets a socket option.
  1804  func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
  1805  	switch v := opt.(type) {
  1806  	case *tcpip.KeepaliveIdleOption:
  1807  		e.keepalive.Lock()
  1808  		e.keepalive.idle = time.Duration(*v)
  1809  		e.keepalive.Unlock()
  1810  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1811  
  1812  	case *tcpip.KeepaliveIntervalOption:
  1813  		e.keepalive.Lock()
  1814  		e.keepalive.interval = time.Duration(*v)
  1815  		e.keepalive.Unlock()
  1816  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1817  
  1818  	case *tcpip.TCPUserTimeoutOption:
  1819  		e.LockUser()
  1820  		e.userTimeout = time.Duration(*v)
  1821  		e.UnlockUser()
  1822  
  1823  	case *tcpip.CongestionControlOption:
  1824  		// Query the available cc algorithms in the stack and
  1825  		// validate that the specified algorithm is actually
  1826  		// supported in the stack.
  1827  		var avail tcpip.TCPAvailableCongestionControlOption
  1828  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
  1829  			return err
  1830  		}
  1831  		availCC := strings.Split(string(avail), " ")
  1832  		for _, cc := range availCC {
  1833  			if *v == tcpip.CongestionControlOption(cc) {
  1834  				e.LockUser()
  1835  				state := e.EndpointState()
  1836  				e.cc = *v
  1837  				switch state {
  1838  				case StateEstablished:
  1839  					if e.EndpointState() == state {
  1840  						e.snd.cc = e.snd.initCongestionControl(e.cc)
  1841  					}
  1842  				}
  1843  				e.UnlockUser()
  1844  				return nil
  1845  			}
  1846  		}
  1847  
  1848  		// Linux returns ENOENT when an invalid congestion
  1849  		// control algorithm is specified.
  1850  		return &tcpip.ErrNoSuchFile{}
  1851  
  1852  	case *tcpip.TCPLingerTimeoutOption:
  1853  		e.LockUser()
  1854  
  1855  		switch {
  1856  		case *v < 0:
  1857  			// Same as effectively disabling TCPLinger timeout.
  1858  			*v = -1
  1859  		case *v == 0:
  1860  			// Same as the stack default.
  1861  			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
  1862  			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
  1863  				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
  1864  			}
  1865  			*v = stackLingerTimeout
  1866  		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
  1867  			// Cap it to Stack's default TCP_LINGER2 timeout.
  1868  			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
  1869  		default:
  1870  		}
  1871  
  1872  		e.tcpLingerTimeout = time.Duration(*v)
  1873  		e.UnlockUser()
  1874  
  1875  	case *tcpip.TCPDeferAcceptOption:
  1876  		e.LockUser()
  1877  		if time.Duration(*v) > MaxRTO {
  1878  			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
  1879  		}
  1880  		e.deferAccept = time.Duration(*v)
  1881  		e.UnlockUser()
  1882  
  1883  	case *tcpip.SocketDetachFilterOption:
  1884  		return nil
  1885  
  1886  	default:
  1887  		return nil
  1888  	}
  1889  	return nil
  1890  }
  1891  
  1892  // readyReceiveSize returns the number of bytes ready to be received.
  1893  func (e *endpoint) readyReceiveSize() (int, tcpip.Error) {
  1894  	e.LockUser()
  1895  	defer e.UnlockUser()
  1896  
  1897  	// The endpoint cannot be in listen state.
  1898  	if e.EndpointState() == StateListen {
  1899  		return 0, &tcpip.ErrInvalidEndpointState{}
  1900  	}
  1901  
  1902  	e.rcvQueueInfo.rcvQueueMu.Lock()
  1903  	defer e.rcvQueueInfo.rcvQueueMu.Unlock()
  1904  
  1905  	return e.rcvQueueInfo.RcvBufUsed, nil
  1906  }
  1907  
  1908  // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
  1909  func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
  1910  	switch opt {
  1911  	case tcpip.KeepaliveCountOption:
  1912  		e.keepalive.Lock()
  1913  		v := e.keepalive.count
  1914  		e.keepalive.Unlock()
  1915  		return v, nil
  1916  
  1917  	case tcpip.IPv4TOSOption:
  1918  		e.LockUser()
  1919  		v := int(e.sendTOS)
  1920  		e.UnlockUser()
  1921  		return v, nil
  1922  
  1923  	case tcpip.IPv6TrafficClassOption:
  1924  		e.LockUser()
  1925  		v := int(e.sendTOS)
  1926  		e.UnlockUser()
  1927  		return v, nil
  1928  
  1929  	case tcpip.MaxSegOption:
  1930  		// This is just stubbed out. Linux never returns the user_mss
  1931  		// value as it either returns the defaultMSS or returns the
  1932  		// actual current MSS. Netstack just returns the defaultMSS
  1933  		// always for now.
  1934  		v := header.TCPDefaultMSS
  1935  		return v, nil
  1936  
  1937  	case tcpip.MTUDiscoverOption:
  1938  		// Always return the path MTU discovery disabled setting since
  1939  		// it's the only one supported.
  1940  		return tcpip.PMTUDiscoveryDont, nil
  1941  
  1942  	case tcpip.ReceiveQueueSizeOption:
  1943  		return e.readyReceiveSize()
  1944  
  1945  	case tcpip.TTLOption:
  1946  		e.LockUser()
  1947  		v := int(e.ttl)
  1948  		e.UnlockUser()
  1949  		return v, nil
  1950  
  1951  	case tcpip.TCPSynCountOption:
  1952  		e.LockUser()
  1953  		v := int(e.maxSynRetries)
  1954  		e.UnlockUser()
  1955  		return v, nil
  1956  
  1957  	case tcpip.TCPWindowClampOption:
  1958  		e.LockUser()
  1959  		v := int(e.windowClamp)
  1960  		e.UnlockUser()
  1961  		return v, nil
  1962  
  1963  	case tcpip.MulticastTTLOption:
  1964  		return 1, nil
  1965  
  1966  	default:
  1967  		return -1, &tcpip.ErrUnknownProtocolOption{}
  1968  	}
  1969  }
  1970  
  1971  func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption {
  1972  	info := tcpip.TCPInfoOption{}
  1973  	e.LockUser()
  1974  	if state := e.EndpointState(); state.internal() {
  1975  		info.State = tcpip.EndpointState(StateClose)
  1976  	} else {
  1977  		info.State = tcpip.EndpointState(state)
  1978  	}
  1979  	snd := e.snd
  1980  	if snd != nil {
  1981  		// We do not calculate RTT before sending the data packets. If
  1982  		// the connection did not send and receive data, then RTT will
  1983  		// be zero.
  1984  		snd.rtt.Lock()
  1985  		info.RTT = snd.rtt.TCPRTTState.SRTT
  1986  		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
  1987  		snd.rtt.Unlock()
  1988  
  1989  		info.RTO = snd.RTO
  1990  		info.CcState = snd.state
  1991  		info.SndSsthresh = uint32(snd.Ssthresh)
  1992  		info.SndCwnd = uint32(snd.SndCwnd)
  1993  		info.ReorderSeen = snd.rc.Reord
  1994  	}
  1995  	e.UnlockUser()
  1996  	return info
  1997  }
  1998  
  1999  // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
  2000  func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
  2001  	switch o := opt.(type) {
  2002  	case *tcpip.TCPInfoOption:
  2003  		*o = e.getTCPInfo()
  2004  
  2005  	case *tcpip.KeepaliveIdleOption:
  2006  		e.keepalive.Lock()
  2007  		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
  2008  		e.keepalive.Unlock()
  2009  
  2010  	case *tcpip.KeepaliveIntervalOption:
  2011  		e.keepalive.Lock()
  2012  		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
  2013  		e.keepalive.Unlock()
  2014  
  2015  	case *tcpip.TCPUserTimeoutOption:
  2016  		e.LockUser()
  2017  		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
  2018  		e.UnlockUser()
  2019  
  2020  	case *tcpip.CongestionControlOption:
  2021  		e.LockUser()
  2022  		*o = e.cc
  2023  		e.UnlockUser()
  2024  
  2025  	case *tcpip.TCPLingerTimeoutOption:
  2026  		e.LockUser()
  2027  		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
  2028  		e.UnlockUser()
  2029  
  2030  	case *tcpip.TCPDeferAcceptOption:
  2031  		e.LockUser()
  2032  		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
  2033  		e.UnlockUser()
  2034  
  2035  	case *tcpip.OriginalDestinationOption:
  2036  		e.LockUser()
  2037  		ipt := e.stack.IPTables()
  2038  		addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto)
  2039  		e.UnlockUser()
  2040  		if err != nil {
  2041  			return err
  2042  		}
  2043  		*o = tcpip.OriginalDestinationOption{
  2044  			Addr: addr,
  2045  			Port: port,
  2046  		}
  2047  
  2048  	default:
  2049  		return &tcpip.ErrUnknownProtocolOption{}
  2050  	}
  2051  	return nil
  2052  }
  2053  
  2054  // checkV4MappedLocked determines the effective network protocol and converts
  2055  // addr to its canonical form.
  2056  func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
  2057  	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
  2058  	if err != nil {
  2059  		return tcpip.FullAddress{}, 0, err
  2060  	}
  2061  	return unwrapped, netProto, nil
  2062  }
  2063  
  2064  // Disconnect implements tcpip.Endpoint.Disconnect.
  2065  func (*endpoint) Disconnect() tcpip.Error {
  2066  	return &tcpip.ErrNotSupported{}
  2067  }
  2068  
  2069  // Connect connects the endpoint to its peer.
  2070  func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
  2071  	err := e.connect(addr, true, true)
  2072  	if err != nil {
  2073  		if !err.IgnoreStats() {
  2074  			// Connect failed. Let's wake up any waiters.
  2075  			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  2076  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  2077  			e.stats.FailedConnectionAttempts.Increment()
  2078  		}
  2079  	}
  2080  	return err
  2081  }
  2082  
  2083  // connect connects the endpoint to its peer. In the normal non-S/R case, the
  2084  // new connection is expected to run the main goroutine and perform handshake.
  2085  // In restore of previously connected endpoints, both ends will be passively
  2086  // created (so no new handshaking is done); for stack-accepted connections not
  2087  // yet accepted by the app, they are restored without running the main goroutine
  2088  // here.
  2089  func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcpip.Error {
  2090  	e.LockUser()
  2091  	defer e.UnlockUser()
  2092  
  2093  	connectingAddr := addr.Addr
  2094  
  2095  	addr, netProto, err := e.checkV4MappedLocked(addr)
  2096  	if err != nil {
  2097  		return err
  2098  	}
  2099  
  2100  	if e.EndpointState().connected() {
  2101  		// The endpoint is already connected. If caller hasn't been
  2102  		// notified yet, return success.
  2103  		if !e.isConnectNotified {
  2104  			e.isConnectNotified = true
  2105  			return nil
  2106  		}
  2107  		// Otherwise return that it's already connected.
  2108  		return &tcpip.ErrAlreadyConnected{}
  2109  	}
  2110  
  2111  	nicID := addr.NIC
  2112  	switch e.EndpointState() {
  2113  	case StateBound:
  2114  		// If we're already bound to a NIC but the caller is requesting
  2115  		// that we use a different one now, we cannot proceed.
  2116  		if e.boundNICID == 0 {
  2117  			break
  2118  		}
  2119  
  2120  		if nicID != 0 && nicID != e.boundNICID {
  2121  			return &tcpip.ErrNoRoute{}
  2122  		}
  2123  
  2124  		nicID = e.boundNICID
  2125  
  2126  	case StateInitial:
  2127  		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
  2128  		// when we find a route.
  2129  
  2130  	case StateConnecting, StateSynSent, StateSynRecv:
  2131  		// A connection request has already been issued but hasn't completed
  2132  		// yet.
  2133  		return &tcpip.ErrAlreadyConnecting{}
  2134  
  2135  	case StateError:
  2136  		if err := e.hardErrorLocked(); err != nil {
  2137  			return err
  2138  		}
  2139  		return &tcpip.ErrConnectionAborted{}
  2140  
  2141  	default:
  2142  		return &tcpip.ErrInvalidEndpointState{}
  2143  	}
  2144  
  2145  	// Find a route to the desired destination.
  2146  	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
  2147  	if err != nil {
  2148  		return err
  2149  	}
  2150  	defer r.Release()
  2151  
  2152  	netProtos := []tcpip.NetworkProtocolNumber{netProto}
  2153  	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
  2154  	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
  2155  	e.TransportEndpointInfo.ID.RemotePort = addr.Port
  2156  
  2157  	if e.TransportEndpointInfo.ID.LocalPort != 0 {
  2158  		// The endpoint is bound to a port, attempt to register it.
  2159  		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
  2160  		if err != nil {
  2161  			return err
  2162  		}
  2163  	} else {
  2164  		// The endpoint doesn't have a local port yet, so try to get
  2165  		// one. Make sure that it isn't one that will result in the same
  2166  		// address/port for both local and remote (otherwise this
  2167  		// endpoint would be trying to connect to itself).
  2168  		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress
  2169  
  2170  		// Calculate a port offset based on the destination IP/port and
  2171  		// src IP to ensure that for a given tuple (srcIP, destIP,
  2172  		// destPort) the offset used as a starting point is the same to
  2173  		// ensure that we can cycle through the port space effectively.
  2174  		portBuf := make([]byte, 2)
  2175  		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)
  2176  
  2177  		h := jenkins.Sum32(e.stack.Seed())
  2178  		for _, s := range [][]byte{
  2179  			[]byte(e.ID.LocalAddress),
  2180  			[]byte(e.ID.RemoteAddress),
  2181  			portBuf,
  2182  		} {
  2183  			// Per io.Writer.Write:
  2184  			//
  2185  			// Write must return a non-nil error if it returns n < len(p).
  2186  			if _, err := h.Write(s); err != nil {
  2187  				panic(err)
  2188  			}
  2189  		}
  2190  		portOffset := h.Sum32()
  2191  
  2192  		var twReuse tcpip.TCPTimeWaitReuseOption
  2193  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
  2194  			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
  2195  		}
  2196  
  2197  		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
  2198  		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
  2199  			switch netProto {
  2200  			case header.IPv4ProtocolNumber:
  2201  				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
  2202  			case header.IPv6ProtocolNumber:
  2203  				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
  2204  			}
  2205  		}
  2206  
  2207  		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
  2208  		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) {
  2209  			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
  2210  				return false, nil
  2211  			}
  2212  			portRes := ports.Reservation{
  2213  				Networks:     netProtos,
  2214  				Transport:    ProtocolNumber,
  2215  				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  2216  				Port:         p,
  2217  				Flags:        e.portFlags,
  2218  				BindToDevice: bindToDevice,
  2219  				Dest:         addr,
  2220  			}
  2221  			if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
  2222  				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
  2223  					return false, nil
  2224  				}
  2225  				transEPID := e.TransportEndpointInfo.ID
  2226  				transEPID.LocalPort = p
  2227  				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
  2228  				// we can reuse it. If we can't find a transport endpoint then we just
  2229  				// skip using this port as it's possible that either an endpoint has
  2230  				// bound the port but not registered with demuxer yet (no listen/connect
  2231  				// done yet) or the reservation was freed between the check above and
  2232  				// the FindTransportEndpoint below. But rather than retry the same port
  2233  				// we just skip it and move on.
  2234  				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, r.NICID())
  2235  				if transEP == nil {
  2236  					// ReservePort failed but there is no registered endpoint with
  2237  					// demuxer. Which indicates there is at least some endpoint that has
  2238  					// bound the port.
  2239  					return false, nil
  2240  				}
  2241  
  2242  				tcpEP := transEP.(*endpoint)
  2243  				tcpEP.LockUser()
  2244  				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
  2245  				// less than 1 second has elapsed since its recentTS was updated then
  2246  				// we cannot reuse the port.
  2247  				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
  2248  					tcpEP.UnlockUser()
  2249  					return false, nil
  2250  				}
  2251  				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
  2252  				// Lock while holding the lock for this endpoint as endpoints in
  2253  				// TIME-WAIT do not acquire locks on other endpoints.
  2254  				tcpEP.workerCleanup = false
  2255  				tcpEP.cleanupLocked()
  2256  				tcpEP.notifyProtocolGoroutine(notifyAbort)
  2257  				tcpEP.UnlockUser()
  2258  				// Now try and Reserve again if it fails then we skip.
  2259  				portRes := ports.Reservation{
  2260  					Networks:     netProtos,
  2261  					Transport:    ProtocolNumber,
  2262  					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  2263  					Port:         p,
  2264  					Flags:        e.portFlags,
  2265  					BindToDevice: bindToDevice,
  2266  					Dest:         addr,
  2267  				}
  2268  				if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
  2269  					return false, nil
  2270  				}
  2271  			}
  2272  
  2273  			id := e.TransportEndpointInfo.ID
  2274  			id.LocalPort = p
  2275  			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
  2276  				portRes := ports.Reservation{
  2277  					Networks:     netProtos,
  2278  					Transport:    ProtocolNumber,
  2279  					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
  2280  					Port:         p,
  2281  					Flags:        e.portFlags,
  2282  					BindToDevice: bindToDevice,
  2283  					Dest:         addr,
  2284  				}
  2285  				e.stack.ReleasePort(portRes)
  2286  				if _, ok := err.(*tcpip.ErrPortInUse); ok {
  2287  					return false, nil
  2288  				}
  2289  				return false, err
  2290  			}
  2291  
  2292  			// Port picking successful. Save the details of
  2293  			// the selected port.
  2294  			e.TransportEndpointInfo.ID = id
  2295  			e.isPortReserved = true
  2296  			e.boundBindToDevice = bindToDevice
  2297  			e.boundPortFlags = e.portFlags
  2298  			e.boundDest = addr
  2299  			return true, nil
  2300  		}); err != nil {
  2301  			e.stack.Stats().TCP.FailedPortReservations.Increment()
  2302  			return err
  2303  		}
  2304  	}
  2305  
  2306  	e.isRegistered = true
  2307  	e.setEndpointState(StateConnecting)
  2308  	r.Acquire()
  2309  	e.route = r
  2310  	e.boundNICID = nicID
  2311  	e.effectiveNetProtos = netProtos
  2312  	e.connectingAddress = connectingAddr
  2313  
  2314  	e.initGSO()
  2315  
  2316  	// Connect in the restore phase does not perform handshake. Restore its
  2317  	// connection setting here.
  2318  	if !handshake {
  2319  		e.segmentQueue.mu.Lock()
  2320  		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
  2321  			for s := l.Front(); s != nil; s = s.Next() {
  2322  				s.id = e.TransportEndpointInfo.ID
  2323  				e.sndQueueInfo.sndWaker.Assert()
  2324  			}
  2325  		}
  2326  		e.segmentQueue.mu.Unlock()
  2327  		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
  2328  		e.setEndpointState(StateEstablished)
  2329  	}
  2330  
  2331  	if run {
  2332  		if handshake {
  2333  			h := e.newHandshake()
  2334  			e.setEndpointState(StateSynSent)
  2335  			h.start()
  2336  		}
  2337  		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
  2338  		e.workerRunning = true
  2339  		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
  2340  	}
  2341  
  2342  	return &tcpip.ErrConnectStarted{}
  2343  }
  2344  
  2345  // ConnectEndpoint is not supported.
  2346  func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
  2347  	return &tcpip.ErrInvalidEndpointState{}
  2348  }
  2349  
  2350  // Shutdown closes the read and/or write end of the endpoint connection to its
  2351  // peer.
  2352  func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
  2353  	e.LockUser()
  2354  	defer e.UnlockUser()
  2355  	return e.shutdownLocked(flags)
  2356  }
  2357  
// shutdownLocked merges flags into e.shutdownFlags and then applies the
// accumulated flags according to the current endpoint state:
//
//   - connected: the read and/or write side is closed. If both sides are
//     shut down while unread data is still buffered, the connection is reset
//     instead and nil is returned.
//   - listening: on ShutdownRead the receive side is marked closed, pending
//     acceptable connections are closed and all waiters are notified.
//   - any other state: ErrNotConnected is returned.
//
// Because the flags are OR-ed into e.shutdownFlags, a flag passed in an
// earlier call still counts in later calls.
//
// Shutdown calls this with the endpoint lock (LockUser) held.
func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed and sample how much unread data is
			// buffered while holding the receive queue lock.
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = true
			rcvBufUsed := e.rcvQueueInfo.RcvBufUsed
			e.rcvQueueInfo.rcvQueueMu.Unlock()

			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				// Wake up worker to terminate loop.
				e.notifyProtocolGoroutine(notifyTickleWorker)
				return nil
			}
			// Wake up any readers that may be waiting for the stream to
			// become readable.
			e.waiterQueue.Notify(waiter.ReadableEvents)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed. Only an endpoint in TIME-WAIT reports an
				// error for the repeated shutdown.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment (an outgoing segment with a nil payload).
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), nil)
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that may be waiting for the stream to
			// become writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = true
			e.rcvQueueInfo.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}
  2434  
  2435  // Listen puts the endpoint in "listen" mode, which allows it to accept
  2436  // new connections.
  2437  func (e *endpoint) Listen(backlog int) tcpip.Error {
  2438  	err := e.listen(backlog)
  2439  	if err != nil {
  2440  		if !err.IgnoreStats() {
  2441  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  2442  			e.stats.FailedConnectionAttempts.Increment()
  2443  		}
  2444  	}
  2445  	return err
  2446  }
  2447  
// listen implements Listen while holding the endpoint lock.
//
// Three situations are handled:
//
//   - The endpoint is already listening and not closed: only the accept
//     queue is updated. If the accept state is zero-valued (listen after
//     shutdown) the shutdown flags and RcvClosed are cleared; otherwise the
//     backlog is resized, failing with ErrInvalidEndpointState when more
//     endpoints are already queued than the new backlog allows.
//   - The endpoint is still in its initial state: it is first bound with an
//     empty FullAddress.
//   - The endpoint is bound: it is registered with the stack, moved to
//     StateListen and the listen loop goroutine is started.
//
// Any other state yields ErrInvalidEndpointState.
func (e *endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()
		if e.accepted == (accepted{}) {
			// listen is called after shutdown.
			e.accepted.cap = backlog
			e.shutdownFlags = 0
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = false
			e.rcvQueueInfo.rcvQueueMu.Unlock()
		} else {
			// Adjust the size of the backlog iff we can fit
			// existing pending connections into the new one.
			if e.accepted.endpoints.Len() > backlog {
				return &tcpip.ErrInvalidEndpointState{}
			}
			e.accepted.cap = backlog
		}

		// Notify any blocked goroutines that they can attempt to
		// deliver endpoints again.
		e.acceptCond.Broadcast()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		// NOTE(review): the failure is accounted under ReadErrors even
		// though this is the listen path — confirm this is intended.
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.setEndpointState(StateListen)

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.accepted == (accepted{}) {
		e.accepted.cap = backlog
	}
	e.acceptMu.Unlock()

	// Start the listen loop, sized by the receive buffer space that is
	// currently available.
	e.workerRunning = true
	go e.protocolListenLoop( // S/R-SAFE: drained on save.
		seqnum.Size(e.receiveBufferAvailable()))
	return nil
}
  2515  
// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections.
//
// It marks the worker as running, releases e.mu, launches protocolMainLoop
// and then blocks until a receive on wakerInitDone completes.
// +checklocksrelease:e.mu
func (e *endpoint) startAcceptedLoop() {
	e.workerRunning = true
	e.mu.Unlock()
	wakerInitDone := make(chan struct{})
	go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
	// Presumably protocolMainLoop signals on this channel once its wakers are
	// initialized — verify against protocolMainLoop.
	<-wakerInitDone
}
  2526  
  2527  // Accept returns a new endpoint if a peer has established a connection
  2528  // to an endpoint previously set to listen mode.
  2529  //
  2530  // addr if not-nil will contain the peer address of the returned endpoint.
  2531  func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
  2532  	e.LockUser()
  2533  	defer e.UnlockUser()
  2534  
  2535  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2536  	rcvClosed := e.rcvQueueInfo.RcvClosed
  2537  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2538  	// Endpoint must be in listen state before it can accept connections.
  2539  	if rcvClosed || e.EndpointState() != StateListen {
  2540  		return nil, nil, &tcpip.ErrInvalidEndpointState{}
  2541  	}
  2542  
  2543  	// Get the new accepted endpoint.
  2544  	var n *endpoint
  2545  	e.acceptMu.Lock()
  2546  	if element := e.accepted.endpoints.Front(); element != nil {
  2547  		n = e.accepted.endpoints.Remove(element).(*endpoint)
  2548  	}
  2549  	e.acceptMu.Unlock()
  2550  	if n == nil {
  2551  		return nil, nil, &tcpip.ErrWouldBlock{}
  2552  	}
  2553  	e.acceptCond.Signal()
  2554  	if peerAddr != nil {
  2555  		*peerAddr = n.getRemoteAddress()
  2556  	}
  2557  	return n, n.waiterQueue, nil
  2558  }
  2559  
// Bind binds the endpoint to a specific local port and optionally address.
//
// The endpoint lock is held for the duration of the call; the binding itself
// is performed by bindLocked and its error is returned as-is.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}
  2567  
// bindLocked binds the endpoint to addr and moves it to StateBound.
//
// It returns ErrAlreadyBound unless the endpoint is in StateInitial,
// ErrBadLocalAddress when a non-empty address is not a local address known to
// the stack, and otherwise any error from checkV4MappedLocked or from the
// port reservation. On success the reserved port, NIC, port flags,
// bind-to-device setting and effective network protocols are recorded on the
// endpoint.
//
// Callers in this file (Bind, listen) invoke it with the endpoint lock held.
func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	// NOTE(review): BindAddr is recorded before the address is validated, so
	// it stays set even if one of the checks below fails — confirm intended.
	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if len(addr.Addr) != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	// The callback vets each candidate port p: a port is rejected (without
	// reporting an error) when registering this endpoint's ID with that port
	// would conflict.
	port, err := e.stack.ReservePort(e.stack.Rand(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	// Remember the parameters the reservation was made with.
	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(github.com/SagerNet/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}
  2650  
  2651  // GetLocalAddress returns the address to which the endpoint is bound.
  2652  func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
  2653  	e.LockUser()
  2654  	defer e.UnlockUser()
  2655  
  2656  	return tcpip.FullAddress{
  2657  		Addr: e.TransportEndpointInfo.ID.LocalAddress,
  2658  		Port: e.TransportEndpointInfo.ID.LocalPort,
  2659  		NIC:  e.boundNICID,
  2660  	}, nil
  2661  }
  2662  
  2663  // GetRemoteAddress returns the address to which the endpoint is connected.
  2664  func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
  2665  	e.LockUser()
  2666  	defer e.UnlockUser()
  2667  
  2668  	if !e.EndpointState().connected() {
  2669  		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
  2670  	}
  2671  
  2672  	return e.getRemoteAddress(), nil
  2673  }
  2674  
  2675  func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
  2676  	return tcpip.FullAddress{
  2677  		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
  2678  		Port: e.TransportEndpointInfo.ID.RemotePort,
  2679  		NIC:  e.boundNICID,
  2680  	}
  2681  }
  2682  
// HandlePacket is intentionally a no-op for TCP endpoints; both arguments are
// ignored.
func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher, which can then either deliver them using the
	// worker goroutine or invoke the TCP processing inline, based on the
	// state of the endpoint.
}
  2689  
  2690  func (e *endpoint) enqueueSegment(s *segment) bool {
  2691  	// Send packet to worker goroutine.
  2692  	if !e.segmentQueue.enqueue(s) {
  2693  		// The queue is full, so we drop the segment.
  2694  		e.stack.Stats().DroppedPackets.Increment()
  2695  		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
  2696  		return false
  2697  	}
  2698  	return true
  2699  }
  2700  
  2701  func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
  2702  	// Update last error first.
  2703  	e.lastErrorMu.Lock()
  2704  	e.lastError = err
  2705  	e.lastErrorMu.Unlock()
  2706  
  2707  	// Update the error queue if IP_RECVERR is enabled.
  2708  	if e.SocketOptions().GetRecvError() {
  2709  		e.SocketOptions().QueueErr(&tcpip.SockError{
  2710  			Err:   err,
  2711  			Cause: transErr,
  2712  			// Linux passes the payload with the TCP header. We don't know if the TCP
  2713  			// header even exists, it may not for fragmented packets.
  2714  			Payload: pkt.Data().AsRange().ToOwnedView(),
  2715  			Dst: tcpip.FullAddress{
  2716  				NIC:  pkt.NICID,
  2717  				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
  2718  				Port: e.TransportEndpointInfo.ID.RemotePort,
  2719  			},
  2720  			Offender: tcpip.FullAddress{
  2721  				NIC:  pkt.NICID,
  2722  				Addr: e.TransportEndpointInfo.ID.LocalAddress,
  2723  				Port: e.TransportEndpointInfo.ID.LocalPort,
  2724  			},
  2725  			NetProto: pkt.NetworkProtocolNumber,
  2726  		})
  2727  	}
  2728  
  2729  	// Notify of the error.
  2730  	e.notifyProtocolGoroutine(notifyError)
  2731  }
  2732  
  2733  // HandleError implements stack.TransportEndpoint.
  2734  func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) {
  2735  	handlePacketTooBig := func(mtu uint32) {
  2736  		e.sndQueueInfo.sndQueueMu.Lock()
  2737  		e.sndQueueInfo.PacketTooBigCount++
  2738  		if v := int(mtu); v < e.sndQueueInfo.SndMTU {
  2739  			e.sndQueueInfo.SndMTU = v
  2740  		}
  2741  		e.sndQueueInfo.sndQueueMu.Unlock()
  2742  		e.notifyProtocolGoroutine(notifyMTUChanged)
  2743  	}
  2744  
  2745  	// TODO(github.com/SagerNet/issues/5270): Handle all transport errors.
  2746  	switch transErr.Kind() {
  2747  	case stack.PacketTooBigTransportError:
  2748  		handlePacketTooBig(transErr.Info())
  2749  	case stack.DestinationHostUnreachableTransportError:
  2750  		e.onICMPError(&tcpip.ErrNoRoute{}, transErr, pkt)
  2751  	case stack.DestinationNetworkUnreachableTransportError:
  2752  		e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt)
  2753  	}
  2754  }
  2755  
  2756  // updateSndBufferUsage is called by the protocol goroutine when room opens up
  2757  // in the send buffer. The number of newly available bytes is v.
  2758  func (e *endpoint) updateSndBufferUsage(v int) {
  2759  	sendBufferSize := e.getSendBufferSize()
  2760  	e.sndQueueInfo.sndQueueMu.Lock()
  2761  	notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1
  2762  	e.sndQueueInfo.SndBufUsed -= v
  2763  	// We only notify when there is half the sendBufferSize available after
  2764  	// a full buffer event occurs. This ensures that we don't wake up
  2765  	// writers to queue just 1-2 segments and go back to sleep.
  2766  	notify = notify && e.sndQueueInfo.SndBufUsed < sendBufferSize>>1
  2767  	e.sndQueueInfo.sndQueueMu.Unlock()
  2768  
  2769  	if notify {
  2770  		e.waiterQueue.Notify(waiter.WritableEvents)
  2771  	}
  2772  }
  2773  
  2774  // readyToRead is called by the protocol goroutine when a new segment is ready
  2775  // to be read, or when the connection is closed for receiving (in which case
  2776  // s will be nil).
  2777  func (e *endpoint) readyToRead(s *segment) {
  2778  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2779  	if s != nil {
  2780  		e.rcvQueueInfo.RcvBufUsed += s.payloadSize()
  2781  		s.incRef()
  2782  		e.rcvQueueInfo.rcvQueue.PushBack(s)
  2783  	} else {
  2784  		e.rcvQueueInfo.RcvClosed = true
  2785  	}
  2786  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2787  	e.waiterQueue.Notify(waiter.ReadableEvents)
  2788  }
  2789  
  2790  // receiveBufferAvailableLocked calculates how many bytes are still available
  2791  // in the receive buffer.
  2792  // rcvQueueMu must be held when this function is called.
  2793  func (e *endpoint) receiveBufferAvailableLocked(rcvBufSize int) int {
  2794  	// We may use more bytes than the buffer size when the receive buffer
  2795  	// shrinks.
  2796  	memUsed := e.receiveMemUsed()
  2797  	if memUsed >= rcvBufSize {
  2798  		return 0
  2799  	}
  2800  
  2801  	return rcvBufSize - memUsed
  2802  }
  2803  
  2804  // receiveBufferAvailable calculates how many bytes are still available in the
  2805  // receive buffer based on the actual memory used by all segments held in
  2806  // receive buffer/pending and segment queue.
  2807  func (e *endpoint) receiveBufferAvailable() int {
  2808  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2809  	available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize()))
  2810  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2811  	return available
  2812  }
  2813  
  2814  // receiveBufferUsed returns the amount of in-use receive buffer.
  2815  func (e *endpoint) receiveBufferUsed() int {
  2816  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2817  	used := e.rcvQueueInfo.RcvBufUsed
  2818  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2819  	return used
  2820  }
  2821  
// receiveMemUsed returns the total memory in use by segments held by this
// endpoint. The counter e.rcvMemUsed is read atomically, so no lock is taken
// here.
func (e *endpoint) receiveMemUsed() int {
	return int(atomic.LoadInt32(&e.rcvMemUsed))
}
  2827  
// updateReceiveMemUsed atomically adds the provided delta to e.rcvMemUsed.
// delta is converted to int32 before it is added.
func (e *endpoint) updateReceiveMemUsed(delta int) {
	atomic.AddInt32(&e.rcvMemUsed, int32(delta))
}
  2832  
  2833  // maxReceiveBufferSize returns the stack wide maximum receive buffer size for
  2834  // an endpoint.
  2835  func (e *endpoint) maxReceiveBufferSize() int {
  2836  	var rs tcpip.TCPReceiveBufferSizeRangeOption
  2837  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
  2838  		// As a fallback return the hardcoded max buffer size.
  2839  		return MaxBufferSize
  2840  	}
  2841  	return rs.Max
  2842  }
  2843  
  2844  // rcvWndScaleForHandshake computes the receive window scale to offer to the
  2845  // peer when window scaling is enabled (true by default). If auto-tuning is
  2846  // disabled then the window scaling factor is based on the size of the
  2847  // receiveBuffer otherwise we use the max permissible receive buffer size to
  2848  // compute the scale.
  2849  func (e *endpoint) rcvWndScaleForHandshake() int {
  2850  	bufSizeForScale := e.ops.GetReceiveBufferSize()
  2851  
  2852  	e.rcvQueueInfo.rcvQueueMu.Lock()
  2853  	autoTuningDisabled := e.rcvQueueInfo.RcvAutoParams.Disabled
  2854  	e.rcvQueueInfo.rcvQueueMu.Unlock()
  2855  	if autoTuningDisabled {
  2856  		return FindWndScale(seqnum.Size(bufSizeForScale))
  2857  	}
  2858  
  2859  	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
  2860  }
  2861  
  2862  // updateRecentTimestamp updates the recent timestamp using the algorithm
  2863  // described in https://tools.ietf.org/html/rfc7323#section-4.3
  2864  func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
  2865  	if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
  2866  		e.setRecentTimestamp(tsVal)
  2867  	}
  2868  }
  2869  
  2870  // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
  2871  // the SYN options indicate that timestamp option was negotiated. It also
  2872  // initializes the recentTS with the value provided in synOpts.TSval.
  2873  func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
  2874  	if synOpts.TS {
  2875  		e.SendTSOk = true
  2876  		e.setRecentTimestamp(synOpts.TSVal)
  2877  	}
  2878  }
  2879  
  2880  // timestamp returns the timestamp value to be used in the TSVal field of the
  2881  // timestamp option for outgoing TCP segments for a given endpoint.
  2882  func (e *endpoint) timestamp() uint32 {
  2883  	return tcpTimeStamp(e.stack.Clock().NowMonotonic(), e.TSOffset)
  2884  }
  2885  
  2886  // tcpTimeStamp returns a timestamp offset by the provided offset. This is
  2887  // not inlined above as it's used when SYN cookies are in use and endpoint
  2888  // is not created at the time when the SYN cookie is sent.
  2889  func tcpTimeStamp(curTime tcpip.MonotonicTime, offset uint32) uint32 {
  2890  	d := curTime.Sub(tcpip.MonotonicTime{})
  2891  	return uint32(d.Milliseconds()) + offset
  2892  }
  2893  
  2894  // timeStampOffset returns a randomized timestamp offset to be used when sending
  2895  // timestamp values in a timestamp option for a TCP segment.
  2896  func timeStampOffset(rng *rand.Rand) uint32 {
  2897  	// Initialize a random tsOffset that will be added to the recentTS
  2898  	// everytime the timestamp is sent when the Timestamp option is enabled.
  2899  	//
  2900  	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
  2901  	// why this is required.
  2902  	//
  2903  	// NOTE: This is not completely to spec as normally this should be
  2904  	// initialized in a manner analogous to how sequence numbers are
  2905  	// randomized per connection basis. But for now this is sufficient.
  2906  	return rng.Uint32()
  2907  }
  2908  
  2909  // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
  2910  // if the SYN options indicate that the SACK option was negotiated and the TCP
  2911  // stack is configured to enable TCP SACK option.
  2912  func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
  2913  	var v tcpip.TCPSACKEnabled
  2914  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
  2915  		// Stack doesn't support SACK. So just return.
  2916  		return
  2917  	}
  2918  	if bool(v) && synOpts.SACKPermitted {
  2919  		e.SACKPermitted = true
  2920  	}
  2921  }
  2922  
  2923  // maxOptionSize return the maximum size of TCP options.
  2924  func (e *endpoint) maxOptionSize() (size int) {
  2925  	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
  2926  	options := e.makeOptions(maxSackBlocks[:])
  2927  	size = len(options)
  2928  	putOptions(options)
  2929  
  2930  	return size
  2931  }
  2932  
// completeStateLocked makes a full copy of the endpoint and returns it. This is
// used before invoking the probe.
//
// The snapshot includes the endpoint's inner state, ID, the current monotonic
// time, receiver and sender state, send/receive buffer state, SACK
// information, RTT state, cubic state (when the congestion control in use is
// cubic) and RACK state.
//
// Precondition: e.mu must be held.
func (e *endpoint) completeStateLocked() stack.TCPEndpointState {
	s := stack.TCPEndpointState{
		TCPEndpointStateInner: e.TCPEndpointStateInner,
		ID:                    stack.TCPEndpointID(e.TransportEndpointInfo.ID),
		SegTime:               e.stack.Clock().NowMonotonic(),
		Receiver:              e.rcv.TCPReceiverState,
		Sender:                e.snd.TCPSenderState,
	}

	// The send buffer size is fetched before sndQueueMu is taken.
	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	s.SndBufState = e.sndQueueInfo.TCPSndBufState
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueInfo.rcvQueueMu.Lock()
	s.RcvBufState = e.rcvQueueInfo.TCPRcvBufState
	e.rcvQueueInfo.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state. Only the first NumBlocks SACK
	// blocks are copied, into a freshly allocated slice.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// The RTT state is copied under its own lock.
	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	// Cubic state is only included when cubic congestion control is in use;
	// the time since the last congestion event is computed against the
	// current monotonic time.
	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	return s
}
  2975  
  2976  func (e *endpoint) initHardwareGSO() {
  2977  	switch e.route.NetProto() {
  2978  	case header.IPv4ProtocolNumber:
  2979  		e.gso.Type = stack.GSOTCPv4
  2980  		e.gso.L3HdrLen = header.IPv4MinimumSize
  2981  	case header.IPv6ProtocolNumber:
  2982  		e.gso.Type = stack.GSOTCPv6
  2983  		e.gso.L3HdrLen = header.IPv6MinimumSize
  2984  	default:
  2985  		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
  2986  	}
  2987  	e.gso.NeedsCsum = true
  2988  	e.gso.CsumOffset = header.TCPChecksumOffset
  2989  	e.gso.MaxSize = e.route.GSOMaxSize()
  2990  }
  2991  
  2992  func (e *endpoint) initGSO() {
  2993  	if e.route.HasHardwareGSOCapability() {
  2994  		e.initHardwareGSO()
  2995  	} else if e.route.HasSoftwareGSOCapability() {
  2996  		e.gso = stack.GSO{
  2997  			MaxSize:   e.route.GSOMaxSize(),
  2998  			Type:      stack.GSOSW,
  2999  			NeedsCsum: false,
  3000  		}
  3001  	}
  3002  }
  3003  
// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics, as the uint32 value of the current EndpointState.
func (e *endpoint) State() uint32 {
	return uint32(e.EndpointState())
}
  3009  
  3010  // Info returns a copy of the endpoint info.
  3011  func (e *endpoint) Info() tcpip.EndpointInfo {
  3012  	e.LockUser()
  3013  	// Make a copy of the endpoint info.
  3014  	ret := e.TransportEndpointInfo
  3015  	e.UnlockUser()
  3016  	return &ret
  3017  }
  3018  
// Stats returns a pointer to the endpoint stats. The pointer refers to the
// endpoint's own stats field, not to a copy.
func (e *endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}
  3023  
  3024  // Wait implements stack.TransportEndpoint.Wait.
  3025  func (e *endpoint) Wait() {
  3026  	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
  3027  	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
  3028  	defer e.waiterQueue.EventUnregister(&waitEntry)
  3029  	for {
  3030  		e.LockUser()
  3031  		running := e.workerRunning
  3032  		e.UnlockUser()
  3033  		if !running {
  3034  			break
  3035  		}
  3036  		<-notifyCh
  3037  	}
  3038  }
  3039  
// SocketOptions implements tcpip.Endpoint.SocketOptions. It returns a pointer
// to the endpoint's own options (e.ops), not a copy.
func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}
  3044  
  3045  // GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
  3046  func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption {
  3047  	var ss tcpip.TCPSendBufferSizeRangeOption
  3048  	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
  3049  		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
  3050  	}
  3051  
  3052  	return tcpip.SendBufferSizeOption{
  3053  		Min:     ss.Min,
  3054  		Default: ss.Default,
  3055  		Max:     ss.Max,
  3056  	}
  3057  }
  3058  
  3059  // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
  3060  func (e *endpoint) allowOutOfWindowAck() bool {
  3061  	now := e.stack.Clock().NowMonotonic()
  3062  
  3063  	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
  3064  		var limit stack.TCPInvalidRateLimitOption
  3065  		if err := e.stack.Option(&limit); err != nil {
  3066  			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
  3067  		}
  3068  		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
  3069  			return false
  3070  		}
  3071  	}
  3072  
  3073  	e.lastOutOfWindowAckTime = now
  3074  	return true
  3075  }
  3076  
  3077  // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP.
  3078  func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
  3079  	var ss tcpip.TCPReceiveBufferSizeRangeOption
  3080  	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
  3081  		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
  3082  	}
  3083  
  3084  	return tcpip.ReceiveBufferSizeOption{
  3085  		Min:     ss.Min,
  3086  		Default: ss.Default,
  3087  		Max:     ss.Max,
  3088  	}
  3089  }