github.com/polevpn/netstack@v1.10.9/tcpip/transport/tcp/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"encoding/binary"
    19  	"fmt"
    20  	"math"
    21  	"strings"
    22  	"sync"
    23  	"sync/atomic"
    24  	"time"
    25  
    26  	"github.com/polevpn/netstack/rand"
    27  	"github.com/polevpn/netstack/sleep"
    28  	"github.com/polevpn/netstack/tcpip"
    29  	"github.com/polevpn/netstack/tcpip/buffer"
    30  	"github.com/polevpn/netstack/tcpip/hash/jenkins"
    31  	"github.com/polevpn/netstack/tcpip/header"
    32  	"github.com/polevpn/netstack/tcpip/iptables"
    33  	"github.com/polevpn/netstack/tcpip/seqnum"
    34  	"github.com/polevpn/netstack/tcpip/stack"
    35  	"github.com/polevpn/netstack/tmutex"
    36  	"github.com/polevpn/netstack/waiter"
    37  )
    38  
// EndpointState represents the state of a TCP endpoint.
type EndpointState uint32

// Endpoint states. Note that they are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
//
// The numeric values are assigned by iota in declaration order, so inserting
// or reordering constants changes the value of every constant that follows.
const (
	// Endpoint states internal to netstack. These map to the TCP state CLOSED.
	StateInitial EndpointState = iota
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError

	// TCP protocol states.
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing
)
    65  
    66  // connected is the set of states where an endpoint is connected to a peer.
    67  func (s EndpointState) connected() bool {
    68  	switch s {
    69  	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
    70  		return true
    71  	default:
    72  		return false
    73  	}
    74  }
    75  
    76  // String implements fmt.Stringer.String.
    77  func (s EndpointState) String() string {
    78  	switch s {
    79  	case StateInitial:
    80  		return "INITIAL"
    81  	case StateBound:
    82  		return "BOUND"
    83  	case StateConnecting:
    84  		return "CONNECTING"
    85  	case StateError:
    86  		return "ERROR"
    87  	case StateEstablished:
    88  		return "ESTABLISHED"
    89  	case StateSynSent:
    90  		return "SYN-SENT"
    91  	case StateSynRecv:
    92  		return "SYN-RCVD"
    93  	case StateFinWait1:
    94  		return "FIN-WAIT1"
    95  	case StateFinWait2:
    96  		return "FIN-WAIT2"
    97  	case StateTimeWait:
    98  		return "TIME-WAIT"
    99  	case StateClose:
   100  		return "CLOSED"
   101  	case StateCloseWait:
   102  		return "CLOSE-WAIT"
   103  	case StateLastAck:
   104  		return "LAST-ACK"
   105  	case StateListen:
   106  		return "LISTEN"
   107  	case StateClosing:
   108  		return "CLOSING"
   109  	default:
   110  		panic("unreachable")
   111  	}
   112  }
   113  
// Reasons for notifying the protocol goroutine.
//
// Each constant is a distinct bit (1 << iota), so several reasons can be
// combined in the endpoint's notifyFlags bitmask.
const (
	notifyNonZeroReceiveWindow = 1 << iota
	notifyReceiveWindowChanged
	notifyClose
	notifyMTUChanged
	notifyDrain
	notifyReset
	notifyKeepaliveChanged
	notifyMSSChanged
	// notifyTickleWorker is used to tickle the protocol main loop during a
	// restore after we update the endpoint state to the correct one. This
	// ensures the loop terminates if the final state of the endpoint is
	// say TIME_WAIT.
	notifyTickleWorker
)
   130  
// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks stores the SACK blocks tracked for the endpoint. Its capacity,
	// MaxSACKBlocks, is the maximum number of blocks tracked per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}
   143  
// rcvBufAutoTuneParams holds the state variables used to compute the
// auto-tuned receive buffer size.
//
// +stateify savable
type rcvBufAutoTuneParams struct {
	// measureTime is the time at which the current measurement
	// was started.
	measureTime time.Time

	// copied is the number of bytes copied out of the receive
	// buffers since this measurement began.
	copied int

	// prevCopied is the number of bytes copied out of the receive
	// buffers in the previous RTT period.
	prevCopied int

	// rtt is the non-smoothed minimum RTT as measured by observing the time
	// between when a byte is first acknowledged and the receipt of data
	// that is at least one window beyond the sequence number that was
	// acknowledged.
	rtt time.Duration

	// rttMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	rttMeasureSeqNumber seqnum.Value

	// rttMeasureTime is the absolute time at which the current rtt
	// measurement period began.
	rttMeasureTime time.Time

	// disabled is true if an explicit receive buffer is set for the
	// endpoint. newEndpoint also sets it when the stack's
	// ModerateReceiveBufferOption is false.
	disabled bool
}
   179  
// ReceiveErrors collects segment receive errors within the transport layer.
// It extends the generic tcpip.ReceiveErrors counters with TCP-specific ones.
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvList is full.
	ZeroRcvWindowState tcpip.StatCounter
}
   203  
// SendErrors collects segment send errors within the transport layer.
// It extends the generic tcpip.SendErrors counters with TCP-specific ones.
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments that failed
	// to be sent to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs that failed to be
	// sent to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}
   226  
// Stats holds statistics about the endpoint. Each endpoint carries its own
// Stats value (see the stats field of endpoint).
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}
   253  
// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface. It has no behavior of its own.
func (*Stats) IsEndpointStats() {}
   257  
// EndpointInfo holds useful information about a transport endpoint which
// can be queried by monitoring tools.
//
// +stateify savable
type EndpointInfo struct {
	stack.TransportEndpointInfo

	// HardError is meaningful only when state is StateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. HardError is protected by endpoint mu.
	HardError *tcpip.Error
}
   270  
// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
// marker interface. It has no behavior of its own.
func (*EndpointInfo) IsEndpointInfo() {}
   274  
// endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// +stateify savable
type endpoint struct {
	EndpointInfo

	// workMu is used to arbitrate which goroutine may perform protocol
	// work. Only the main protocol goroutine is expected to call Lock() on
	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
	// perform work without having to wait for the main one to wake up.
	workMu tmutex.Mutex

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack
	waiterQueue *waiter.Queue
	uniqueID    uint64

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex
	lastError   *tcpip.Error

	// The following fields are used to manage the receive queue. The
	// protocol goroutine adds ready-for-delivery segments to rcvList,
	// which are returned by Read() calls to users.
	//
	// Once the peer has closed its send side, rcvClosed is set to true
	// to indicate to users that no more data is coming.
	//
	// rcvListMu can be taken after the endpoint mu below.
	rcvListMu     sync.Mutex
	rcvList       segmentList
	rcvClosed     bool
	rcvBufSize    int
	rcvBufUsed    int
	rcvAutoParams rcvBufAutoTuneParams
	// zeroWindow indicates that the window was closed due to receive buffer
	// space being filled up. This is set by the worker goroutine before
	// moving a segment to the rcvList. This setting is cleared by the
	// endpoint when a Read() call reads enough data for the new window to
	// be non-zero.
	zeroWindow bool

	// The following fields are protected by the mutex.
	mu sync.RWMutex

	state EndpointState

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to its correct
	// state.
	origEndpointState EndpointState

	isPortReserved    bool
	isRegistered      bool
	boundNICID        tcpip.NICID
	route             stack.Route
	ttl               uint8
	v6only            bool
	isConnectNotified bool
	// TCP should never broadcast but Linux nevertheless supports enabling/
	// disabling SO_BROADCAST, albeit as a NOOP.
	broadcast bool

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// workerRunning specifies if a worker goroutine is running.
	workerRunning bool

	// workerCleanup specifies if the worker goroutine must perform cleanup
	// before exiting. This can only be set to true when workerRunning is
	// also true, and they're both protected by the mutex.
	workerCleanup bool

	// sendTSOk is used to indicate when the TS Option has been negotiated.
	// When sendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1
	sendTSOk bool

	// recentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp for future segments sent by the endpoint. This field is
	// updated if required when a new segment is received by this endpoint.
	recentTS uint32

	// tsOffset is a randomized offset added to the value of the
	// TSVal field in the timestamp option.
	tsOffset uint32

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// sackPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	sackPermitted bool

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// reusePort is set to true if SO_REUSEPORT is enabled.
	reusePort bool

	// bindToDevice is set to the NIC on which to bind or disabled if 0.
	bindToDevice tcpip.NICID

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// cork holds back segments until full.
	//
	// cork is a boolean (0 is false) and must be accessed atomically.
	cork uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// The options below aren't implemented, but we remember the user
	// settings because applications expect to be able to set/query these
	// options.
	reuseAddr bool

	// slowAck holds the negated state of quick ack. It is stubbed out and
	// does nothing.
	//
	// slowAck is a boolean (0 is false) and must be accessed atomically.
	slowAck uint32

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue

	// synRcvdCount is the number of connections for this endpoint that are
	// in SYN-RCVD state.
	synRcvdCount int

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// The following fields are used to manage the send buffer. When
	// segments are ready to be sent, they are added to sndQueue and the
	// protocol goroutine is signaled via sndWaker.
	//
	// When the send side is closed, the protocol goroutine is notified via
	// sndCloseWaker, and sndClosed is set to true.
	sndBufMu      sync.Mutex
	sndBufSize    int
	sndBufUsed    int
	sndClosed     bool
	sndBufInQueue seqnum.Size
	sndQueue      segmentList
	sndWaker      sleep.Waker
	sndCloseWaker sleep.Waker

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// The following are used when a "packet too big" control packet is
	// received. They are protected by sndBufMu. They are used to
	// communicate to the main protocol goroutine how many such control
	// messages have been received since the last notification was processed
	// and what was the smallest MTU seen.
	packetTooBigCount int
	sndMTU            int

	// newSegmentWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and handle new segments queued to it.
	newSegmentWaker sleep.Waker

	// notificationWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and check for notifications.
	notificationWaker sleep.Waker

	// notifyFlags is a bitmask of flags used to indicate to the protocol
	// goroutine what it was notified about; this is only accessed
	// atomically.
	notifyFlags uint32

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepalive.idle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// pendingAccepted is a synchronization primitive used to track number
	// of connections that are queued up to be delivered to the accepted
	// channel. We use this to ensure that all goroutines blocked on writing
	// to the acceptedChan below terminate before we close acceptedChan.
	pendingAccepted sync.WaitGroup

	// acceptedChan is used by a listening endpoint protocol goroutine to
	// send newly accepted connections to the endpoint so that they can be
	// read by Accept() calls.
	acceptedChan chan *endpoint

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver
	snd *sender

	// The goroutine drain completion notification channel.
	drainDone chan struct{}

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{}

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso *stack.GSO

	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
	stats Stats

	// tcpLingerTimeout is the maximum amount of time a socket stays in
	// TIME_WAIT state before being marked closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called Close on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool
}
   527  
// UniqueID implements stack.TransportEndpoint.UniqueID.
//
// It returns the identifier stored in the endpoint at creation time, which
// newEndpoint obtains from the stack's UniqueID method.
func (e *endpoint) UniqueID() uint64 {
	return e.uniqueID
}
   532  
   533  // calculateAdvertisedMSS calculates the MSS to advertise.
   534  //
   535  // If userMSS is non-zero and is not greater than the maximum possible MSS for
   536  // r, it will be used; otherwise, the maximum possible MSS will be used.
   537  func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
   538  	// The maximum possible MSS is dependent on the route.
   539  	maxMSS := mssForRoute(&r)
   540  
   541  	if userMSS != 0 && userMSS < maxMSS {
   542  		return userMSS
   543  	}
   544  
   545  	return maxMSS
   546  }
   547  
// StopWork halts packet processing. Only to be used in tests.
//
// It does so by locking workMu, the mutex that arbitrates which goroutine may
// perform protocol work; call ResumeWork to release it again.
func (e *endpoint) StopWork() {
	e.workMu.Lock()
}
   552  
// ResumeWork resumes packet processing. Only to be used in tests.
//
// It unlocks workMu, which StopWork locks to halt processing.
func (e *endpoint) ResumeWork() {
	e.workMu.Unlock()
}
   557  
// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// It embeds its own sync.Mutex. Per the comment on the endpoint's keepalive
// field: once the connection has been idle for idle, keepalives are sent
// every interval, and the connection is closed after count keepalives go
// unanswered. newEndpoint initializes idle, interval and count to the Linux
// defaults (2 hours, 75 seconds and 9).
//
// NOTE(review): enabled, unacked, timer and waker are not described anywhere
// in this part of the file; presumably unacked counts keepalives sent without
// a response — confirm against the keepalive handling code.
//
// +stateify savable
type keepalive struct {
	sync.Mutex
	enabled  bool
	idle     time.Duration
	interval time.Duration
	count    int
	unacked  int
	timer    timer
	waker    sleep.Waker
}
   572  
   573  func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
   574  	e := &endpoint{
   575  		stack: s,
   576  		EndpointInfo: EndpointInfo{
   577  			TransportEndpointInfo: stack.TransportEndpointInfo{
   578  				NetProto:   netProto,
   579  				TransProto: header.TCPProtocolNumber,
   580  			},
   581  		},
   582  		waiterQueue: waiterQueue,
   583  		state:       StateInitial,
   584  		rcvBufSize:  DefaultReceiveBufferSize,
   585  		sndBufSize:  DefaultSendBufferSize,
   586  		sndMTU:      int(math.MaxInt32),
   587  		reuseAddr:   true,
   588  		keepalive: keepalive{
   589  			// Linux defaults.
   590  			idle:     2 * time.Hour,
   591  			interval: 75 * time.Second,
   592  			count:    9,
   593  		},
   594  		uniqueID: s.UniqueID(),
   595  	}
   596  
   597  	var ss SendBufferSizeOption
   598  	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
   599  		e.sndBufSize = ss.Default
   600  	}
   601  
   602  	var rs ReceiveBufferSizeOption
   603  	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
   604  		e.rcvBufSize = rs.Default
   605  	}
   606  
   607  	var cs tcpip.CongestionControlOption
   608  	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
   609  		e.cc = cs
   610  	}
   611  
   612  	var mrb tcpip.ModerateReceiveBufferOption
   613  	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
   614  		e.rcvAutoParams.disabled = !bool(mrb)
   615  	}
   616  
   617  	var de DelayEnabled
   618  	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
   619  		e.SetSockOptInt(tcpip.DelayOption, 1)
   620  	}
   621  
   622  	var tcpLT tcpip.TCPLingerTimeoutOption
   623  	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
   624  		e.tcpLingerTimeout = time.Duration(tcpLT)
   625  	}
   626  
   627  	if p := s.GetTCPProbe(); p != nil {
   628  		e.probe = p
   629  	}
   630  
   631  	e.segmentQueue.setLimit(MaxUnprocessedSegments)
   632  	e.workMu.Init()
   633  	e.workMu.Lock()
   634  	e.tsOffset = timeStampOffset()
   635  
   636  	return e
   637  }
   638  
   639  // Readiness returns the current readiness of the endpoint. For example, if
   640  // waiter.EventIn is set, the endpoint is immediately readable.
   641  func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
   642  	result := waiter.EventMask(0)
   643  
   644  	e.mu.RLock()
   645  	defer e.mu.RUnlock()
   646  
   647  	switch e.state {
   648  	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
   649  		// Ready for nothing.
   650  
   651  	case StateClose, StateError:
   652  		// Ready for anything.
   653  		result = mask
   654  
   655  	case StateListen:
   656  		// Check if there's anything in the accepted channel.
   657  		if (mask & waiter.EventIn) != 0 {
   658  			if len(e.acceptedChan) > 0 {
   659  				result |= waiter.EventIn
   660  			}
   661  		}
   662  	}
   663  	if e.state.connected() {
   664  		// Determine if the endpoint is writable if requested.
   665  		if (mask & waiter.EventOut) != 0 {
   666  			e.sndBufMu.Lock()
   667  			if e.sndClosed || e.sndBufUsed < e.sndBufSize {
   668  				result |= waiter.EventOut
   669  			}
   670  			e.sndBufMu.Unlock()
   671  		}
   672  
   673  		// Determine if the endpoint is readable if requested.
   674  		if (mask & waiter.EventIn) != 0 {
   675  			e.rcvListMu.Lock()
   676  			if e.rcvBufUsed > 0 || e.rcvClosed {
   677  				result |= waiter.EventIn
   678  			}
   679  			e.rcvListMu.Unlock()
   680  		}
   681  	}
   682  
   683  	return result
   684  }
   685  
// fetchNotifications atomically swaps notifyFlags with zero and returns the
// previous value, i.e. the bitmask of notify* reasons that were pending. The
// flags are therefore cleared by the same operation that reads them.
func (e *endpoint) fetchNotifications() uint32 {
	return atomic.SwapUint32(&e.notifyFlags, 0)
}
   689  
   690  func (e *endpoint) notifyProtocolGoroutine(n uint32) {
   691  	for {
   692  		v := atomic.LoadUint32(&e.notifyFlags)
   693  		if v&n == n {
   694  			// The flags are already set.
   695  			return
   696  		}
   697  
   698  		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
   699  			if v == 0 {
   700  				// We are causing a transition from no flags to
   701  				// at least one flag set, so we must cause the
   702  				// protocol goroutine to wake up.
   703  				e.notificationWaker.Assert()
   704  			}
   705  			return
   706  		}
   707  	}
   708  }
   709  
   710  // Close puts the endpoint in a closed state and frees all resources associated
   711  // with it. It must be called only once and with no other concurrent calls to
   712  // the endpoint.
   713  func (e *endpoint) Close() {
   714  	e.mu.Lock()
   715  	closed := e.closed
   716  	e.mu.Unlock()
   717  	if closed {
   718  		return
   719  	}
   720  
   721  	// Issue a shutdown so that the peer knows we won't send any more data
   722  	// if we're connected, or stop accepting if we're listening.
   723  	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
   724  
   725  	e.mu.Lock()
   726  
   727  	// For listening sockets, we always release ports inline so that they
   728  	// are immediately available for reuse after Close() is called. If also
   729  	// registered, we unregister as well otherwise the next user would fail
   730  	// in Listen() when trying to register.
   731  	if e.state == StateListen && e.isPortReserved {
   732  		if e.isRegistered {
   733  			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
   734  			e.isRegistered = false
   735  		}
   736  
   737  		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
   738  		e.isPortReserved = false
   739  	}
   740  
   741  	// Mark endpoint as closed.
   742  	e.closed = true
   743  	// Either perform the local cleanup or kick the worker to make sure it
   744  	// knows it needs to cleanup.
   745  	tcpip.AddDanglingEndpoint(e)
   746  	if !e.workerRunning {
   747  		e.cleanupLocked()
   748  	} else {
   749  		e.workerCleanup = true
   750  		e.notifyProtocolGoroutine(notifyClose)
   751  	}
   752  
   753  	e.mu.Unlock()
   754  }
   755  
// closePendingAcceptableConnectionsLocked closes all connections that have
// completed handshake but not yet been delivered to the application.
//
// It blocks until every pending delivery tracked by pendingAccepted has
// finished and the drain goroutine has consumed everything left in
// acceptedChan. On return acceptedChan has been closed and set to nil.
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
	// done is closed by the drain goroutine once acceptedChan has been
	// closed and fully consumed.
	done := make(chan struct{})
	// Spin a goroutine up as ranging on e.acceptedChan will just block when
	// there are no more connections in the channel. Using a non-blocking
	// select does not work as it can potentially select the default case
	// even when there are pending writes but that are not yet written to
	// the channel.
	go func() {
		defer close(done)
		for n := range e.acceptedChan {
			// Flag each undelivered connection for reset, then close it.
			n.notifyProtocolGoroutine(notifyReset)
			n.Close()
		}
	}()
	// pendingAccepted(see endpoint.deliverAccepted) tracks the number of
	// endpoints which have completed handshake but are not yet written to
	// the e.acceptedChan. We wait here till the goroutine above can drain
	// all such connections from e.acceptedChan.
	e.pendingAccepted.Wait()
	// No writers remain, so closing the channel is safe and terminates the
	// range loop in the goroutine above.
	close(e.acceptedChan)
	<-done
	e.acceptedChan = nil
}
   781  
// cleanupLocked frees all resources associated with the endpoint. It is called
// after Close() is called and the worker goroutine (if any) is done with its
// work.
//
// In order, it closes connections still queued for Accept, clears
// workerCleanup, unregisters the endpoint from the stack if registered,
// releases its reserved port if any, releases the route, completes the
// stack-side transport endpoint cleanup and removes the endpoint from the
// dangling-endpoint set. Close() calls it with e.mu held.
func (e *endpoint) cleanupLocked() {
	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	if e.acceptedChan != nil {
		e.closePendingAcceptableConnectionsLocked()
	}
	e.workerCleanup = false

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
		e.isPortReserved = false
	}

	e.route.Release()
	e.stack.CompleteTransportEndpointCleanup(e)
	// Counterpart of the tcpip.AddDanglingEndpoint call made in Close().
	tcpip.DeleteDanglingEndpoint(e)
}
   807  
   808  // initialReceiveWindow returns the initial receive window to advertise in the
   809  // SYN/SYN-ACK.
   810  func (e *endpoint) initialReceiveWindow() int {
   811  	rcvWnd := e.receiveBufferAvailable()
   812  	if rcvWnd > math.MaxUint16 {
   813  		rcvWnd = math.MaxUint16
   814  	}
   815  
   816  	// Use the user supplied MSS, if available.
   817  	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
   818  	if rcvWnd > routeWnd {
   819  		rcvWnd = routeWnd
   820  	}
   821  	return rcvWnd
   822  }
   823  
   824  // ModerateRecvBuf adjusts the receive buffer and the advertised window
   825  // based on the number of bytes copied to user space.
   826  func (e *endpoint) ModerateRecvBuf(copied int) {
   827  	e.rcvListMu.Lock()
   828  	if e.rcvAutoParams.disabled {
   829  		e.rcvListMu.Unlock()
   830  		return
   831  	}
   832  	now := time.Now()
   833  	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
   834  		e.rcvAutoParams.copied += copied
   835  		e.rcvListMu.Unlock()
   836  		return
   837  	}
   838  	prevRTTCopied := e.rcvAutoParams.copied + copied
   839  	prevCopied := e.rcvAutoParams.prevCopied
   840  	rcvWnd := 0
   841  	if prevRTTCopied > prevCopied {
   842  		// The minimal receive window based on what was copied by the app
   843  		// in the immediate preceding RTT and some extra buffer for 16
   844  		// segments to account for variations.
   845  		// We multiply by 2 to account for packet losses.
   846  		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)
   847  
   848  		// Scale for slow start based on bytes copied in this RTT vs previous.
   849  		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied
   850  
   851  		// Multiply growth factor by 2 again to account for sender being
   852  		// in slow-start where the sender grows it's congestion window
   853  		// by 100% per RTT.
   854  		rcvWnd += grow * 2
   855  
   856  		// Make sure auto tuned buffer size can always receive upto 2x
   857  		// the initial window of 10 segments.
   858  		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
   859  			rcvWnd = minRcvWnd
   860  		}
   861  
   862  		// Cap the auto tuned buffer size by the maximum permissible
   863  		// receive buffer size.
   864  		if max := e.maxReceiveBufferSize(); rcvWnd > max {
   865  			rcvWnd = max
   866  		}
   867  
   868  		// We do not adjust downwards as that can cause the receiver to
   869  		// reject valid data that might already be in flight as the
   870  		// acceptable window will shrink.
   871  		if rcvWnd > e.rcvBufSize {
   872  			e.rcvBufSize = rcvWnd
   873  			e.notifyProtocolGoroutine(notifyReceiveWindowChanged)
   874  		}
   875  
   876  		// We only update prevCopied when we grow the buffer because in cases
   877  		// where prevCopied > prevRTTCopied the existing buffer is already big
   878  		// enough to handle the current rate and we don't need to do any
   879  		// adjustments.
   880  		e.rcvAutoParams.prevCopied = prevRTTCopied
   881  	}
   882  	e.rcvAutoParams.measureTime = now
   883  	e.rcvAutoParams.copied = 0
   884  	e.rcvListMu.Unlock()
   885  }
   886  
   887  // IPTables implements tcpip.Endpoint.IPTables.
   888  func (e *endpoint) IPTables() (iptables.IPTables, error) {
   889  	return e.stack.IPTables(), nil
   890  }
   891  
   892  // Read reads data from the endpoint.
   893  func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
   894  	e.mu.RLock()
   895  	// The endpoint can be read if it's connected, or if it's already closed
   896  	// but has some pending unread data. Also note that a RST being received
   897  	// would cause the state to become StateError so we should allow the
   898  	// reads to proceed before returning a ECONNRESET.
   899  	e.rcvListMu.Lock()
   900  	bufUsed := e.rcvBufUsed
   901  	if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
   902  		e.rcvListMu.Unlock()
   903  		he := e.HardError
   904  		e.mu.RUnlock()
   905  		if s == StateError {
   906  			return buffer.View{}, tcpip.ControlMessages{}, he
   907  		}
   908  		e.stats.ReadErrors.InvalidEndpointState.Increment()
   909  		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
   910  	}
   911  
   912  	v, err := e.readLocked()
   913  	e.rcvListMu.Unlock()
   914  
   915  	e.mu.RUnlock()
   916  
   917  	if err == tcpip.ErrClosedForReceive {
   918  		e.stats.ReadErrors.ReadClosed.Increment()
   919  	}
   920  	return v, tcpip.ControlMessages{}, err
   921  }
   922  
   923  func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
   924  	if e.rcvBufUsed == 0 {
   925  		if e.rcvClosed || !e.state.connected() {
   926  			return buffer.View{}, tcpip.ErrClosedForReceive
   927  		}
   928  		return buffer.View{}, tcpip.ErrWouldBlock
   929  	}
   930  
   931  	s := e.rcvList.Front()
   932  	views := s.data.Views()
   933  	v := views[s.viewToDeliver]
   934  	s.viewToDeliver++
   935  
   936  	if s.viewToDeliver >= len(views) {
   937  		e.rcvList.Remove(s)
   938  		s.decRef()
   939  	}
   940  
   941  	e.rcvBufUsed -= len(v)
   942  	// If the window was zero before this read and if the read freed up
   943  	// enough buffer space for the scaled window to be non-zero then notify
   944  	// the protocol goroutine to send a window update.
   945  	if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) {
   946  		e.zeroWindow = false
   947  		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
   948  	}
   949  
   950  	return v, nil
   951  }
   952  
   953  // isEndpointWritableLocked checks if a given endpoint is writable
   954  // and also returns the number of bytes that can be written at this
   955  // moment. If the endpoint is not writable then it returns an error
   956  // indicating the reason why it's not writable.
   957  // Caller must hold e.mu and e.sndBufMu
   958  func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
   959  	// The endpoint cannot be written to if it's not connected.
   960  	if !e.state.connected() {
   961  		switch e.state {
   962  		case StateError:
   963  			return 0, e.HardError
   964  		default:
   965  			return 0, tcpip.ErrClosedForSend
   966  		}
   967  	}
   968  
   969  	// Check if the connection has already been closed for sends.
   970  	if e.sndClosed {
   971  		return 0, tcpip.ErrClosedForSend
   972  	}
   973  
   974  	avail := e.sndBufSize - e.sndBufUsed
   975  	if avail <= 0 {
   976  		return 0, tcpip.ErrWouldBlock
   977  	}
   978  	return avail, nil
   979  }
   980  
   981  // Write writes data to the endpoint's peer.
   982  func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
   983  	// Linux completely ignores any address passed to sendto(2) for TCP sockets
   984  	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
   985  	// and opts.EndOfRecord are also ignored.
   986  
   987  	e.mu.RLock()
   988  	e.sndBufMu.Lock()
   989  
   990  	avail, err := e.isEndpointWritableLocked()
   991  	if err != nil {
   992  		e.sndBufMu.Unlock()
   993  		e.mu.RUnlock()
   994  		e.stats.WriteErrors.WriteClosed.Increment()
   995  		return 0, nil, err
   996  	}
   997  
   998  	// We can release locks while copying data.
   999  	//
  1000  	// This is not possible if atomic is set, because we can't allow the
  1001  	// available buffer space to be consumed by some other caller while we
  1002  	// are copying data in.
  1003  	if !opts.Atomic {
  1004  		e.sndBufMu.Unlock()
  1005  		e.mu.RUnlock()
  1006  	}
  1007  
  1008  	// Fetch data.
  1009  	v, perr := p.Payload(avail)
  1010  	if perr != nil || len(v) == 0 {
  1011  		if opts.Atomic { // See above.
  1012  			e.sndBufMu.Unlock()
  1013  			e.mu.RUnlock()
  1014  		}
  1015  		// Note that perr may be nil if len(v) == 0.
  1016  		return 0, nil, perr
  1017  	}
  1018  
  1019  	if !opts.Atomic { // See above.
  1020  		e.mu.RLock()
  1021  		e.sndBufMu.Lock()
  1022  
  1023  		// Because we released the lock before copying, check state again
  1024  		// to make sure the endpoint is still in a valid state for a write.
  1025  		avail, err = e.isEndpointWritableLocked()
  1026  		if err != nil {
  1027  			e.sndBufMu.Unlock()
  1028  			e.mu.RUnlock()
  1029  			e.stats.WriteErrors.WriteClosed.Increment()
  1030  			return 0, nil, err
  1031  		}
  1032  
  1033  		// Discard any excess data copied in due to avail being reduced due
  1034  		// to a simultaneous write call to the socket.
  1035  		if avail < len(v) {
  1036  			v = v[:avail]
  1037  		}
  1038  	}
  1039  
  1040  	// Add data to the send queue.
  1041  	s := newSegmentFromView(&e.route, e.ID, v)
  1042  	e.sndBufUsed += len(v)
  1043  	e.sndBufInQueue += seqnum.Size(len(v))
  1044  	e.sndQueue.PushBack(s)
  1045  	e.sndBufMu.Unlock()
  1046  	// Release the endpoint lock to prevent deadlocks due to lock
  1047  	// order inversion when acquiring workMu.
  1048  	e.mu.RUnlock()
  1049  
  1050  	if e.workMu.TryLock() {
  1051  		// Do the work inline.
  1052  		e.handleWrite()
  1053  		e.workMu.Unlock()
  1054  	} else {
  1055  		// Let the protocol goroutine do the work.
  1056  		e.sndWaker.Assert()
  1057  	}
  1058  
  1059  	return int64(len(v)), nil, nil
  1060  }
  1061  
  1062  // Peek reads data without consuming it from the endpoint.
  1063  //
  1064  // This method does not block if there is no data pending.
  1065  func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
  1066  	e.mu.RLock()
  1067  	defer e.mu.RUnlock()
  1068  
  1069  	// The endpoint can be read if it's connected, or if it's already closed
  1070  	// but has some pending unread data.
  1071  	if s := e.state; !s.connected() && s != StateClose {
  1072  		if s == StateError {
  1073  			return 0, tcpip.ControlMessages{}, e.HardError
  1074  		}
  1075  		e.stats.ReadErrors.InvalidEndpointState.Increment()
  1076  		return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
  1077  	}
  1078  
  1079  	e.rcvListMu.Lock()
  1080  	defer e.rcvListMu.Unlock()
  1081  
  1082  	if e.rcvBufUsed == 0 {
  1083  		if e.rcvClosed || !e.state.connected() {
  1084  			e.stats.ReadErrors.ReadClosed.Increment()
  1085  			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
  1086  		}
  1087  		return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
  1088  	}
  1089  
  1090  	// Make a copy of vec so we can modify the slide headers.
  1091  	vec = append([][]byte(nil), vec...)
  1092  
  1093  	var num int64
  1094  	for s := e.rcvList.Front(); s != nil; s = s.Next() {
  1095  		views := s.data.Views()
  1096  
  1097  		for i := s.viewToDeliver; i < len(views); i++ {
  1098  			v := views[i]
  1099  
  1100  			for len(v) > 0 {
  1101  				if len(vec) == 0 {
  1102  					return num, tcpip.ControlMessages{}, nil
  1103  				}
  1104  				if len(vec[0]) == 0 {
  1105  					vec = vec[1:]
  1106  					continue
  1107  				}
  1108  
  1109  				n := copy(vec[0], v)
  1110  				v = v[n:]
  1111  				vec[0] = vec[0][n:]
  1112  				num += int64(n)
  1113  			}
  1114  		}
  1115  	}
  1116  
  1117  	return num, tcpip.ControlMessages{}, nil
  1118  }
  1119  
  1120  // zeroReceiveWindow checks if the receive window to be announced now would be
  1121  // zero, based on the amount of available buffer and the receive window scaling.
  1122  //
  1123  // It must be called with rcvListMu held.
  1124  func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
  1125  	if e.rcvBufUsed >= e.rcvBufSize {
  1126  		return true
  1127  	}
  1128  
  1129  	return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
  1130  }
  1131  
  1132  // SetSockOptInt sets a socket option.
  1133  func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
  1134  	switch opt {
  1135  	case tcpip.ReceiveBufferSizeOption:
  1136  		// Make sure the receive buffer size is within the min and max
  1137  		// allowed.
  1138  		var rs ReceiveBufferSizeOption
  1139  		size := int(v)
  1140  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
  1141  			if size < rs.Min {
  1142  				size = rs.Min
  1143  			}
  1144  			if size > rs.Max {
  1145  				size = rs.Max
  1146  			}
  1147  		}
  1148  
  1149  		mask := uint32(notifyReceiveWindowChanged)
  1150  
  1151  		e.rcvListMu.Lock()
  1152  
  1153  		// Make sure the receive buffer size allows us to send a
  1154  		// non-zero window size.
  1155  		scale := uint8(0)
  1156  		if e.rcv != nil {
  1157  			scale = e.rcv.rcvWndScale
  1158  		}
  1159  		if size>>scale == 0 {
  1160  			size = 1 << scale
  1161  		}
  1162  
  1163  		// Make sure 2*size doesn't overflow.
  1164  		if size > math.MaxInt32/2 {
  1165  			size = math.MaxInt32 / 2
  1166  		}
  1167  
  1168  		e.rcvBufSize = size
  1169  		e.rcvAutoParams.disabled = true
  1170  		if e.zeroWindow && !e.zeroReceiveWindow(scale) {
  1171  			e.zeroWindow = false
  1172  			mask |= notifyNonZeroReceiveWindow
  1173  		}
  1174  		e.rcvListMu.Unlock()
  1175  
  1176  		e.notifyProtocolGoroutine(mask)
  1177  		return nil
  1178  
  1179  	case tcpip.SendBufferSizeOption:
  1180  		// Make sure the send buffer size is within the min and max
  1181  		// allowed.
  1182  		size := int(v)
  1183  		var ss SendBufferSizeOption
  1184  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
  1185  			if size < ss.Min {
  1186  				size = ss.Min
  1187  			}
  1188  			if size > ss.Max {
  1189  				size = ss.Max
  1190  			}
  1191  		}
  1192  
  1193  		e.sndBufMu.Lock()
  1194  		e.sndBufSize = size
  1195  		e.sndBufMu.Unlock()
  1196  		return nil
  1197  
  1198  	case tcpip.DelayOption:
  1199  		if v == 0 {
  1200  			atomic.StoreUint32(&e.delay, 0)
  1201  
  1202  			// Handle delayed data.
  1203  			e.sndWaker.Assert()
  1204  		} else {
  1205  			atomic.StoreUint32(&e.delay, 1)
  1206  		}
  1207  		return nil
  1208  
  1209  	default:
  1210  		return nil
  1211  	}
  1212  }
  1213  
  1214  // SetSockOpt sets a socket option.
  1215  func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
  1216  	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
  1217  	const inetECNMask = 3
  1218  	switch v := opt.(type) {
  1219  	case tcpip.CorkOption:
  1220  		if v == 0 {
  1221  			atomic.StoreUint32(&e.cork, 0)
  1222  
  1223  			// Handle the corked data.
  1224  			e.sndWaker.Assert()
  1225  		} else {
  1226  			atomic.StoreUint32(&e.cork, 1)
  1227  		}
  1228  		return nil
  1229  
  1230  	case tcpip.ReuseAddressOption:
  1231  		e.mu.Lock()
  1232  		e.reuseAddr = v != 0
  1233  		e.mu.Unlock()
  1234  		return nil
  1235  
  1236  	case tcpip.ReusePortOption:
  1237  		e.mu.Lock()
  1238  		e.reusePort = v != 0
  1239  		e.mu.Unlock()
  1240  		return nil
  1241  
  1242  	case tcpip.BindToDeviceOption:
  1243  		e.mu.Lock()
  1244  		defer e.mu.Unlock()
  1245  		if v == "" {
  1246  			e.bindToDevice = 0
  1247  			return nil
  1248  		}
  1249  		for nicID, nic := range e.stack.NICInfo() {
  1250  			if nic.Name == string(v) {
  1251  				e.bindToDevice = nicID
  1252  				return nil
  1253  			}
  1254  		}
  1255  		return tcpip.ErrUnknownDevice
  1256  
  1257  	case tcpip.QuickAckOption:
  1258  		if v == 0 {
  1259  			atomic.StoreUint32(&e.slowAck, 1)
  1260  		} else {
  1261  			atomic.StoreUint32(&e.slowAck, 0)
  1262  		}
  1263  		return nil
  1264  
  1265  	case tcpip.MaxSegOption:
  1266  		userMSS := v
  1267  		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
  1268  			return tcpip.ErrInvalidOptionValue
  1269  		}
  1270  		e.mu.Lock()
  1271  		e.userMSS = uint16(userMSS)
  1272  		e.mu.Unlock()
  1273  		e.notifyProtocolGoroutine(notifyMSSChanged)
  1274  		return nil
  1275  
  1276  	case tcpip.V6OnlyOption:
  1277  		// We only recognize this option on v6 endpoints.
  1278  		if e.NetProto != header.IPv6ProtocolNumber {
  1279  			return tcpip.ErrInvalidEndpointState
  1280  		}
  1281  
  1282  		e.mu.Lock()
  1283  		defer e.mu.Unlock()
  1284  
  1285  		// We only allow this to be set when we're in the initial state.
  1286  		if e.state != StateInitial {
  1287  			return tcpip.ErrInvalidEndpointState
  1288  		}
  1289  
  1290  		e.v6only = v != 0
  1291  		return nil
  1292  
  1293  	case tcpip.TTLOption:
  1294  		e.mu.Lock()
  1295  		e.ttl = uint8(v)
  1296  		e.mu.Unlock()
  1297  		return nil
  1298  
  1299  	case tcpip.KeepaliveEnabledOption:
  1300  		e.keepalive.Lock()
  1301  		e.keepalive.enabled = v != 0
  1302  		e.keepalive.Unlock()
  1303  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1304  		return nil
  1305  
  1306  	case tcpip.KeepaliveIdleOption:
  1307  		e.keepalive.Lock()
  1308  		e.keepalive.idle = time.Duration(v)
  1309  		e.keepalive.Unlock()
  1310  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1311  		return nil
  1312  
  1313  	case tcpip.KeepaliveIntervalOption:
  1314  		e.keepalive.Lock()
  1315  		e.keepalive.interval = time.Duration(v)
  1316  		e.keepalive.Unlock()
  1317  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1318  		return nil
  1319  
  1320  	case tcpip.KeepaliveCountOption:
  1321  		e.keepalive.Lock()
  1322  		e.keepalive.count = int(v)
  1323  		e.keepalive.Unlock()
  1324  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1325  		return nil
  1326  
  1327  	case tcpip.BroadcastOption:
  1328  		e.mu.Lock()
  1329  		e.broadcast = v != 0
  1330  		e.mu.Unlock()
  1331  		return nil
  1332  
  1333  	case tcpip.CongestionControlOption:
  1334  		// Query the available cc algorithms in the stack and
  1335  		// validate that the specified algorithm is actually
  1336  		// supported in the stack.
  1337  		var avail tcpip.AvailableCongestionControlOption
  1338  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
  1339  			return err
  1340  		}
  1341  		availCC := strings.Split(string(avail), " ")
  1342  		for _, cc := range availCC {
  1343  			if v == tcpip.CongestionControlOption(cc) {
  1344  				// Acquire the work mutex as we may need to
  1345  				// reinitialize the congestion control state.
  1346  				e.mu.Lock()
  1347  				state := e.state
  1348  				e.cc = v
  1349  				e.mu.Unlock()
  1350  				switch state {
  1351  				case StateEstablished:
  1352  					e.workMu.Lock()
  1353  					e.mu.Lock()
  1354  					if e.state == state {
  1355  						e.snd.cc = e.snd.initCongestionControl(e.cc)
  1356  					}
  1357  					e.mu.Unlock()
  1358  					e.workMu.Unlock()
  1359  				}
  1360  				return nil
  1361  			}
  1362  		}
  1363  
  1364  		// Linux returns ENOENT when an invalid congestion
  1365  		// control algorithm is specified.
  1366  		return tcpip.ErrNoSuchFile
  1367  
  1368  	case tcpip.IPv4TOSOption:
  1369  		e.mu.Lock()
  1370  		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
  1371  		// ignore the bits for now.
  1372  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1373  		e.mu.Unlock()
  1374  		return nil
  1375  
  1376  	case tcpip.IPv6TrafficClassOption:
  1377  		e.mu.Lock()
  1378  		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
  1379  		// ignore the bits for now.
  1380  		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
  1381  		e.mu.Unlock()
  1382  		return nil
  1383  
  1384  	case tcpip.TCPLingerTimeoutOption:
  1385  		e.mu.Lock()
  1386  		if v < 0 {
  1387  			// Same as effectively disabling TCPLinger timeout.
  1388  			v = 0
  1389  		}
  1390  		var stkTCPLingerTimeout tcpip.TCPLingerTimeoutOption
  1391  		if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &stkTCPLingerTimeout); err != nil {
  1392  			// We were unable to retrieve a stack config, just use
  1393  			// the DefaultTCPLingerTimeout.
  1394  			if v > tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout) {
  1395  				stkTCPLingerTimeout = tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout)
  1396  			}
  1397  		}
  1398  		// Cap it to the stack wide TCPLinger timeout.
  1399  		if v > stkTCPLingerTimeout {
  1400  			v = stkTCPLingerTimeout
  1401  		}
  1402  		e.tcpLingerTimeout = time.Duration(v)
  1403  		e.mu.Unlock()
  1404  		return nil
  1405  
  1406  	default:
  1407  		return nil
  1408  	}
  1409  }
  1410  
  1411  // readyReceiveSize returns the number of bytes ready to be received.
  1412  func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
  1413  	e.mu.RLock()
  1414  	defer e.mu.RUnlock()
  1415  
  1416  	// The endpoint cannot be in listen state.
  1417  	if e.state == StateListen {
  1418  		return 0, tcpip.ErrInvalidEndpointState
  1419  	}
  1420  
  1421  	e.rcvListMu.Lock()
  1422  	defer e.rcvListMu.Unlock()
  1423  
  1424  	return e.rcvBufUsed, nil
  1425  }
  1426  
  1427  // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
  1428  func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
  1429  	switch opt {
  1430  	case tcpip.ReceiveQueueSizeOption:
  1431  		return e.readyReceiveSize()
  1432  
  1433  	case tcpip.SendBufferSizeOption:
  1434  		e.sndBufMu.Lock()
  1435  		v := e.sndBufSize
  1436  		e.sndBufMu.Unlock()
  1437  		return v, nil
  1438  
  1439  	case tcpip.ReceiveBufferSizeOption:
  1440  		e.rcvListMu.Lock()
  1441  		v := e.rcvBufSize
  1442  		e.rcvListMu.Unlock()
  1443  		return v, nil
  1444  
  1445  	case tcpip.DelayOption:
  1446  		var o int
  1447  		if v := atomic.LoadUint32(&e.delay); v != 0 {
  1448  			o = 1
  1449  		}
  1450  		return o, nil
  1451  
  1452  	default:
  1453  		return -1, tcpip.ErrUnknownProtocolOption
  1454  	}
  1455  }
  1456  
  1457  // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
  1458  func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
  1459  	switch o := opt.(type) {
  1460  	case tcpip.ErrorOption:
  1461  		e.lastErrorMu.Lock()
  1462  		err := e.lastError
  1463  		e.lastError = nil
  1464  		e.lastErrorMu.Unlock()
  1465  		return err
  1466  
  1467  	case *tcpip.MaxSegOption:
  1468  		// This is just stubbed out. Linux never returns the user_mss
  1469  		// value as it either returns the defaultMSS or returns the
  1470  		// actual current MSS. Netstack just returns the defaultMSS
  1471  		// always for now.
  1472  		*o = header.TCPDefaultMSS
  1473  		return nil
  1474  
  1475  	case *tcpip.CorkOption:
  1476  		*o = 0
  1477  		if v := atomic.LoadUint32(&e.cork); v != 0 {
  1478  			*o = 1
  1479  		}
  1480  		return nil
  1481  
  1482  	case *tcpip.ReuseAddressOption:
  1483  		e.mu.RLock()
  1484  		v := e.reuseAddr
  1485  		e.mu.RUnlock()
  1486  
  1487  		*o = 0
  1488  		if v {
  1489  			*o = 1
  1490  		}
  1491  		return nil
  1492  
  1493  	case *tcpip.ReusePortOption:
  1494  		e.mu.RLock()
  1495  		v := e.reusePort
  1496  		e.mu.RUnlock()
  1497  
  1498  		*o = 0
  1499  		if v {
  1500  			*o = 1
  1501  		}
  1502  		return nil
  1503  
  1504  	case *tcpip.BindToDeviceOption:
  1505  		e.mu.RLock()
  1506  		defer e.mu.RUnlock()
  1507  		if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
  1508  			*o = tcpip.BindToDeviceOption(nic.Name)
  1509  			return nil
  1510  		}
  1511  		*o = ""
  1512  		return nil
  1513  
  1514  	case *tcpip.QuickAckOption:
  1515  		*o = 1
  1516  		if v := atomic.LoadUint32(&e.slowAck); v != 0 {
  1517  			*o = 0
  1518  		}
  1519  		return nil
  1520  
  1521  	case *tcpip.V6OnlyOption:
  1522  		// We only recognize this option on v6 endpoints.
  1523  		if e.NetProto != header.IPv6ProtocolNumber {
  1524  			return tcpip.ErrUnknownProtocolOption
  1525  		}
  1526  
  1527  		e.mu.Lock()
  1528  		v := e.v6only
  1529  		e.mu.Unlock()
  1530  
  1531  		*o = 0
  1532  		if v {
  1533  			*o = 1
  1534  		}
  1535  		return nil
  1536  
  1537  	case *tcpip.TTLOption:
  1538  		e.mu.Lock()
  1539  		*o = tcpip.TTLOption(e.ttl)
  1540  		e.mu.Unlock()
  1541  		return nil
  1542  
  1543  	case *tcpip.TCPInfoOption:
  1544  		*o = tcpip.TCPInfoOption{}
  1545  		e.mu.RLock()
  1546  		snd := e.snd
  1547  		e.mu.RUnlock()
  1548  		if snd != nil {
  1549  			snd.rtt.Lock()
  1550  			o.RTT = snd.rtt.srtt
  1551  			o.RTTVar = snd.rtt.rttvar
  1552  			snd.rtt.Unlock()
  1553  		}
  1554  		return nil
  1555  
  1556  	case *tcpip.KeepaliveEnabledOption:
  1557  		e.keepalive.Lock()
  1558  		v := e.keepalive.enabled
  1559  		e.keepalive.Unlock()
  1560  
  1561  		*o = 0
  1562  		if v {
  1563  			*o = 1
  1564  		}
  1565  		return nil
  1566  
  1567  	case *tcpip.KeepaliveIdleOption:
  1568  		e.keepalive.Lock()
  1569  		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
  1570  		e.keepalive.Unlock()
  1571  		return nil
  1572  
  1573  	case *tcpip.KeepaliveIntervalOption:
  1574  		e.keepalive.Lock()
  1575  		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
  1576  		e.keepalive.Unlock()
  1577  		return nil
  1578  
  1579  	case *tcpip.KeepaliveCountOption:
  1580  		e.keepalive.Lock()
  1581  		*o = tcpip.KeepaliveCountOption(e.keepalive.count)
  1582  		e.keepalive.Unlock()
  1583  		return nil
  1584  
  1585  	case *tcpip.OutOfBandInlineOption:
  1586  		// We don't currently support disabling this option.
  1587  		*o = 1
  1588  		return nil
  1589  
  1590  	case *tcpip.BroadcastOption:
  1591  		e.mu.Lock()
  1592  		v := e.broadcast
  1593  		e.mu.Unlock()
  1594  
  1595  		*o = 0
  1596  		if v {
  1597  			*o = 1
  1598  		}
  1599  		return nil
  1600  
  1601  	case *tcpip.CongestionControlOption:
  1602  		e.mu.Lock()
  1603  		*o = e.cc
  1604  		e.mu.Unlock()
  1605  		return nil
  1606  
  1607  	case *tcpip.IPv4TOSOption:
  1608  		e.mu.RLock()
  1609  		*o = tcpip.IPv4TOSOption(e.sendTOS)
  1610  		e.mu.RUnlock()
  1611  		return nil
  1612  
  1613  	case *tcpip.IPv6TrafficClassOption:
  1614  		e.mu.RLock()
  1615  		*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
  1616  		e.mu.RUnlock()
  1617  		return nil
  1618  
  1619  	case *tcpip.TCPLingerTimeoutOption:
  1620  		e.mu.Lock()
  1621  		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
  1622  		e.mu.Unlock()
  1623  		return nil
  1624  
  1625  	default:
  1626  		return tcpip.ErrUnknownProtocolOption
  1627  	}
  1628  }
  1629  
  1630  func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
  1631  	netProto := e.NetProto
  1632  	if header.IsV4MappedAddress(addr.Addr) {
  1633  		// Fail if using a v4 mapped address on a v6only endpoint.
  1634  		if e.v6only {
  1635  			return 0, tcpip.ErrNoRoute
  1636  		}
  1637  
  1638  		netProto = header.IPv4ProtocolNumber
  1639  		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
  1640  		if addr.Addr == header.IPv4Any {
  1641  			addr.Addr = ""
  1642  		}
  1643  	}
  1644  
  1645  	// Fail if we're bound to an address length different from the one we're
  1646  	// checking.
  1647  	if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) {
  1648  		return 0, tcpip.ErrInvalidEndpointState
  1649  	}
  1650  
  1651  	return netProto, nil
  1652  }
  1653  
  1654  // Disconnect implements tcpip.Endpoint.Disconnect.
  1655  func (*endpoint) Disconnect() *tcpip.Error {
  1656  	return tcpip.ErrNotSupported
  1657  }
  1658  
  1659  // Connect connects the endpoint to its peer.
  1660  func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
  1661  	err := e.connect(addr, true, true)
  1662  	if err != nil && !err.IgnoreStats() {
  1663  		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  1664  		e.stats.FailedConnectionAttempts.Increment()
  1665  	}
  1666  	return err
  1667  }
  1668  
  1669  // connect connects the endpoint to its peer. In the normal non-S/R case, the
  1670  // new connection is expected to run the main goroutine and perform handshake.
  1671  // In restore of previously connected endpoints, both ends will be passively
  1672  // created (so no new handshaking is done); for stack-accepted connections not
  1673  // yet accepted by the app, they are restored without running the main goroutine
  1674  // here.
  1675  func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error {
  1676  	e.mu.Lock()
  1677  	defer e.mu.Unlock()
  1678  
  1679  	connectingAddr := addr.Addr
  1680  
  1681  	netProto, err := e.checkV4Mapped(&addr)
  1682  	if err != nil {
  1683  		return err
  1684  	}
  1685  
  1686  	if e.state.connected() {
  1687  		// The endpoint is already connected. If caller hasn't been
  1688  		// notified yet, return success.
  1689  		if !e.isConnectNotified {
  1690  			e.isConnectNotified = true
  1691  			return nil
  1692  		}
  1693  		// Otherwise return that it's already connected.
  1694  		return tcpip.ErrAlreadyConnected
  1695  	}
  1696  
  1697  	nicID := addr.NIC
  1698  	switch e.state {
  1699  	case StateBound:
  1700  		// If we're already bound to a NIC but the caller is requesting
  1701  		// that we use a different one now, we cannot proceed.
  1702  		if e.boundNICID == 0 {
  1703  			break
  1704  		}
  1705  
  1706  		if nicID != 0 && nicID != e.boundNICID {
  1707  			return tcpip.ErrNoRoute
  1708  		}
  1709  
  1710  		nicID = e.boundNICID
  1711  
  1712  	case StateInitial:
  1713  		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
  1714  		// when we find a route.
  1715  
  1716  	case StateConnecting, StateSynSent, StateSynRecv:
  1717  		// A connection request has already been issued but hasn't completed
  1718  		// yet.
  1719  		return tcpip.ErrAlreadyConnecting
  1720  
  1721  	case StateError:
  1722  		return e.HardError
  1723  
  1724  	default:
  1725  		return tcpip.ErrInvalidEndpointState
  1726  	}
  1727  
  1728  	// Find a route to the desired destination.
  1729  	r, err := e.stack.FindRoute(nicID, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
  1730  	if err != nil {
  1731  		return err
  1732  	}
  1733  	defer r.Release()
  1734  
  1735  	origID := e.ID
  1736  
  1737  	netProtos := []tcpip.NetworkProtocolNumber{netProto}
  1738  	e.ID.LocalAddress = r.LocalAddress
  1739  	e.ID.RemoteAddress = r.RemoteAddress
  1740  	e.ID.RemotePort = addr.Port
  1741  
  1742  	if e.ID.LocalPort != 0 {
  1743  		// The endpoint is bound to a port, attempt to register it.
  1744  		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice)
  1745  		if err != nil {
  1746  			return err
  1747  		}
  1748  	} else {
  1749  		// The endpoint doesn't have a local port yet, so try to get
  1750  		// one. Make sure that it isn't one that will result in the same
  1751  		// address/port for both local and remote (otherwise this
  1752  		// endpoint would be trying to connect to itself).
  1753  		sameAddr := e.ID.LocalAddress == e.ID.RemoteAddress
  1754  
  1755  		// Calculate a port offset based on the destination IP/port and
  1756  		// src IP to ensure that for a given tuple (srcIP, destIP,
  1757  		// destPort) the offset used as a starting point is the same to
  1758  		// ensure that we can cycle through the port space effectively.
  1759  		h := jenkins.Sum32(e.stack.Seed())
  1760  		h.Write([]byte(e.ID.LocalAddress))
  1761  		h.Write([]byte(e.ID.RemoteAddress))
  1762  		portBuf := make([]byte, 2)
  1763  		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)
  1764  		h.Write(portBuf)
  1765  		portOffset := h.Sum32()
  1766  
  1767  		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, *tcpip.Error) {
  1768  			if sameAddr && p == e.ID.RemotePort {
  1769  				return false, nil
  1770  			}
  1771  			// reusePort is false below because connect cannot reuse a port even if
  1772  			// reusePort was set.
  1773  			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
  1774  				return false, nil
  1775  			}
  1776  
  1777  			id := e.ID
  1778  			id.LocalPort = p
  1779  			switch e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
  1780  			case nil:
  1781  				e.ID = id
  1782  				return true, nil
  1783  			case tcpip.ErrPortInUse:
  1784  				return false, nil
  1785  			default:
  1786  				return false, err
  1787  			}
  1788  		}); err != nil {
  1789  			return err
  1790  		}
  1791  	}
  1792  
  1793  	// Remove the port reservation. This can happen when Bind is called
  1794  	// before Connect: in such a case we don't want to hold on to
  1795  	// reservations anymore.
  1796  	if e.isPortReserved {
  1797  		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.bindToDevice)
  1798  		e.isPortReserved = false
  1799  	}
  1800  
  1801  	e.isRegistered = true
  1802  	e.state = StateConnecting
  1803  	e.route = r.Clone()
  1804  	e.boundNICID = nicID
  1805  	e.effectiveNetProtos = netProtos
  1806  	e.connectingAddress = connectingAddr
  1807  
  1808  	e.initGSO()
  1809  
  1810  	// Connect in the restore phase does not perform handshake. Restore its
  1811  	// connection setting here.
  1812  	if !handshake {
  1813  		e.segmentQueue.mu.Lock()
  1814  		for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
  1815  			for s := l.Front(); s != nil; s = s.Next() {
  1816  				s.id = e.ID
  1817  				s.route = r.Clone()
  1818  				e.sndWaker.Assert()
  1819  			}
  1820  		}
  1821  		e.segmentQueue.mu.Unlock()
  1822  		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
  1823  		e.state = StateEstablished
  1824  		e.stack.Stats().TCP.CurrentEstablished.Increment()
  1825  	}
  1826  
  1827  	if run {
  1828  		e.workerRunning = true
  1829  		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
  1830  		go e.protocolMainLoop(handshake)
  1831  	}
  1832  
  1833  	return tcpip.ErrConnectStarted
  1834  }
  1835  
  1836  // ConnectEndpoint is not supported.
  1837  func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
  1838  	return tcpip.ErrInvalidEndpointState
  1839  }
  1840  
// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
//
// The requested flags are OR-ed into e.shutdownFlags before the state is
// examined, so they are recorded even when the call fails. Behaviour then
// depends on the endpoint state:
//
//   - connected: the read and/or write side is closed as described inline
//     below.
//   - StateListen: if ShutdownRead was requested in this call, the protocol
//     goroutine is notified to close.
//   - anything else: ErrNotConnected is returned.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
	e.mu.Lock()
	e.shutdownFlags |= flags
	// finQueued records whether this call queued the closing segment, in
	// which case the close must be processed after e.mu is released.
	finQueued := false
	switch {
	case e.state.connected():
		// Close for read.
		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
			// Mark read side as closed.
			e.rcvListMu.Lock()
			e.rcvClosed = true
			rcvBufUsed := e.rcvBufUsed
			e.rcvListMu.Unlock()

			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
				e.notifyProtocolGoroutine(notifyReset)
				e.mu.Unlock()
				return nil
			}
		}

		// Close for write.
		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
			e.sndBufMu.Lock()

			if e.sndClosed {
				// Already closed. The break leaves the switch, so e.mu is
				// still released by the common unlock below.
				e.sndBufMu.Unlock()
				break
			}

			// Queue fin segment. It is created from a nil view, i.e. it
			// carries no payload.
			s := newSegmentFromView(&e.route, e.ID, nil)
			e.sndQueue.PushBack(s)
			e.sndBufInQueue++
			finQueued = true
			// Mark endpoint as closed.
			e.sndClosed = true

			e.sndBufMu.Unlock()
		}

	case e.state == StateListen:
		// Tell protocolListenLoop to stop. Note that this checks the flags
		// passed to this call, not the accumulated e.shutdownFlags.
		if flags&tcpip.ShutdownRead != 0 {
			e.notifyProtocolGoroutine(notifyClose)
		}
	default:
		e.mu.Unlock()
		return tcpip.ErrNotConnected
	}
	e.mu.Unlock()
	if finQueued {
		// Process the close inline if the work mutex is free; otherwise
		// whoever holds it is woken up to do so.
		if e.workMu.TryLock() {
			e.handleClose()
			e.workMu.Unlock()
		} else {
			// Tell protocol goroutine to close.
			e.sndCloseWaker.Assert()
		}
	}
	return nil
}
  1908  
  1909  // Listen puts the endpoint in "listen" mode, which allows it to accept
  1910  // new connections.
  1911  func (e *endpoint) Listen(backlog int) *tcpip.Error {
  1912  	err := e.listen(backlog)
  1913  	if err != nil && !err.IgnoreStats() {
  1914  		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  1915  		e.stats.FailedConnectionAttempts.Increment()
  1916  	}
  1917  	return err
  1918  }
  1919  
// listen does the work of Listen with e.mu held for writing.
//
// If the endpoint is already listening (and not being cleaned up) the call
// only resizes the accept queue to the new backlog. Otherwise the endpoint
// must be in StateBound; it is then registered with the stack, moved to
// StateListen and a goroutine running protocolListenLoop is started.
//
// It returns ErrInvalidEndpointState when the queue cannot be shrunk to the
// requested backlog or when the endpoint is not bound, and propagates any
// error from RegisterTransportEndpoint.
func (e *endpoint) listen(backlog int) *tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Allow the backlog to be adjusted if the endpoint is not shutting down.
	// When the endpoint shuts down, it sets workerCleanup to true, and from
	// that point onward, acceptedChan is the responsibility of the cleanup()
	// method (and should not be touched anywhere else, including here).
	if e.state == StateListen && !e.workerCleanup {
		// Adjust the size of the channel iff we can fit existing
		// pending connections into the new one.
		if len(e.acceptedChan) > backlog {
			return tcpip.ErrInvalidEndpointState
		}
		if cap(e.acceptedChan) == backlog {
			return nil
		}
		// Swap in a channel of the new capacity, then close the old one so
		// that ranging over it terminates once its pending endpoints have
		// been moved across.
		origChan := e.acceptedChan
		e.acceptedChan = make(chan *endpoint, backlog)
		close(origChan)
		for ep := range origChan {
			e.acceptedChan <- ep
		}
		return nil
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.state != StateBound {
		// NOTE(review): this increments a ReadErrors counter on a listen
		// failure — confirm that is the intended bucket.
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return tcpip.ErrInvalidEndpointState
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.state = StateListen
	// An existing channel is kept as is; a new one is only created when none
	// exists yet.
	if e.acceptedChan == nil {
		e.acceptedChan = make(chan *endpoint, backlog)
	}
	e.workerRunning = true

	// The listen loop is handed the currently available receive buffer space.
	go e.protocolListenLoop(
		seqnum.Size(e.receiveBufferAvailable()))

	return nil
}
  1969  
  1970  // startAcceptedLoop sets up required state and starts a goroutine with the
  1971  // main loop for accepted connections.
  1972  func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
  1973  	e.waiterQueue = waiterQueue
  1974  	e.workerRunning = true
  1975  	go e.protocolMainLoop(false)
  1976  }
  1977  
  1978  // Accept returns a new endpoint if a peer has established a connection
  1979  // to an endpoint previously set to listen mode.
  1980  func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
  1981  	e.mu.RLock()
  1982  	defer e.mu.RUnlock()
  1983  
  1984  	// Endpoint must be in listen state before it can accept connections.
  1985  	if e.state != StateListen {
  1986  		return nil, nil, tcpip.ErrInvalidEndpointState
  1987  	}
  1988  
  1989  	// Get the new accepted endpoint.
  1990  	var n *endpoint
  1991  	select {
  1992  	case n = <-e.acceptedChan:
  1993  	default:
  1994  		return nil, nil, tcpip.ErrWouldBlock
  1995  	}
  1996  
  1997  	return n, n.waiterQueue, nil
  1998  }
  1999  
// Bind binds the endpoint to a specific local port and optionally address.
//
// It is only permitted in StateInitial; otherwise ErrAlreadyBound is returned.
// On success the port is reserved with the stack, the endpoint's local
// port/address and NIC are recorded and the state becomes StateBound. If a
// failure occurs after the port has been reserved, the reservation and the
// fields set so far are rolled back by the deferred cleanup.
//
// err is a named result so that the deferred cleanup can observe the value
// being returned.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.state != StateInitial {
		return tcpip.ErrAlreadyBound
	}

	// Record the requested address before it is validated.
	// NOTE(review): BindAddr is not cleared on the failure paths below —
	// confirm that is intended.
	e.BindAddr = addr.Addr
	// addr is passed by pointer, so checkV4Mapped may adjust it in place
	// (presumably to unwrap a v4-mapped address — verify against its
	// definition).
	netProto, err := e.checkV4Mapped(&addr)
	if err != nil {
		return err
	}

	// Expand netProtos to include v4 and v6 if the caller is binding to a
	// wildcard (empty) address, and this is an IPv6 endpoint with v6only
	// set to false.
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
		netProtos = []tcpip.NetworkProtocolNumber{
			header.IPv6ProtocolNumber,
			header.IPv4ProtocolNumber,
		}
	}

	// The port returned by the stack is the one used from here on; it is
	// what gets stored in e.ID.LocalPort.
	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
	if err != nil {
		return err
	}

	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.ID.LocalPort = port

	// Any failures beyond this point must remove the port registration.
	// bindToDevice is captured by value at this point so the release uses
	// the same device the reservation was made with.
	defer func(bindToDevice tcpip.NICID) {
		if err != nil {
			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
			e.isPortReserved = false
			e.effectiveNetProtos = nil
			e.ID.LocalPort = 0
			e.ID.LocalAddress = ""
			e.boundNICID = 0
		}
	}(e.bindToDevice)

	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if len(addr.Addr) != 0 {
		// A zero NIC ID from CheckLocalAddress is treated as "not a local
		// address".
		nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return tcpip.ErrBadLocalAddress
		}

		e.boundNICID = nic
		e.ID.LocalAddress = addr.Addr
	}

	// Mark endpoint as bound.
	e.state = StateBound

	return nil
}
  2067  
  2068  // GetLocalAddress returns the address to which the endpoint is bound.
  2069  func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
  2070  	e.mu.RLock()
  2071  	defer e.mu.RUnlock()
  2072  
  2073  	return tcpip.FullAddress{
  2074  		Addr: e.ID.LocalAddress,
  2075  		Port: e.ID.LocalPort,
  2076  		NIC:  e.boundNICID,
  2077  	}, nil
  2078  }
  2079  
  2080  // GetRemoteAddress returns the address to which the endpoint is connected.
  2081  func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
  2082  	e.mu.RLock()
  2083  	defer e.mu.RUnlock()
  2084  
  2085  	if !e.state.connected() {
  2086  		return tcpip.FullAddress{}, tcpip.ErrNotConnected
  2087  	}
  2088  
  2089  	return tcpip.FullAddress{
  2090  		Addr: e.ID.RemoteAddress,
  2091  		Port: e.ID.RemotePort,
  2092  		NIC:  e.boundNICID,
  2093  	}, nil
  2094  }
  2095  
  2096  // HandlePacket is called by the stack when new packets arrive to this transport
  2097  // endpoint.
  2098  func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
  2099  	s := newSegment(r, id, pkt)
  2100  	if !s.parse() {
  2101  		e.stack.Stats().MalformedRcvdPackets.Increment()
  2102  		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
  2103  		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
  2104  		s.decRef()
  2105  		return
  2106  	}
  2107  
  2108  	if !s.csumValid {
  2109  		e.stack.Stats().MalformedRcvdPackets.Increment()
  2110  		e.stack.Stats().TCP.ChecksumErrors.Increment()
  2111  		e.stats.ReceiveErrors.ChecksumErrors.Increment()
  2112  		s.decRef()
  2113  		return
  2114  	}
  2115  
  2116  	e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
  2117  	e.stats.SegmentsReceived.Increment()
  2118  	if (s.flags & header.TCPFlagRst) != 0 {
  2119  		e.stack.Stats().TCP.ResetsReceived.Increment()
  2120  	}
  2121  
  2122  	e.enqueueSegment(s)
  2123  }
  2124  
  2125  func (e *endpoint) enqueueSegment(s *segment) {
  2126  	// Send packet to worker goroutine.
  2127  	if e.segmentQueue.enqueue(s) {
  2128  		e.newSegmentWaker.Assert()
  2129  	} else {
  2130  		// The queue is full, so we drop the segment.
  2131  		e.stack.Stats().DroppedPackets.Increment()
  2132  		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
  2133  		s.decRef()
  2134  	}
  2135  }
  2136  
  2137  // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
  2138  func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
  2139  	switch typ {
  2140  	case stack.ControlPacketTooBig:
  2141  		e.sndBufMu.Lock()
  2142  		e.packetTooBigCount++
  2143  		if v := int(extra); v < e.sndMTU {
  2144  			e.sndMTU = v
  2145  		}
  2146  		e.sndBufMu.Unlock()
  2147  
  2148  		e.notifyProtocolGoroutine(notifyMTUChanged)
  2149  	}
  2150  }
  2151  
  2152  // updateSndBufferUsage is called by the protocol goroutine when room opens up
  2153  // in the send buffer. The number of newly available bytes is v.
  2154  func (e *endpoint) updateSndBufferUsage(v int) {
  2155  	e.sndBufMu.Lock()
  2156  	notify := e.sndBufUsed >= e.sndBufSize>>1
  2157  	e.sndBufUsed -= v
  2158  	// We only notify when there is half the sndBufSize available after
  2159  	// a full buffer event occurs. This ensures that we don't wake up
  2160  	// writers to queue just 1-2 segments and go back to sleep.
  2161  	notify = notify && e.sndBufUsed < e.sndBufSize>>1
  2162  	e.sndBufMu.Unlock()
  2163  
  2164  	if notify {
  2165  		e.waiterQueue.Notify(waiter.EventOut)
  2166  	}
  2167  }
  2168  
  2169  // readyToRead is called by the protocol goroutine when a new segment is ready
  2170  // to be read, or when the connection is closed for receiving (in which case
  2171  // s will be nil).
  2172  func (e *endpoint) readyToRead(s *segment) {
  2173  	e.rcvListMu.Lock()
  2174  	if s != nil {
  2175  		s.incRef()
  2176  		e.rcvBufUsed += s.data.Size()
  2177  		// Check if the receive window is now closed. If so make sure
  2178  		// we set the zero window before we deliver the segment to ensure
  2179  		// that a subsequent read of the segment will correctly trigger
  2180  		// a non-zero notification.
  2181  		if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
  2182  			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
  2183  			e.zeroWindow = true
  2184  		}
  2185  		e.rcvList.PushBack(s)
  2186  	} else {
  2187  		e.rcvClosed = true
  2188  	}
  2189  	e.rcvListMu.Unlock()
  2190  
  2191  	e.waiterQueue.Notify(waiter.EventIn)
  2192  }
  2193  
  2194  // receiveBufferAvailableLocked calculates how many bytes are still available
  2195  // in the receive buffer.
  2196  // rcvListMu must be held when this function is called.
  2197  func (e *endpoint) receiveBufferAvailableLocked() int {
  2198  	// We may use more bytes than the buffer size when the receive buffer
  2199  	// shrinks.
  2200  	if e.rcvBufUsed >= e.rcvBufSize {
  2201  		return 0
  2202  	}
  2203  
  2204  	return e.rcvBufSize - e.rcvBufUsed
  2205  }
  2206  
  2207  // receiveBufferAvailable calculates how many bytes are still available in the
  2208  // receive buffer.
  2209  func (e *endpoint) receiveBufferAvailable() int {
  2210  	e.rcvListMu.Lock()
  2211  	available := e.receiveBufferAvailableLocked()
  2212  	e.rcvListMu.Unlock()
  2213  	return available
  2214  }
  2215  
  2216  func (e *endpoint) receiveBufferSize() int {
  2217  	e.rcvListMu.Lock()
  2218  	size := e.rcvBufSize
  2219  	e.rcvListMu.Unlock()
  2220  
  2221  	return size
  2222  }
  2223  
  2224  func (e *endpoint) maxReceiveBufferSize() int {
  2225  	var rs ReceiveBufferSizeOption
  2226  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
  2227  		// As a fallback return the hardcoded max buffer size.
  2228  		return MaxBufferSize
  2229  	}
  2230  	return rs.Max
  2231  }
  2232  
  2233  // rcvWndScaleForHandshake computes the receive window scale to offer to the
  2234  // peer when window scaling is enabled (true by default). If auto-tuning is
  2235  // disabled then the window scaling factor is based on the size of the
  2236  // receiveBuffer otherwise we use the max permissible receive buffer size to
  2237  // compute the scale.
  2238  func (e *endpoint) rcvWndScaleForHandshake() int {
  2239  	bufSizeForScale := e.receiveBufferSize()
  2240  
  2241  	e.rcvListMu.Lock()
  2242  	autoTuningDisabled := e.rcvAutoParams.disabled
  2243  	e.rcvListMu.Unlock()
  2244  	if autoTuningDisabled {
  2245  		return FindWndScale(seqnum.Size(bufSizeForScale))
  2246  	}
  2247  
  2248  	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
  2249  }
  2250  
  2251  // updateRecentTimestamp updates the recent timestamp using the algorithm
  2252  // described in https://tools.ietf.org/html/rfc7323#section-4.3
  2253  func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
  2254  	if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
  2255  		e.recentTS = tsVal
  2256  	}
  2257  }
  2258  
  2259  // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
  2260  // the SYN options indicate that timestamp option was negotiated. It also
  2261  // initializes the recentTS with the value provided in synOpts.TSval.
  2262  func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
  2263  	if synOpts.TS {
  2264  		e.sendTSOk = true
  2265  		e.recentTS = synOpts.TSVal
  2266  	}
  2267  }
  2268  
  2269  // timestamp returns the timestamp value to be used in the TSVal field of the
  2270  // timestamp option for outgoing TCP segments for a given endpoint.
  2271  func (e *endpoint) timestamp() uint32 {
  2272  	return tcpTimeStamp(e.tsOffset)
  2273  }
  2274  
  2275  // tcpTimeStamp returns a timestamp offset by the provided offset. This is
  2276  // not inlined above as it's used when SYN cookies are in use and endpoint
  2277  // is not created at the time when the SYN cookie is sent.
  2278  func tcpTimeStamp(offset uint32) uint32 {
  2279  	now := time.Now()
  2280  	return uint32(now.Unix()*1000+int64(now.Nanosecond()/1e6)) + offset
  2281  }
  2282  
  2283  // timeStampOffset returns a randomized timestamp offset to be used when sending
  2284  // timestamp values in a timestamp option for a TCP segment.
  2285  func timeStampOffset() uint32 {
  2286  	b := make([]byte, 4)
  2287  	if _, err := rand.Read(b); err != nil {
  2288  		panic(err)
  2289  	}
  2290  	// Initialize a random tsOffset that will be added to the recentTS
  2291  	// everytime the timestamp is sent when the Timestamp option is enabled.
  2292  	//
  2293  	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
  2294  	// why this is required.
  2295  	//
  2296  	// NOTE: This is not completely to spec as normally this should be
  2297  	// initialized in a manner analogous to how sequence numbers are
  2298  	// randomized per connection basis. But for now this is sufficient.
  2299  	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
  2300  }
  2301  
  2302  // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
  2303  // if the SYN options indicate that the SACK option was negotiated and the TCP
  2304  // stack is configured to enable TCP SACK option.
  2305  func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
  2306  	var v SACKEnabled
  2307  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
  2308  		// Stack doesn't support SACK. So just return.
  2309  		return
  2310  	}
  2311  	if bool(v) && synOpts.SACKPermitted {
  2312  		e.sackPermitted = true
  2313  	}
  2314  }
  2315  
  2316  // maxOptionSize return the maximum size of TCP options.
  2317  func (e *endpoint) maxOptionSize() (size int) {
  2318  	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
  2319  	options := e.makeOptions(maxSackBlocks[:])
  2320  	size = len(options)
  2321  	putOptions(options)
  2322  
  2323  	return size
  2324  }
  2325  
// completeState makes a full copy of the endpoint and returns it. This is used
// before invoking the probe. The state returned may not be fully consistent if
// there are intervening syscalls when the state is being copied.
//
// The snapshot is assembled in sections; each lock (e.mu, rcvListMu, sndBufMu,
// snd.rtt) is held only while its own section is copied, and several sections
// are read without any lock being taken in this function.
func (e *endpoint) completeState() stack.TCPEndpointState {
	var s stack.TCPEndpointState
	// Timestamp the snapshot with the time it was taken.
	s.SegTime = time.Now()

	// Copy EndpointID.
	e.mu.Lock()
	s.ID = stack.TCPEndpointID(e.ID)
	e.mu.Unlock()

	// Copy endpoint rcv state.
	e.rcvListMu.Lock()
	s.RcvBufSize = e.rcvBufSize
	s.RcvBufUsed = e.rcvBufUsed
	s.RcvClosed = e.rcvClosed
	s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
	s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
	s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
	s.RcvAutoParams.RTT = e.rcvAutoParams.rtt
	s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
	s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
	s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
	e.rcvListMu.Unlock()

	// Endpoint TCP Option state. These fields are read without taking a lock
	// here.
	s.SendTSOk = e.sendTSOk
	s.RecentTS = e.recentTS
	s.TSOffset = e.tsOffset
	s.SACKPermitted = e.sackPermitted
	// Only the first NumBlocks entries of the SACK block array are in use;
	// copy them into a freshly allocated slice.
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// Copy endpoint send state.
	e.sndBufMu.Lock()
	s.SndBufSize = e.sndBufSize
	s.SndBufUsed = e.sndBufUsed
	s.SndClosed = e.sndClosed
	s.SndBufInQueue = e.sndBufInQueue
	s.PacketTooBigCount = e.packetTooBigCount
	s.SndMTU = e.sndMTU
	e.sndBufMu.Unlock()

	// Copy receiver state. No lock is taken here for e.rcv.
	s.Receiver = stack.TCPReceiverState{
		RcvNxt:         e.rcv.rcvNxt,
		RcvAcc:         e.rcv.rcvAcc,
		RcvWndScale:    e.rcv.rcvWndScale,
		PendingBufUsed: e.rcv.pendingBufUsed,
		PendingBufSize: e.rcv.pendingBufSize,
	}

	// Copy sender state. No lock is taken here for e.snd, apart from the RTT
	// fields which are copied separately below.
	s.Sender = stack.TCPSenderState{
		LastSendTime: e.snd.lastSendTime,
		DupAckCount:  e.snd.dupAckCount,
		FastRecovery: stack.TCPFastRecoveryState{
			Active:    e.snd.fr.active,
			First:     e.snd.fr.first,
			Last:      e.snd.fr.last,
			MaxCwnd:   e.snd.fr.maxCwnd,
			HighRxt:   e.snd.fr.highRxt,
			RescueRxt: e.snd.fr.rescueRxt,
		},
		SndCwnd:          e.snd.sndCwnd,
		Ssthresh:         e.snd.sndSsthresh,
		SndCAAckCount:    e.snd.sndCAAckCount,
		Outstanding:      e.snd.outstanding,
		SndWnd:           e.snd.sndWnd,
		SndUna:           e.snd.sndUna,
		SndNxt:           e.snd.sndNxt,
		RTTMeasureSeqNum: e.snd.rttMeasureSeqNum,
		RTTMeasureTime:   e.snd.rttMeasureTime,
		Closed:           e.snd.closed,
		RTO:              e.snd.rto,
		MaxPayloadSize:   e.snd.maxPayloadSize,
		SndWndScale:      e.snd.sndWndScale,
		MaxSentAck:       e.snd.maxSentAck,
	}
	// The smoothed RTT fields have their own lock.
	e.snd.rtt.Lock()
	s.Sender.SRTT = e.snd.rtt.srtt
	s.Sender.SRTTInited = e.snd.rtt.srttInited
	e.snd.rtt.Unlock()

	// Cubic state is only exported when the sender's congestion control
	// implementation is a *cubicState; otherwise s.Sender.Cubic keeps its
	// zero value.
	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = stack.TCPCubicState{
			WMax:                    cubic.wMax,
			WLastMax:                cubic.wLastMax,
			T:                       cubic.t,
			TimeSinceLastCongestion: time.Since(cubic.t),
			C:                       cubic.c,
			K:                       cubic.k,
			Beta:                    cubic.beta,
			WC:                      cubic.wC,
			WEst:                    cubic.wEst,
		}
	}
	return s
}
  2427  
  2428  func (e *endpoint) initHardwareGSO() {
  2429  	gso := &stack.GSO{}
  2430  	switch e.route.NetProto {
  2431  	case header.IPv4ProtocolNumber:
  2432  		gso.Type = stack.GSOTCPv4
  2433  		gso.L3HdrLen = header.IPv4MinimumSize
  2434  	case header.IPv6ProtocolNumber:
  2435  		gso.Type = stack.GSOTCPv6
  2436  		gso.L3HdrLen = header.IPv6MinimumSize
  2437  	default:
  2438  		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
  2439  	}
  2440  	gso.NeedsCsum = true
  2441  	gso.CsumOffset = header.TCPChecksumOffset
  2442  	gso.MaxSize = e.route.GSOMaxSize()
  2443  	e.gso = gso
  2444  }
  2445  
  2446  func (e *endpoint) initGSO() {
  2447  	if e.route.Capabilities()&stack.CapabilityHardwareGSO != 0 {
  2448  		e.initHardwareGSO()
  2449  	} else if e.route.Capabilities()&stack.CapabilitySoftwareGSO != 0 {
  2450  		e.gso = &stack.GSO{
  2451  			MaxSize:   e.route.GSOMaxSize(),
  2452  			Type:      stack.GSOSW,
  2453  			NeedsCsum: false,
  2454  		}
  2455  	}
  2456  }
  2457  
  2458  // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
  2459  // state for diagnostics.
  2460  func (e *endpoint) State() uint32 {
  2461  	e.mu.Lock()
  2462  	defer e.mu.Unlock()
  2463  	return uint32(e.state)
  2464  }
  2465  
  2466  // Info returns a copy of the endpoint info.
  2467  func (e *endpoint) Info() tcpip.EndpointInfo {
  2468  	e.mu.RLock()
  2469  	// Make a copy of the endpoint info.
  2470  	ret := e.EndpointInfo
  2471  	e.mu.RUnlock()
  2472  	return &ret
  2473  }
  2474  
  2475  // Stats returns a pointer to the endpoint stats.
  2476  func (e *endpoint) Stats() tcpip.EndpointStats {
  2477  	return &e.stats
  2478  }
  2479  
  2480  // Wait implements stack.TransportEndpoint.Wait.
  2481  func (e *endpoint) Wait() {
  2482  	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
  2483  	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
  2484  	defer e.waiterQueue.EventUnregister(&waitEntry)
  2485  	for {
  2486  		e.mu.Lock()
  2487  		running := e.workerRunning
  2488  		e.mu.Unlock()
  2489  		if !running {
  2490  			break
  2491  		}
  2492  		<-notifyCh
  2493  	}
  2494  }
  2495  
  2496  func mssForRoute(r *stack.Route) uint16 {
  2497  	// TODO(b/143359391): Respect TCP Min and Max size.
  2498  	return uint16(r.MTU() - header.TCPMinimumSize)
  2499  }