github.com/flowerwrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/transport/tcp/endpoint.go

github.com/flowerwrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/transport/tcp/endpoint.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"encoding/binary"
    19  	"fmt"
    20  	"math"
    21  	"strings"
    22  	"sync"
    23  	"sync/atomic"
    24  	"time"
    25  
    26  	"github.com/FlowerWrong/netstack/rand"
    27  	"github.com/FlowerWrong/netstack/sleep"
    28  	"github.com/FlowerWrong/netstack/tcpip"
    29  	"github.com/FlowerWrong/netstack/tcpip/buffer"
    30  	"github.com/FlowerWrong/netstack/tcpip/hash/jenkins"
    31  	"github.com/FlowerWrong/netstack/tcpip/header"
    32  	"github.com/FlowerWrong/netstack/tcpip/iptables"
    33  	"github.com/FlowerWrong/netstack/tcpip/seqnum"
    34  	"github.com/FlowerWrong/netstack/tcpip/stack"
    35  	"github.com/FlowerWrong/netstack/tmutex"
    36  	"github.com/FlowerWrong/netstack/waiter"
    37  )
    38  
    39  // EndpointState represents the state of a TCP endpoint.
    40  type EndpointState uint32
    41  
    42  // Endpoint states. Note that are represented in a netstack-specific manner and
    43  // may not be meaningful externally. Specifically, they need to be translated to
    44  // Linux's representation for these states if presented to userspace.
    45  const (
    46  	// Endpoint states internal to netstack. These map to the TCP state CLOSED.
    47  	StateInitial EndpointState = iota
    48  	StateBound
    49  	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
    50  	StateError
    51  
    52  	// TCP protocol states.
    53  	StateEstablished
    54  	StateSynSent
    55  	StateSynRecv
    56  	StateFinWait1
    57  	StateFinWait2
    58  	StateTimeWait
    59  	StateClose
    60  	StateCloseWait
    61  	StateLastAck
    62  	StateListen
    63  	StateClosing
    64  )
    65  
    66  // connected is the set of states where an endpoint is connected to a peer.
    67  func (s EndpointState) connected() bool {
    68  	switch s {
    69  	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
    70  		return true
    71  	default:
    72  		return false
    73  	}
    74  }
    75  
    76  // String implements fmt.Stringer.String.
    77  func (s EndpointState) String() string {
    78  	switch s {
    79  	case StateInitial:
    80  		return "INITIAL"
    81  	case StateBound:
    82  		return "BOUND"
    83  	case StateConnecting:
    84  		return "CONNECTING"
    85  	case StateError:
    86  		return "ERROR"
    87  	case StateEstablished:
    88  		return "ESTABLISHED"
    89  	case StateSynSent:
    90  		return "SYN-SENT"
    91  	case StateSynRecv:
    92  		return "SYN-RCVD"
    93  	case StateFinWait1:
    94  		return "FIN-WAIT1"
    95  	case StateFinWait2:
    96  		return "FIN-WAIT2"
    97  	case StateTimeWait:
    98  		return "TIME-WAIT"
    99  	case StateClose:
   100  		return "CLOSED"
   101  	case StateCloseWait:
   102  		return "CLOSE-WAIT"
   103  	case StateLastAck:
   104  		return "LAST-ACK"
   105  	case StateListen:
   106  		return "LISTEN"
   107  	case StateClosing:
   108  		return "CLOSING"
   109  	default:
   110  		panic("unreachable")
   111  	}
   112  }
   113  
   114  // Reasons for notifying the protocol goroutine. Each reason is a distinct
        // bit, so several of them can be OR-ed together in endpoint.notifyFlags.
   115  const (
   116  	notifyNonZeroReceiveWindow = 1 << iota // raised by readLocked once a read reopens a zero window
   117  	notifyReceiveWindowChanged // raised by ModerateRecvBuf after it grows rcvBufSize
   118  	notifyClose // raised by Close when a running worker has to perform the cleanup
   119  	notifyMTUChanged
   120  	notifyDrain
   121  	notifyReset
   122  	notifyKeepaliveChanged
   123  	notifyMSSChanged
   124  )
   125  
   126  // SACKInfo holds TCP SACK related information for a given endpoint.
   127  //
   128  // +stateify savable
   129  type SACKInfo struct {
   130  	// Blocks stores the SACK blocks tracked for the endpoint; its
   131  	// capacity is fixed at MaxSACKBlocks entries.
   132  	Blocks [MaxSACKBlocks]header.SACKBlock
   133  
   134  	// NumBlocks is the number of valid SACK blocks stored in the
   135  	// Blocks array above.
   136  	NumBlocks int
   137  }
   138  
   139  // rcvBufAutoTuneParams holds the state variables used to compute
   140  // the auto tuned recv buffer size (see endpoint.ModerateRecvBuf).
   141  //
   142  // +stateify savable
   143  type rcvBufAutoTuneParams struct {
   144  	// measureTime is the time at which the current measurement
   145  	// was started.
   146  	measureTime time.Time
   147  
   148  	// copied is the number of bytes copied out of the receive
   149  	// buffers since this measure began. It is reset to zero whenever
        	// ModerateRecvBuf starts a new measurement.
   150  	copied int
   151  
   152  	// prevCopied is the number of bytes copied out of the receive
   153  	// buffers in the previous RTT period. ModerateRecvBuf only raises it,
        	// i.e. when more was copied in the latest period than is recorded here.
   154  	prevCopied int
   155  
   156  	// rtt is the non-smoothed minimum RTT as measured by observing the time
   157  	// between when a byte is first acknowledged and the receipt of data
   158  	// that is at least one window beyond the sequence number that was
   159  	// acknowledged. While it is zero ModerateRecvBuf only accumulates copied.
   160  	rtt time.Duration
   161  
   162  	// rttMeasureSeqNumber is the highest acceptable sequence number at the
   163  	// time this RTT measurement period began.
   164  	rttMeasureSeqNumber seqnum.Value
   165  
   166  	// rttMeasureTime is the absolute time at which the current rtt
   167  	// measurement period began.
   168  	rttMeasureTime time.Time
   169  
   170  	// disabled is true if an explicit receive buffer is set for the
   171  	// endpoint. newEndpoint also sets it when the stack's
        	// ModerateReceiveBufferOption is false.
   172  	disabled bool
   173  }
   174  
   175  // endpoint represents a TCP endpoint. This struct serves as the interface
   176  // between users of the endpoint and the protocol implementation; it is legal to
   177  // have concurrent goroutines make calls into the endpoint, they are properly
   178  // synchronized. The protocol implementation, however, runs in a single
   179  // goroutine.
   180  //
   181  // +stateify savable
   182  type endpoint struct {
   183  	// workMu is used to arbitrate which goroutine may perform protocol
   184  	// work. Only the main protocol goroutine is expected to call Lock() on
   185  	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
   186  	// perform work without having to wait for the main one to wake up.
   187  	workMu tmutex.Mutex
   188  
   189  	// The following fields are initialized at creation time and do not
   190  	// change throughout the lifetime of the endpoint.
   191  	stack       *stack.Stack
   192  	netProto    tcpip.NetworkProtocolNumber
   193  	waiterQueue *waiter.Queue
   194  
   195  	// lastError represents the last error that the endpoint reported;
   196  	// access to it is protected by the following mutex.
   197  	lastErrorMu sync.Mutex
   198  	lastError   *tcpip.Error
   199  
   200  	// The following fields are used to manage the receive queue. The
   201  	// protocol goroutine adds ready-for-delivery segments to rcvList,
   202  	// which are returned by Read() calls to users.
   203  	//
   204  	// Once the peer has closed its send side, rcvClosed is set to true
   205  	// to indicate to users that no more data is coming.
   206  	//
   207  	// rcvListMu can be taken after the endpoint mu below.
   208  	rcvListMu     sync.Mutex
   209  	rcvList       segmentList
   210  	rcvClosed     bool
   211  	rcvBufSize    int
   212  	rcvBufUsed    int
   213  	rcvAutoParams rcvBufAutoTuneParams
   214  	// zeroWindow indicates that the window was closed due to receive buffer
   215  	// space being filled up. This is set by the worker goroutine before
   216  	// moving a segment to the rcvList. This setting is cleared by the
   217  	// endpoint when a Read() call reads enough data for the new window to
   218  	// be non-zero.
   219  	zeroWindow bool
   220  
   221  	// The following fields are protected by mu.
   222  	mu sync.RWMutex
   223  	id stack.TransportEndpointID
   224  
   225  	state EndpointState // current state of the endpoint
   226  
   227  	isPortReserved    bool
   228  	isRegistered      bool
   229  	boundNICID        tcpip.NICID
   230  	route             stack.Route
   231  	ttl               uint8
   232  	v6only            bool
   233  	isConnectNotified bool
   234  	// TCP should never broadcast but Linux nevertheless supports enabling/
   235  	// disabling SO_BROADCAST, albeit as a NOOP.
   236  	broadcast bool
   237  
   238  	// effectiveNetProtos contains the network protocols actually in use. In
   239  	// most cases it will only contain "netProto", but in cases like IPv6
   240  	// endpoints with v6only set to false, this could include multiple
   241  	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
   242  	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
   243  	// address).
   244  	effectiveNetProtos []tcpip.NetworkProtocolNumber
   245  
   246  	// hardError is meaningful only when state is StateError, it stores the
   247  	// error to be returned when read/write syscalls are called and the
   248  	// endpoint is in this state. hardError is protected by mu.
   249  	hardError *tcpip.Error
   250  
   251  	// workerRunning specifies if a worker goroutine is running.
   252  	workerRunning bool
   253  
   254  	// workerCleanup specifies if the worker goroutine must perform cleanup
   255  	// before exiting. This can only be set to true when workerRunning is
   256  	// also true, and they're both protected by mu.
   257  	workerCleanup bool
   258  
   259  	// sendTSOk is used to indicate when the TS Option has been negotiated.
   260  	// When sendTSOk is true every non-RST segment should carry a TS as per
   261  	// RFC7323#section-1.1
   262  	sendTSOk bool
   263  
   264  	// recentTS is the timestamp that should be sent in the TSEcr field of
   265  	// the timestamp for future segments sent by the endpoint. This field is
   266  	// updated if required when a new segment is received by this endpoint.
   267  	recentTS uint32
   268  
   269  	// tsOffset is a randomized offset added to the value of the
   270  	// TSVal field in the timestamp option.
   271  	tsOffset uint32
   272  
   273  	// shutdownFlags represent the current shutdown state of the endpoint.
   274  	shutdownFlags tcpip.ShutdownFlags
   275  
   276  	// sackPermitted is set to true if the peer sends the TCPSACKPermitted
   277  	// option in the SYN/SYN-ACK.
   278  	sackPermitted bool
   279  
   280  	// sack holds TCP SACK related information for this endpoint.
   281  	sack SACKInfo
   282  
   283  	// reusePort is set to true if SO_REUSEPORT is enabled.
   284  	reusePort bool
   285  
   286  	// bindToDevice is set to the NIC on which to bind or disabled if 0.
   287  	bindToDevice tcpip.NICID
   288  
   289  	// delay enables Nagle's algorithm.
   290  	//
   291  	// delay is a boolean (0 is false) and must be accessed atomically.
   292  	delay uint32
   293  
   294  	// cork holds back segments until full.
   295  	//
   296  	// cork is a boolean (0 is false) and must be accessed atomically.
   297  	cork uint32
   298  
   299  	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
   300  	scoreboard *SACKScoreboard
   301  
   302  	// The options below aren't implemented, but we remember the user
   303  	// settings because applications expect to be able to set/query these
   304  	// options.
   305  	reuseAddr bool
   306  
   307  	// slowAck holds the negated state of quick ack. It is stubbed out and
   308  	// does nothing.
   309  	//
   310  	// slowAck is a boolean (0 is false) and must be accessed atomically.
   311  	slowAck uint32
   312  
   313  	// segmentQueue is used to hand received segments to the protocol
   314  	// goroutine. Segments are queued as long as the queue is not full,
   315  	// and dropped when it is.
   316  	segmentQueue segmentQueue
   317  
   318  	// synRcvdCount is the number of connections for this endpoint that are
   319  	// in SYN-RCVD state.
   320  	synRcvdCount int
   321  
   322  	// userMSS if non-zero is the MSS value explicitly set by the user
   323  	// for this endpoint using the TCP_MAXSEG setsockopt.
   324  	userMSS int
   325  
   326  	// The following fields are used to manage the send buffer. When
   327  	// segments are ready to be sent, they are added to sndQueue and the
   328  	// protocol goroutine is signaled via sndWaker.
   329  	//
   330  	// When the send side is closed, the protocol goroutine is notified via
   331  	// sndCloseWaker, and sndClosed is set to true.
   332  	sndBufMu      sync.Mutex
   333  	sndBufSize    int
   334  	sndBufUsed    int
   335  	sndClosed     bool
   336  	sndBufInQueue seqnum.Size
   337  	sndQueue      segmentList
   338  	sndWaker      sleep.Waker
   339  	sndCloseWaker sleep.Waker
   340  
   341  	// cc stores the name of the Congestion Control algorithm to use for
   342  	// this endpoint.
   343  	cc tcpip.CongestionControlOption
   344  
   345  	// The following are used when a "packet too big" control packet is
   346  	// received. They are protected by sndBufMu. They are used to
   347  	// communicate to the main protocol goroutine how many such control
   348  	// messages have been received since the last notification was processed
   349  	// and what was the smallest MTU seen.
   350  	packetTooBigCount int
   351  	sndMTU            int
   352  
   353  	// newSegmentWaker is used to indicate to the protocol goroutine that
   354  	// it needs to wake up and handle new segments queued to it.
   355  	newSegmentWaker sleep.Waker
   356  
   357  	// notificationWaker is used to indicate to the protocol goroutine that
   358  	// it needs to wake up and check for notifications.
   359  	notificationWaker sleep.Waker
   360  
   361  	// notifyFlags is a bitmask of the notify* flags used to tell the protocol
   362  	// goroutine why it was notified; this is only accessed atomically.
   363  	notifyFlags uint32
   364  
   365  	// keepalive manages TCP keepalive state. When the connection is idle
   366  	// (no data sent or received) for keepalive.idle, we start sending
   367  	// keepalives every keepalive.interval. If we send keepalive.count
   368  	// without hearing a response, the connection is closed.
   369  	keepalive keepalive
   370  
   371  	// pendingAccepted is a synchronization primitive used to track number
   372  	// of connections that are queued up to be delivered to the accepted
   373  	// channel. We use this to ensure that all goroutines blocked on writing
   374  	// to the acceptedChan below terminate before we close acceptedChan.
   375  	pendingAccepted sync.WaitGroup
   376  
   377  	// acceptedChan is used by a listening endpoint protocol goroutine to
   378  	// send newly accepted connections to the endpoint so that they can be
   379  	// read by Accept() calls.
   380  	acceptedChan chan *endpoint
   381  
   382  	// The following are only used from the protocol goroutine, and
   383  	// therefore don't need locks to protect them.
   384  	rcv *receiver
   385  	snd *sender
   386  
   387  	// The goroutine drain completion notification channel.
   388  	drainDone chan struct{}
   389  
   390  	// The goroutine undrain notification channel. This is currently used as
   391  	// a way to block the worker goroutines. Today nothing closes/writes
   392  	// this channel and this causes any goroutines waiting on this to just
   393  	// block. This is used during save/restore to prevent worker goroutines
   394  	// from mutating state as it's being saved.
   395  	undrain chan struct{}
   396  
   397  	// probe if not nil is invoked on every received segment. It is passed
   398  	// a copy of the current state of the endpoint.
   399  	probe stack.TCPProbeFunc
   400  
   401  	// The following are only used to assist the restore run to re-connect.
   402  	bindAddress       tcpip.Address
   403  	connectingAddress tcpip.Address
   404  
   405  	// amss is the advertised MSS to the peer by this endpoint.
   406  	amss uint16
   407  
   408  	gso *stack.GSO // NOTE(review): presumably generic segmentation offload settings — confirm against users of gso
   409  }
   410  
   411  // StopWork halts packet processing by acquiring workMu; ResumeWork releases
        // it again. Only to be used in tests.
   412  func (e *endpoint) StopWork() {
   413  	e.workMu.Lock()
   414  }
   415  
   416  // ResumeWork resumes packet processing by releasing workMu, which StopWork
        // acquired. Only to be used in tests.
   417  func (e *endpoint) ResumeWork() {
   418  	e.workMu.Unlock()
   419  }
   420  
   421  // keepalive is a synchronization wrapper used to appease stateify. See the
   422  // comment in endpoint, where it is used.
   423  //
   424  // +stateify savable
   425  type keepalive struct {
   426  	sync.Mutex // NOTE(review): presumably guards the fields below — confirm against callers
   427  	enabled  bool
   428  	idle     time.Duration // newEndpoint defaults this to 2 hours
   429  	interval time.Duration // newEndpoint defaults this to 75 seconds
   430  	count    int // newEndpoint defaults this to 9
   431  	unacked  int
   432  	timer    timer
   433  	waker    sleep.Waker
   434  }
   435  
   436  func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
   437  	e := &endpoint{
   438  		stack:       stack,
   439  		netProto:    netProto,
   440  		waiterQueue: waiterQueue,
   441  		state:       StateInitial,
   442  		rcvBufSize:  DefaultReceiveBufferSize,
   443  		sndBufSize:  DefaultSendBufferSize,
   444  		sndMTU:      int(math.MaxInt32),
   445  		reuseAddr:   true,
   446  		keepalive: keepalive{
   447  			// Linux defaults.
   448  			idle:     2 * time.Hour,
   449  			interval: 75 * time.Second,
   450  			count:    9,
   451  		},
   452  	}
   453  
   454  	var ss SendBufferSizeOption
   455  	if err := stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
   456  		e.sndBufSize = ss.Default
   457  	}
   458  
   459  	var rs ReceiveBufferSizeOption
   460  	if err := stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
   461  		e.rcvBufSize = rs.Default
   462  	}
   463  
   464  	var cs tcpip.CongestionControlOption
   465  	if err := stack.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
   466  		e.cc = cs
   467  	}
   468  
   469  	var mrb tcpip.ModerateReceiveBufferOption
   470  	if err := stack.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
   471  		e.rcvAutoParams.disabled = !bool(mrb)
   472  	}
   473  
   474  	if p := stack.GetTCPProbe(); p != nil {
   475  		e.probe = p
   476  	}
   477  
   478  	e.segmentQueue.setLimit(MaxUnprocessedSegments)
   479  	e.workMu.Init()
   480  	e.workMu.Lock()
   481  	e.tsOffset = timeStampOffset()
   482  
   483  	return e
   484  }
   485  
   486  // Readiness returns the current readiness of the endpoint. For example, if
   487  // waiter.EventIn is set, the endpoint is immediately readable.
   488  func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
   489  	result := waiter.EventMask(0)
   490  
   491  	e.mu.RLock()
   492  	defer e.mu.RUnlock()
   493  
   494  	switch e.state {
   495  	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
   496  		// Ready for nothing.
   497  
   498  	case StateClose, StateError:
   499  		// Ready for anything.
   500  		result = mask
   501  
   502  	case StateListen:
   503  		// Check if there's anything in the accepted channel.
   504  		if (mask & waiter.EventIn) != 0 {
   505  			if len(e.acceptedChan) > 0 {
   506  				result |= waiter.EventIn
   507  			}
   508  		}
   509  	}
   510  	if e.state.connected() {
   511  		// Determine if the endpoint is writable if requested.
   512  		if (mask & waiter.EventOut) != 0 {
   513  			e.sndBufMu.Lock()
   514  			if e.sndClosed || e.sndBufUsed < e.sndBufSize {
   515  				result |= waiter.EventOut
   516  			}
   517  			e.sndBufMu.Unlock()
   518  		}
   519  
   520  		// Determine if the endpoint is readable if requested.
   521  		if (mask & waiter.EventIn) != 0 {
   522  			e.rcvListMu.Lock()
   523  			if e.rcvBufUsed > 0 || e.rcvClosed {
   524  				result |= waiter.EventIn
   525  			}
   526  			e.rcvListMu.Unlock()
   527  		}
   528  	}
   529  
   530  	return result
   531  }
   532  
   533  func (e *endpoint) fetchNotifications() uint32 {
   534  	return atomic.SwapUint32(&e.notifyFlags, 0)
   535  }
   536  
   537  func (e *endpoint) notifyProtocolGoroutine(n uint32) {
   538  	for {
   539  		v := atomic.LoadUint32(&e.notifyFlags)
   540  		if v&n == n {
   541  			// The flags are already set.
   542  			return
   543  		}
   544  
   545  		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
   546  			if v == 0 {
   547  				// We are causing a transition from no flags to
   548  				// at least one flag set, so we must cause the
   549  				// protocol goroutine to wake up.
   550  				e.notificationWaker.Assert()
   551  			}
   552  			return
   553  		}
   554  	}
   555  }
   556  
   557  // Close puts the endpoint in a closed state and frees all resources associated
   558  // with it. It must be called only once and with no other concurrent calls to
   559  // the endpoint.
   560  func (e *endpoint) Close() {
   561  	// Issue a shutdown so that the peer knows we won't send any more data
   562  	// if we're connected, or stop accepting if we're listening.
   563  	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
   564  
   565  	e.mu.Lock()
   566  
   567  	// For listening sockets, we always release ports inline so that they
   568  	// are immediately available for reuse after Close() is called. If also
   569  	// registered, we unregister as well otherwise the next user would fail
   570  	// in Listen() when trying to register.
   571  	if e.state == StateListen && e.isPortReserved {
   572  		if e.isRegistered {
   573  			e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice)
   574  			e.isRegistered = false
   575  		}
   576  
   577  		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort, e.bindToDevice)
   578  		e.isPortReserved = false
   579  	}
   580  
   581  	// Either perform the local cleanup or kick the worker to make sure it
   582  	// knows it needs to cleanup.
   583  	tcpip.AddDanglingEndpoint(e)
   584  	if !e.workerRunning {
   585  		e.cleanupLocked()
   586  	} else {
   587  		e.workerCleanup = true
   588  		e.notifyProtocolGoroutine(notifyClose)
   589  	}
   590  
   591  	e.mu.Unlock()
   592  }
   593  
   594  // closePendingAcceptableConnectionsLocked closes all connections that have
   595  // completed handshake but not yet been delivered to the application. On
        // return acceptedChan has been closed and set to nil.
   596  func (e *endpoint) closePendingAcceptableConnectionsLocked() {
   597  	done := make(chan struct{})
   598  	// Spin a goroutine up as ranging on e.acceptedChan will just block when
   599  	// there are no more connections in the channel. Using a non-blocking
   600  	// select does not work as it can potentially select the default case
   601  	// even when there are pending writes but that are not yet written to
   602  	// the channel.
   603  	go func() {
   604  		defer close(done)
   605  		for n := range e.acceptedChan {
   606  			n.mu.Lock()
   607  			n.resetConnectionLocked(tcpip.ErrConnectionAborted)
   608  			n.mu.Unlock()
   609  			n.Close()
   610  		}
   611  	}()
   612  	// pendingAccepted(see endpoint.deliverAccepted) tracks the number of
   613  	// endpoints which have completed handshake but are not yet written to
   614  	// the e.acceptedChan. We wait here till the goroutine above can drain
   615  	// all such connections from e.acceptedChan.
   616  	e.pendingAccepted.Wait()
   617  	close(e.acceptedChan) // ends the range loop in the drain goroutine
   618  	<-done // wait until every drained connection has been reset and closed
   619  	e.acceptedChan = nil
   620  }
   621  
   622  // cleanupLocked frees all resources associated with the endpoint. It is called
   623  // after Close() is called and the worker goroutine (if any) is done with its
   624  // work. Close invokes it with e.mu held.
   625  func (e *endpoint) cleanupLocked() {
   626  	// Close all endpoints that might have been accepted by TCP but not by
   627  	// the client.
   628  	if e.acceptedChan != nil {
   629  		e.closePendingAcceptableConnectionsLocked()
   630  	}
   631  	e.workerCleanup = false
   632  
   633  	if e.isRegistered { // drop the registration with the stack, if still present
   634  		e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice)
   635  		e.isRegistered = false
   636  	}
   637  
   638  	if e.isPortReserved { // give the local port back, if still reserved
   639  		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort, e.bindToDevice)
   640  		e.isPortReserved = false
   641  	}
   642  
   643  	e.route.Release()
   644  	tcpip.DeleteDanglingEndpoint(e) // undoes the AddDanglingEndpoint call made by Close
   645  }
   646  
   647  // initialReceiveWindow returns the initial receive window to advertise in the
   648  // SYN/SYN-ACK.
   649  func (e *endpoint) initialReceiveWindow() int {
   650  	rcvWnd := e.receiveBufferAvailable()
   651  	if rcvWnd > math.MaxUint16 {
   652  		rcvWnd = math.MaxUint16
   653  	}
   654  	routeWnd := InitialCwnd * int(mssForRoute(&e.route)) * 2
   655  	if rcvWnd > routeWnd {
   656  		rcvWnd = routeWnd
   657  	}
   658  	return rcvWnd
   659  }
   660  
   661  // ModerateRecvBuf adjusts the receive buffer and the advertised window
   662  // based on the number of bytes copied to user space.
   663  func (e *endpoint) ModerateRecvBuf(copied int) {
   664  	e.rcvListMu.Lock()
   665  	if e.rcvAutoParams.disabled {
   666  		e.rcvListMu.Unlock()
   667  		return
   668  	}
   669  	now := time.Now()
   670  	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
   671  		e.rcvAutoParams.copied += copied
   672  		e.rcvListMu.Unlock()
   673  		return
   674  	}
   675  	prevRTTCopied := e.rcvAutoParams.copied + copied
   676  	prevCopied := e.rcvAutoParams.prevCopied
   677  	rcvWnd := 0
   678  	if prevRTTCopied > prevCopied {
   679  		// The minimal receive window based on what was copied by the app
   680  		// in the immediate preceding RTT and some extra buffer for 16
   681  		// segments to account for variations.
   682  		// We multiply by 2 to account for packet losses.
   683  		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)
   684  
   685  		// Scale for slow start based on bytes copied in this RTT vs previous.
   686  		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied
   687  
   688  		// Multiply growth factor by 2 again to account for sender being
   689  		// in slow-start where the sender grows it's congestion window
   690  		// by 100% per RTT.
   691  		rcvWnd += grow * 2
   692  
   693  		// Make sure auto tuned buffer size can always receive upto 2x
   694  		// the initial window of 10 segments.
   695  		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
   696  			rcvWnd = minRcvWnd
   697  		}
   698  
   699  		// Cap the auto tuned buffer size by the maximum permissible
   700  		// receive buffer size.
   701  		if max := e.maxReceiveBufferSize(); rcvWnd > max {
   702  			rcvWnd = max
   703  		}
   704  
   705  		// We do not adjust downwards as that can cause the receiver to
   706  		// reject valid data that might already be in flight as the
   707  		// acceptable window will shrink.
   708  		if rcvWnd > e.rcvBufSize {
   709  			e.rcvBufSize = rcvWnd
   710  			e.notifyProtocolGoroutine(notifyReceiveWindowChanged)
   711  		}
   712  
   713  		// We only update prevCopied when we grow the buffer because in cases
   714  		// where prevCopied > prevRTTCopied the existing buffer is already big
   715  		// enough to handle the current rate and we don't need to do any
   716  		// adjustments.
   717  		e.rcvAutoParams.prevCopied = prevRTTCopied
   718  	}
   719  	e.rcvAutoParams.measureTime = now
   720  	e.rcvAutoParams.copied = 0
   721  	e.rcvListMu.Unlock()
   722  }
   723  
   724  // IPTables implements tcpip.Endpoint.IPTables.
   725  func (e *endpoint) IPTables() (iptables.IPTables, error) {
   726  	return e.stack.IPTables(), nil
   727  }
   728  
   729  // Read reads data from the endpoint.
   730  func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
   731  	e.mu.RLock()
   732  	// The endpoint can be read if it's connected, or if it's already closed
   733  	// but has some pending unread data. Also note that a RST being received
   734  	// would cause the state to become StateError so we should allow the
   735  	// reads to proceed before returning a ECONNRESET.
   736  	e.rcvListMu.Lock()
   737  	bufUsed := e.rcvBufUsed
   738  	if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
   739  		e.rcvListMu.Unlock()
   740  		he := e.hardError
   741  		e.mu.RUnlock()
   742  		if s == StateError {
   743  			return buffer.View{}, tcpip.ControlMessages{}, he
   744  		}
   745  		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
   746  	}
   747  
   748  	v, err := e.readLocked()
   749  	e.rcvListMu.Unlock()
   750  
   751  	e.mu.RUnlock()
   752  
   753  	return v, tcpip.ControlMessages{}, err
   754  }
   755  
   756  func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
   757  	if e.rcvBufUsed == 0 {
   758  		if e.rcvClosed || !e.state.connected() {
   759  			return buffer.View{}, tcpip.ErrClosedForReceive
   760  		}
   761  		return buffer.View{}, tcpip.ErrWouldBlock
   762  	}
   763  
   764  	s := e.rcvList.Front()
   765  	views := s.data.Views()
   766  	v := views[s.viewToDeliver]
   767  	s.viewToDeliver++
   768  
   769  	if s.viewToDeliver >= len(views) {
   770  		e.rcvList.Remove(s)
   771  		s.decRef()
   772  	}
   773  
   774  	e.rcvBufUsed -= len(v)
   775  	// If the window was zero before this read and if the read freed up
   776  	// enough buffer space for the scaled window to be non-zero then notify
   777  	// the protocol goroutine to send a window update.
   778  	if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) {
   779  		e.zeroWindow = false
   780  		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
   781  	}
   782  
   783  	return v, nil
   784  }
   785  
   786  // isEndpointWritableLocked checks if a given endpoint is writable
   787  // and also returns the number of bytes that can be written at this
   788  // moment. If the endpoint is not writable then it returns an error
   789  // indicating the reason why it's not writable.
   790  // Caller must hold e.mu and e.sndBufMu
   791  func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
   792  	// The endpoint cannot be written to if it's not connected.
   793  	if !e.state.connected() {
   794  		switch e.state {
   795  		case StateError:
   796  			return 0, e.hardError
   797  		default:
   798  			return 0, tcpip.ErrClosedForSend
   799  		}
   800  	}
   801  
   802  	// Check if the connection has already been closed for sends.
   803  	if e.sndClosed {
   804  		return 0, tcpip.ErrClosedForSend
   805  	}
   806  
   807  	avail := e.sndBufSize - e.sndBufUsed
   808  	if avail <= 0 {
   809  		return 0, tcpip.ErrWouldBlock
   810  	}
   811  	return avail, nil
   812  }
   813  
// Write writes data to the endpoint's peer.
//
// It fetches up to the currently available send-buffer space from p, queues
// the data as a single segment on the send queue and returns the number of
// bytes queued. The returned channel is always nil. If the endpoint is not
// writable, the error from isEndpointWritableLocked is returned; if fetching
// the payload fails, that error is returned and nothing is queued.
//
// When opts.Atomic is set, e.mu and e.sndBufMu stay held while the payload is
// fetched; otherwise they are dropped around the fetch and the endpoint state
// is re-validated afterwards.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.mu.RLock()
	e.sndBufMu.Lock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.sndBufMu.Unlock()
		e.mu.RUnlock()
		return 0, nil, err
	}

	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndBufMu.Unlock()
		e.mu.RUnlock()
	}

	// Fetch data.
	v, perr := p.Payload(avail)
	if perr != nil || len(v) == 0 {
		if opts.Atomic { // See above.
			e.sndBufMu.Unlock()
			e.mu.RUnlock()
		}
		// Note that perr may be nil if len(v) == 0.
		return 0, nil, perr
	}

	if !opts.Atomic { // See above.
		// Re-acquire the locks in the same order as at the top of the
		// function.
		e.mu.RLock()
		e.sndBufMu.Lock()

		// Because we released the lock before copying, check state again
		// to make sure the endpoint is still in a valid state for a write.
		avail, err = e.isEndpointWritableLocked()
		if err != nil {
			e.sndBufMu.Unlock()
			e.mu.RUnlock()
			return 0, nil, err
		}

		// Discard any excess data copied in due to avail being reduced due
		// to a simultaneous write call to the socket.
		if avail < len(v) {
			v = v[:avail]
		}
	}

	// Add data to the send queue.
	s := newSegmentFromView(&e.route, e.id, v)
	e.sndBufUsed += len(v)
	e.sndBufInQueue += seqnum.Size(len(v))
	e.sndQueue.PushBack(s)
	e.sndBufMu.Unlock()
	// Release the endpoint lock to prevent deadlocks due to lock
	// order inversion when acquiring workMu.
	e.mu.RUnlock()

	// If nobody else holds the work mutex, process the send queue on the
	// caller's goroutine; otherwise wake whoever is responsible for it.
	if e.workMu.TryLock() {
		// Do the work inline.
		e.handleWrite()
		e.workMu.Unlock()
	} else {
		// Let the protocol goroutine do the work.
		e.sndWaker.Assert()
	}

	return int64(len(v)), nil, nil
}
   892  
   893  // Peek reads data without consuming it from the endpoint.
   894  //
   895  // This method does not block if there is no data pending.
   896  func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
   897  	e.mu.RLock()
   898  	defer e.mu.RUnlock()
   899  
   900  	// The endpoint can be read if it's connected, or if it's already closed
   901  	// but has some pending unread data.
   902  	if s := e.state; !s.connected() && s != StateClose {
   903  		if s == StateError {
   904  			return 0, tcpip.ControlMessages{}, e.hardError
   905  		}
   906  		return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
   907  	}
   908  
   909  	e.rcvListMu.Lock()
   910  	defer e.rcvListMu.Unlock()
   911  
   912  	if e.rcvBufUsed == 0 {
   913  		if e.rcvClosed || !e.state.connected() {
   914  			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
   915  		}
   916  		return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
   917  	}
   918  
   919  	// Make a copy of vec so we can modify the slide headers.
   920  	vec = append([][]byte(nil), vec...)
   921  
   922  	var num int64
   923  	for s := e.rcvList.Front(); s != nil; s = s.Next() {
   924  		views := s.data.Views()
   925  
   926  		for i := s.viewToDeliver; i < len(views); i++ {
   927  			v := views[i]
   928  
   929  			for len(v) > 0 {
   930  				if len(vec) == 0 {
   931  					return num, tcpip.ControlMessages{}, nil
   932  				}
   933  				if len(vec[0]) == 0 {
   934  					vec = vec[1:]
   935  					continue
   936  				}
   937  
   938  				n := copy(vec[0], v)
   939  				v = v[n:]
   940  				vec[0] = vec[0][n:]
   941  				num += int64(n)
   942  			}
   943  		}
   944  	}
   945  
   946  	return num, tcpip.ControlMessages{}, nil
   947  }
   948  
   949  // zeroReceiveWindow checks if the receive window to be announced now would be
   950  // zero, based on the amount of available buffer and the receive window scaling.
   951  //
   952  // It must be called with rcvListMu held.
   953  func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
   954  	if e.rcvBufUsed >= e.rcvBufSize {
   955  		return true
   956  	}
   957  
   958  	return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
   959  }
   960  
   961  // SetSockOptInt sets a socket option.
   962  func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
   963  	switch opt {
   964  	case tcpip.ReceiveBufferSizeOption:
   965  		// Make sure the receive buffer size is within the min and max
   966  		// allowed.
   967  		var rs ReceiveBufferSizeOption
   968  		size := int(v)
   969  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
   970  			if size < rs.Min {
   971  				size = rs.Min
   972  			}
   973  			if size > rs.Max {
   974  				size = rs.Max
   975  			}
   976  		}
   977  
   978  		mask := uint32(notifyReceiveWindowChanged)
   979  
   980  		e.rcvListMu.Lock()
   981  
   982  		// Make sure the receive buffer size allows us to send a
   983  		// non-zero window size.
   984  		scale := uint8(0)
   985  		if e.rcv != nil {
   986  			scale = e.rcv.rcvWndScale
   987  		}
   988  		if size>>scale == 0 {
   989  			size = 1 << scale
   990  		}
   991  
   992  		// Make sure 2*size doesn't overflow.
   993  		if size > math.MaxInt32/2 {
   994  			size = math.MaxInt32 / 2
   995  		}
   996  
   997  		e.rcvBufSize = size
   998  		e.rcvAutoParams.disabled = true
   999  		if e.zeroWindow && !e.zeroReceiveWindow(scale) {
  1000  			e.zeroWindow = false
  1001  			mask |= notifyNonZeroReceiveWindow
  1002  		}
  1003  		e.rcvListMu.Unlock()
  1004  
  1005  		e.notifyProtocolGoroutine(mask)
  1006  		return nil
  1007  
  1008  	case tcpip.SendBufferSizeOption:
  1009  		// Make sure the send buffer size is within the min and max
  1010  		// allowed.
  1011  		size := int(v)
  1012  		var ss SendBufferSizeOption
  1013  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
  1014  			if size < ss.Min {
  1015  				size = ss.Min
  1016  			}
  1017  			if size > ss.Max {
  1018  				size = ss.Max
  1019  			}
  1020  		}
  1021  
  1022  		e.sndBufMu.Lock()
  1023  		e.sndBufSize = size
  1024  		e.sndBufMu.Unlock()
  1025  		return nil
  1026  
  1027  	default:
  1028  		return nil
  1029  	}
  1030  }
  1031  
  1032  // SetSockOpt sets a socket option.
  1033  func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
  1034  	switch v := opt.(type) {
  1035  	case tcpip.DelayOption:
  1036  		if v == 0 {
  1037  			atomic.StoreUint32(&e.delay, 0)
  1038  
  1039  			// Handle delayed data.
  1040  			e.sndWaker.Assert()
  1041  		} else {
  1042  			atomic.StoreUint32(&e.delay, 1)
  1043  		}
  1044  		return nil
  1045  
  1046  	case tcpip.CorkOption:
  1047  		if v == 0 {
  1048  			atomic.StoreUint32(&e.cork, 0)
  1049  
  1050  			// Handle the corked data.
  1051  			e.sndWaker.Assert()
  1052  		} else {
  1053  			atomic.StoreUint32(&e.cork, 1)
  1054  		}
  1055  		return nil
  1056  
  1057  	case tcpip.ReuseAddressOption:
  1058  		e.mu.Lock()
  1059  		e.reuseAddr = v != 0
  1060  		e.mu.Unlock()
  1061  		return nil
  1062  
  1063  	case tcpip.ReusePortOption:
  1064  		e.mu.Lock()
  1065  		e.reusePort = v != 0
  1066  		e.mu.Unlock()
  1067  		return nil
  1068  
  1069  	case tcpip.BindToDeviceOption:
  1070  		e.mu.Lock()
  1071  		defer e.mu.Unlock()
  1072  		if v == "" {
  1073  			e.bindToDevice = 0
  1074  			return nil
  1075  		}
  1076  		for nicid, nic := range e.stack.NICInfo() {
  1077  			if nic.Name == string(v) {
  1078  				e.bindToDevice = nicid
  1079  				return nil
  1080  			}
  1081  		}
  1082  		return tcpip.ErrUnknownDevice
  1083  
  1084  	case tcpip.QuickAckOption:
  1085  		if v == 0 {
  1086  			atomic.StoreUint32(&e.slowAck, 1)
  1087  		} else {
  1088  			atomic.StoreUint32(&e.slowAck, 0)
  1089  		}
  1090  		return nil
  1091  
  1092  	case tcpip.MaxSegOption:
  1093  		userMSS := v
  1094  		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
  1095  			return tcpip.ErrInvalidOptionValue
  1096  		}
  1097  		e.mu.Lock()
  1098  		e.userMSS = int(userMSS)
  1099  		e.mu.Unlock()
  1100  		e.notifyProtocolGoroutine(notifyMSSChanged)
  1101  		return nil
  1102  
  1103  	case tcpip.V6OnlyOption:
  1104  		// We only recognize this option on v6 endpoints.
  1105  		if e.netProto != header.IPv6ProtocolNumber {
  1106  			return tcpip.ErrInvalidEndpointState
  1107  		}
  1108  
  1109  		e.mu.Lock()
  1110  		defer e.mu.Unlock()
  1111  
  1112  		// We only allow this to be set when we're in the initial state.
  1113  		if e.state != StateInitial {
  1114  			return tcpip.ErrInvalidEndpointState
  1115  		}
  1116  
  1117  		e.v6only = v != 0
  1118  		return nil
  1119  
  1120  	case tcpip.TTLOption:
  1121  		e.mu.Lock()
  1122  		e.ttl = uint8(v)
  1123  		e.mu.Unlock()
  1124  		return nil
  1125  
  1126  	case tcpip.KeepaliveEnabledOption:
  1127  		e.keepalive.Lock()
  1128  		e.keepalive.enabled = v != 0
  1129  		e.keepalive.Unlock()
  1130  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1131  		return nil
  1132  
  1133  	case tcpip.KeepaliveIdleOption:
  1134  		e.keepalive.Lock()
  1135  		e.keepalive.idle = time.Duration(v)
  1136  		e.keepalive.Unlock()
  1137  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1138  		return nil
  1139  
  1140  	case tcpip.KeepaliveIntervalOption:
  1141  		e.keepalive.Lock()
  1142  		e.keepalive.interval = time.Duration(v)
  1143  		e.keepalive.Unlock()
  1144  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1145  		return nil
  1146  
  1147  	case tcpip.KeepaliveCountOption:
  1148  		e.keepalive.Lock()
  1149  		e.keepalive.count = int(v)
  1150  		e.keepalive.Unlock()
  1151  		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
  1152  		return nil
  1153  
  1154  	case tcpip.BroadcastOption:
  1155  		e.mu.Lock()
  1156  		e.broadcast = v != 0
  1157  		e.mu.Unlock()
  1158  		return nil
  1159  
  1160  	case tcpip.CongestionControlOption:
  1161  		// Query the available cc algorithms in the stack and
  1162  		// validate that the specified algorithm is actually
  1163  		// supported in the stack.
  1164  		var avail tcpip.AvailableCongestionControlOption
  1165  		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
  1166  			return err
  1167  		}
  1168  		availCC := strings.Split(string(avail), " ")
  1169  		for _, cc := range availCC {
  1170  			if v == tcpip.CongestionControlOption(cc) {
  1171  				// Acquire the work mutex as we may need to
  1172  				// reinitialize the congestion control state.
  1173  				e.mu.Lock()
  1174  				state := e.state
  1175  				e.cc = v
  1176  				e.mu.Unlock()
  1177  				switch state {
  1178  				case StateEstablished:
  1179  					e.workMu.Lock()
  1180  					e.mu.Lock()
  1181  					if e.state == state {
  1182  						e.snd.cc = e.snd.initCongestionControl(e.cc)
  1183  					}
  1184  					e.mu.Unlock()
  1185  					e.workMu.Unlock()
  1186  				}
  1187  				return nil
  1188  			}
  1189  		}
  1190  
  1191  		// Linux returns ENOENT when an invalid congestion
  1192  		// control algorithm is specified.
  1193  		return tcpip.ErrNoSuchFile
  1194  	default:
  1195  		return nil
  1196  	}
  1197  }
  1198  
  1199  // readyReceiveSize returns the number of bytes ready to be received.
  1200  func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
  1201  	e.mu.RLock()
  1202  	defer e.mu.RUnlock()
  1203  
  1204  	// The endpoint cannot be in listen state.
  1205  	if e.state == StateListen {
  1206  		return 0, tcpip.ErrInvalidEndpointState
  1207  	}
  1208  
  1209  	e.rcvListMu.Lock()
  1210  	defer e.rcvListMu.Unlock()
  1211  
  1212  	return e.rcvBufUsed, nil
  1213  }
  1214  
  1215  // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
  1216  func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
  1217  	switch opt {
  1218  	case tcpip.ReceiveQueueSizeOption:
  1219  		return e.readyReceiveSize()
  1220  	case tcpip.SendBufferSizeOption:
  1221  		e.sndBufMu.Lock()
  1222  		v := e.sndBufSize
  1223  		e.sndBufMu.Unlock()
  1224  		return v, nil
  1225  
  1226  	case tcpip.ReceiveBufferSizeOption:
  1227  		e.rcvListMu.Lock()
  1228  		v := e.rcvBufSize
  1229  		e.rcvListMu.Unlock()
  1230  		return v, nil
  1231  
  1232  	}
  1233  	return -1, tcpip.ErrUnknownProtocolOption
  1234  }
  1235  
  1236  // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
  1237  func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
  1238  	switch o := opt.(type) {
  1239  	case tcpip.ErrorOption:
  1240  		e.lastErrorMu.Lock()
  1241  		err := e.lastError
  1242  		e.lastError = nil
  1243  		e.lastErrorMu.Unlock()
  1244  		return err
  1245  
  1246  	case *tcpip.MaxSegOption:
  1247  		// This is just stubbed out. Linux never returns the user_mss
  1248  		// value as it either returns the defaultMSS or returns the
  1249  		// actual current MSS. Netstack just returns the defaultMSS
  1250  		// always for now.
  1251  		*o = header.TCPDefaultMSS
  1252  		return nil
  1253  
  1254  	case *tcpip.DelayOption:
  1255  		*o = 0
  1256  		if v := atomic.LoadUint32(&e.delay); v != 0 {
  1257  			*o = 1
  1258  		}
  1259  		return nil
  1260  
  1261  	case *tcpip.CorkOption:
  1262  		*o = 0
  1263  		if v := atomic.LoadUint32(&e.cork); v != 0 {
  1264  			*o = 1
  1265  		}
  1266  		return nil
  1267  
  1268  	case *tcpip.ReuseAddressOption:
  1269  		e.mu.RLock()
  1270  		v := e.reuseAddr
  1271  		e.mu.RUnlock()
  1272  
  1273  		*o = 0
  1274  		if v {
  1275  			*o = 1
  1276  		}
  1277  		return nil
  1278  
  1279  	case *tcpip.ReusePortOption:
  1280  		e.mu.RLock()
  1281  		v := e.reusePort
  1282  		e.mu.RUnlock()
  1283  
  1284  		*o = 0
  1285  		if v {
  1286  			*o = 1
  1287  		}
  1288  		return nil
  1289  
  1290  	case *tcpip.BindToDeviceOption:
  1291  		e.mu.RLock()
  1292  		defer e.mu.RUnlock()
  1293  		if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
  1294  			*o = tcpip.BindToDeviceOption(nic.Name)
  1295  			return nil
  1296  		}
  1297  		*o = ""
  1298  		return nil
  1299  
  1300  	case *tcpip.QuickAckOption:
  1301  		*o = 1
  1302  		if v := atomic.LoadUint32(&e.slowAck); v != 0 {
  1303  			*o = 0
  1304  		}
  1305  		return nil
  1306  
  1307  	case *tcpip.V6OnlyOption:
  1308  		// We only recognize this option on v6 endpoints.
  1309  		if e.netProto != header.IPv6ProtocolNumber {
  1310  			return tcpip.ErrUnknownProtocolOption
  1311  		}
  1312  
  1313  		e.mu.Lock()
  1314  		v := e.v6only
  1315  		e.mu.Unlock()
  1316  
  1317  		*o = 0
  1318  		if v {
  1319  			*o = 1
  1320  		}
  1321  		return nil
  1322  
  1323  	case *tcpip.TTLOption:
  1324  		e.mu.Lock()
  1325  		*o = tcpip.TTLOption(e.ttl)
  1326  		e.mu.Unlock()
  1327  		return nil
  1328  
  1329  	case *tcpip.TCPInfoOption:
  1330  		*o = tcpip.TCPInfoOption{}
  1331  		e.mu.RLock()
  1332  		snd := e.snd
  1333  		e.mu.RUnlock()
  1334  		if snd != nil {
  1335  			snd.rtt.Lock()
  1336  			o.RTT = snd.rtt.srtt
  1337  			o.RTTVar = snd.rtt.rttvar
  1338  			snd.rtt.Unlock()
  1339  		}
  1340  		return nil
  1341  
  1342  	case *tcpip.KeepaliveEnabledOption:
  1343  		e.keepalive.Lock()
  1344  		v := e.keepalive.enabled
  1345  		e.keepalive.Unlock()
  1346  
  1347  		*o = 0
  1348  		if v {
  1349  			*o = 1
  1350  		}
  1351  		return nil
  1352  
  1353  	case *tcpip.KeepaliveIdleOption:
  1354  		e.keepalive.Lock()
  1355  		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
  1356  		e.keepalive.Unlock()
  1357  		return nil
  1358  
  1359  	case *tcpip.KeepaliveIntervalOption:
  1360  		e.keepalive.Lock()
  1361  		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
  1362  		e.keepalive.Unlock()
  1363  		return nil
  1364  
  1365  	case *tcpip.KeepaliveCountOption:
  1366  		e.keepalive.Lock()
  1367  		*o = tcpip.KeepaliveCountOption(e.keepalive.count)
  1368  		e.keepalive.Unlock()
  1369  		return nil
  1370  
  1371  	case *tcpip.OutOfBandInlineOption:
  1372  		// We don't currently support disabling this option.
  1373  		*o = 1
  1374  		return nil
  1375  
  1376  	case *tcpip.BroadcastOption:
  1377  		e.mu.Lock()
  1378  		v := e.broadcast
  1379  		e.mu.Unlock()
  1380  
  1381  		*o = 0
  1382  		if v {
  1383  			*o = 1
  1384  		}
  1385  		return nil
  1386  
  1387  	case *tcpip.CongestionControlOption:
  1388  		e.mu.Lock()
  1389  		*o = e.cc
  1390  		e.mu.Unlock()
  1391  		return nil
  1392  
  1393  	default:
  1394  		return tcpip.ErrUnknownProtocolOption
  1395  	}
  1396  }
  1397  
  1398  func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
  1399  	netProto := e.netProto
  1400  	if header.IsV4MappedAddress(addr.Addr) {
  1401  		// Fail if using a v4 mapped address on a v6only endpoint.
  1402  		if e.v6only {
  1403  			return 0, tcpip.ErrNoRoute
  1404  		}
  1405  
  1406  		netProto = header.IPv4ProtocolNumber
  1407  		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
  1408  		if addr.Addr == header.IPv4Any {
  1409  			addr.Addr = ""
  1410  		}
  1411  	}
  1412  
  1413  	// Fail if we're bound to an address length different from the one we're
  1414  	// checking.
  1415  	if l := len(e.id.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) {
  1416  		return 0, tcpip.ErrInvalidEndpointState
  1417  	}
  1418  
  1419  	return netProto, nil
  1420  }
  1421  
// Disconnect implements tcpip.Endpoint.Disconnect.
//
// It is not supported on this endpoint type and always returns
// tcpip.ErrNotSupported.
func (*endpoint) Disconnect() *tcpip.Error {
	return tcpip.ErrNotSupported
}
  1426  
  1427  // Connect connects the endpoint to its peer.
  1428  func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
  1429  	return e.connect(addr, true, true)
  1430  }
  1431  
  1432  // connect connects the endpoint to its peer. In the normal non-S/R case, the
  1433  // new connection is expected to run the main goroutine and perform handshake.
  1434  // In restore of previously connected endpoints, both ends will be passively
  1435  // created (so no new handshaking is done); for stack-accepted connections not
  1436  // yet accepted by the app, they are restored without running the main goroutine
  1437  // here.
  1438  func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (err *tcpip.Error) {
  1439  	e.mu.Lock()
  1440  	defer e.mu.Unlock()
  1441  	defer func() {
  1442  		if err != nil && !err.IgnoreStats() {
  1443  			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
  1444  		}
  1445  	}()
  1446  
  1447  	connectingAddr := addr.Addr
  1448  
  1449  	netProto, err := e.checkV4Mapped(&addr)
  1450  	if err != nil {
  1451  		return err
  1452  	}
  1453  
  1454  	if e.state.connected() {
  1455  		// The endpoint is already connected. If caller hasn't been
  1456  		// notified yet, return success.
  1457  		if !e.isConnectNotified {
  1458  			e.isConnectNotified = true
  1459  			return nil
  1460  		}
  1461  		// Otherwise return that it's already connected.
  1462  		return tcpip.ErrAlreadyConnected
  1463  	}
  1464  
  1465  	nicid := addr.NIC
  1466  	switch e.state {
  1467  	case StateBound:
  1468  		// If we're already bound to a NIC but the caller is requesting
  1469  		// that we use a different one now, we cannot proceed.
  1470  		if e.boundNICID == 0 {
  1471  			break
  1472  		}
  1473  
  1474  		if nicid != 0 && nicid != e.boundNICID {
  1475  			return tcpip.ErrNoRoute
  1476  		}
  1477  
  1478  		nicid = e.boundNICID
  1479  
  1480  	case StateInitial:
  1481  		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
  1482  		// when we find a route.
  1483  
  1484  	case StateConnecting, StateSynSent, StateSynRecv:
  1485  		// A connection request has already been issued but hasn't completed
  1486  		// yet.
  1487  		return tcpip.ErrAlreadyConnecting
  1488  
  1489  	case StateError:
  1490  		return e.hardError
  1491  
  1492  	default:
  1493  		return tcpip.ErrInvalidEndpointState
  1494  	}
  1495  
  1496  	// Find a route to the desired destination.
  1497  	r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
  1498  	if err != nil {
  1499  		return err
  1500  	}
  1501  	defer r.Release()
  1502  
  1503  	origID := e.id
  1504  
  1505  	netProtos := []tcpip.NetworkProtocolNumber{netProto}
  1506  	e.id.LocalAddress = r.LocalAddress
  1507  	e.id.RemoteAddress = r.RemoteAddress
  1508  	e.id.RemotePort = addr.Port
  1509  
  1510  	if e.id.LocalPort != 0 {
  1511  		// The endpoint is bound to a port, attempt to register it.
  1512  		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.id, e, e.reusePort, e.bindToDevice)
  1513  		if err != nil {
  1514  			return err
  1515  		}
  1516  	} else {
  1517  		// The endpoint doesn't have a local port yet, so try to get
  1518  		// one. Make sure that it isn't one that will result in the same
  1519  		// address/port for both local and remote (otherwise this
  1520  		// endpoint would be trying to connect to itself).
  1521  		sameAddr := e.id.LocalAddress == e.id.RemoteAddress
  1522  
  1523  		// Calculate a port offset based on the destination IP/port and
  1524  		// src IP to ensure that for a given tuple (srcIP, destIP,
  1525  		// destPort) the offset used as a starting point is the same to
  1526  		// ensure that we can cycle through the port space effectively.
  1527  		h := jenkins.Sum32(e.stack.PortSeed())
  1528  		h.Write([]byte(e.id.LocalAddress))
  1529  		h.Write([]byte(e.id.RemoteAddress))
  1530  		portBuf := make([]byte, 2)
  1531  		binary.LittleEndian.PutUint16(portBuf, e.id.RemotePort)
  1532  		h.Write(portBuf)
  1533  		portOffset := h.Sum32()
  1534  
  1535  		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, *tcpip.Error) {
  1536  			if sameAddr && p == e.id.RemotePort {
  1537  				return false, nil
  1538  			}
  1539  			// reusePort is false below because connect cannot reuse a port even if
  1540  			// reusePort was set.
  1541  			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.id.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
  1542  				return false, nil
  1543  			}
  1544  
  1545  			id := e.id
  1546  			id.LocalPort = p
  1547  			switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
  1548  			case nil:
  1549  				e.id = id
  1550  				return true, nil
  1551  			case tcpip.ErrPortInUse:
  1552  				return false, nil
  1553  			default:
  1554  				return false, err
  1555  			}
  1556  		}); err != nil {
  1557  			return err
  1558  		}
  1559  	}
  1560  
  1561  	// Remove the port reservation. This can happen when Bind is called
  1562  	// before Connect: in such a case we don't want to hold on to
  1563  	// reservations anymore.
  1564  	if e.isPortReserved {
  1565  		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.bindToDevice)
  1566  		e.isPortReserved = false
  1567  	}
  1568  
  1569  	e.isRegistered = true
  1570  	e.state = StateConnecting
  1571  	e.route = r.Clone()
  1572  	e.boundNICID = nicid
  1573  	e.effectiveNetProtos = netProtos
  1574  	e.connectingAddress = connectingAddr
  1575  
  1576  	e.initGSO()
  1577  
  1578  	// Connect in the restore phase does not perform handshake. Restore its
  1579  	// connection setting here.
  1580  	if !handshake {
  1581  		e.segmentQueue.mu.Lock()
  1582  		for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
  1583  			for s := l.Front(); s != nil; s = s.Next() {
  1584  				s.id = e.id
  1585  				s.route = r.Clone()
  1586  				e.sndWaker.Assert()
  1587  			}
  1588  		}
  1589  		e.segmentQueue.mu.Unlock()
  1590  		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
  1591  		e.state = StateEstablished
  1592  	}
  1593  
  1594  	if run {
  1595  		e.workerRunning = true
  1596  		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
  1597  		go e.protocolMainLoop(handshake)
  1598  	}
  1599  
  1600  	return tcpip.ErrConnectStarted
  1601  }
  1602  
// ConnectEndpoint is not supported. It always returns
// tcpip.ErrInvalidEndpointState.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
	return tcpip.ErrInvalidEndpointState
}
  1607  
  1608  // Shutdown closes the read and/or write end of the endpoint connection to its
  1609  // peer.
  1610  func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
  1611  	e.mu.Lock()
  1612  	defer e.mu.Unlock()
  1613  	e.shutdownFlags |= flags
  1614  
  1615  	switch {
  1616  	case e.state.connected():
  1617  		// Close for read.
  1618  		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
  1619  			// Mark read side as closed.
  1620  			e.rcvListMu.Lock()
  1621  			e.rcvClosed = true
  1622  			rcvBufUsed := e.rcvBufUsed
  1623  			e.rcvListMu.Unlock()
  1624  
  1625  			// If we're fully closed and we have unread data we need to abort
  1626  			// the connection with a RST.
  1627  			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
  1628  				e.notifyProtocolGoroutine(notifyReset)
  1629  				return nil
  1630  			}
  1631  		}
  1632  
  1633  		// Close for write.
  1634  		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
  1635  			e.sndBufMu.Lock()
  1636  
  1637  			if e.sndClosed {
  1638  				// Already closed.
  1639  				e.sndBufMu.Unlock()
  1640  				break
  1641  			}
  1642  
  1643  			// Queue fin segment.
  1644  			s := newSegmentFromView(&e.route, e.id, nil)
  1645  			e.sndQueue.PushBack(s)
  1646  			e.sndBufInQueue++
  1647  
  1648  			// Mark endpoint as closed.
  1649  			e.sndClosed = true
  1650  
  1651  			e.sndBufMu.Unlock()
  1652  
  1653  			// Tell protocol goroutine to close.
  1654  			e.sndCloseWaker.Assert()
  1655  		}
  1656  
  1657  	case e.state == StateListen:
  1658  		// Tell protocolListenLoop to stop.
  1659  		if flags&tcpip.ShutdownRead != 0 {
  1660  			e.notifyProtocolGoroutine(notifyClose)
  1661  		}
  1662  
  1663  	default:
  1664  		return tcpip.ErrNotConnected
  1665  	}
  1666  
  1667  	return nil
  1668  }
  1669  
// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
//
// backlog is the capacity of the channel holding connections that have not
// yet been handed out by Accept. Calling Listen again on an endpoint that is
// already listening resizes that channel, provided the connections currently
// pending fit into the new capacity. Otherwise the endpoint must be in
// StateBound; any other state yields tcpip.ErrInvalidEndpointState.
//
// Failures for which err.IgnoreStats() is false are counted in the stack's
// FailedConnectionAttempts statistic.
func (e *endpoint) Listen(backlog int) (err *tcpip.Error) {
	e.mu.Lock()
	defer e.mu.Unlock()
	defer func() {
		if err != nil && !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		}
	}()

	// Allow the backlog to be adjusted if the endpoint is not shutting down.
	// When the endpoint shuts down, it sets workerCleanup to true, and from
	// that point onward, acceptedChan is the responsibility of the cleanup()
	// method (and should not be touched anywhere else, including here).
	if e.state == StateListen && !e.workerCleanup {
		// Adjust the size of the channel iff we can fix existing
		// pending connections into the new one.
		if len(e.acceptedChan) > backlog {
			return tcpip.ErrInvalidEndpointState
		}
		if cap(e.acceptedChan) == backlog {
			return nil
		}
		// Swap in a channel of the new capacity, then close the old one
		// so that ranging over it drains the pending endpoints and stops.
		origChan := e.acceptedChan
		e.acceptedChan = make(chan *endpoint, backlog)
		close(origChan)
		for ep := range origChan {
			e.acceptedChan <- ep
		}
		return nil
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.state != StateBound {
		return tcpip.ErrInvalidEndpointState
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.reusePort, e.bindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.state = StateListen
	// An existing channel is kept as-is; one is only created when none is
	// present yet.
	if e.acceptedChan == nil {
		e.acceptedChan = make(chan *endpoint, backlog)
	}
	e.workerRunning = true

	// Start the listen loop, passing it the currently available receive
	// buffer space.
	go e.protocolListenLoop(
		seqnum.Size(e.receiveBufferAvailable()))

	return nil
}
  1725  
// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections.
//
// It records waiterQueue as the endpoint's waiter queue and marks the worker
// as running before the goroutine is started. The main loop is started with
// its handshake argument set to false.
func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
	e.waiterQueue = waiterQueue
	e.workerRunning = true
	go e.protocolMainLoop(false)
}
  1733  
  1734  // Accept returns a new endpoint if a peer has established a connection
  1735  // to an endpoint previously set to listen mode.
  1736  func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
  1737  	e.mu.RLock()
  1738  	defer e.mu.RUnlock()
  1739  
  1740  	// Endpoint must be in listen state before it can accept connections.
  1741  	if e.state != StateListen {
  1742  		return nil, nil, tcpip.ErrInvalidEndpointState
  1743  	}
  1744  
  1745  	// Get the new accepted endpoint.
  1746  	var n *endpoint
  1747  	select {
  1748  	case n = <-e.acceptedChan:
  1749  	default:
  1750  		return nil, nil, tcpip.ErrWouldBlock
  1751  	}
  1752  
  1753  	// Start the protocol goroutine.
  1754  	wq := &waiter.Queue{}
  1755  	n.startAcceptedLoop(wq)
  1756  	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
  1757  
  1758  	return n, wq, nil
  1759  }
  1760  
// Bind binds the endpoint to a specific local port and optionally address.
//
// It reserves the port with the stack and, if an address is given, verifies
// that it is one of the stack's local addresses. On success the endpoint
// moves to StateBound. It returns tcpip.ErrAlreadyBound if the endpoint is
// not in its initial state and tcpip.ErrBadLocalAddress if the address is
// not local; errors from checkV4Mapped and from the port reservation are
// returned as they are.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.state != StateInitial {
		return tcpip.ErrAlreadyBound
	}

	// Record the address as supplied by the caller; checkV4Mapped may
	// rewrite addr.Addr below.
	e.bindAddress = addr.Addr
	netProto, err := e.checkV4Mapped(&addr)
	if err != nil {
		return err
	}

	// Expand netProtos to include v4 and v6 if the caller is binding to a
	// wildcard (empty) address, and this is an IPv6 endpoint with v6only
	// set to false.
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
		netProtos = []tcpip.NetworkProtocolNumber{
			header.IPv6ProtocolNumber,
			header.IPv4ProtocolNumber,
		}
	}

	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
	if err != nil {
		return err
	}

	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.id.LocalPort = port

	// Any failures beyond this point must remove the port registration.
	//
	// The deferred function inspects the named result err, so it rolls the
	// reservation back on every error return below. e.bindToDevice is
	// passed as an argument so that the value used for the release is the
	// one in effect when the defer statement runs.
	defer func(bindToDevice tcpip.NICID) {
		if err != nil {
			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
			e.isPortReserved = false
			e.effectiveNetProtos = nil
			e.id.LocalPort = 0
			e.id.LocalAddress = ""
			e.boundNICID = 0
		}
	}(e.bindToDevice)

	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if len(addr.Addr) != 0 {
		nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return tcpip.ErrBadLocalAddress
		}

		e.boundNICID = nic
		e.id.LocalAddress = addr.Addr
	}

	// Mark endpoint as bound.
	e.state = StateBound

	return nil
}
  1828  
  1829  // GetLocalAddress returns the address to which the endpoint is bound.
  1830  func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
  1831  	e.mu.RLock()
  1832  	defer e.mu.RUnlock()
  1833  
  1834  	return tcpip.FullAddress{
  1835  		Addr: e.id.LocalAddress,
  1836  		Port: e.id.LocalPort,
  1837  		NIC:  e.boundNICID,
  1838  	}, nil
  1839  }
  1840  
  1841  // GetRemoteAddress returns the address to which the endpoint is connected.
  1842  func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
  1843  	e.mu.RLock()
  1844  	defer e.mu.RUnlock()
  1845  
  1846  	if !e.state.connected() {
  1847  		return tcpip.FullAddress{}, tcpip.ErrNotConnected
  1848  	}
  1849  
  1850  	return tcpip.FullAddress{
  1851  		Addr: e.id.RemoteAddress,
  1852  		Port: e.id.RemotePort,
  1853  		NIC:  e.boundNICID,
  1854  	}, nil
  1855  }
  1856  
  1857  // HandlePacket is called by the stack when new packets arrive to this transport
  1858  // endpoint.
  1859  func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
  1860  	s := newSegment(r, id, vv)
  1861  	if !s.parse() {
  1862  		e.stack.Stats().MalformedRcvdPackets.Increment()
  1863  		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
  1864  		s.decRef()
  1865  		return
  1866  	}
  1867  
  1868  	if !s.csumValid {
  1869  		e.stack.Stats().MalformedRcvdPackets.Increment()
  1870  		e.stack.Stats().TCP.ChecksumErrors.Increment()
  1871  		s.decRef()
  1872  		return
  1873  	}
  1874  
  1875  	e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
  1876  	if (s.flags & header.TCPFlagRst) != 0 {
  1877  		e.stack.Stats().TCP.ResetsReceived.Increment()
  1878  	}
  1879  
  1880  	// Send packet to worker goroutine.
  1881  	if e.segmentQueue.enqueue(s) {
  1882  		e.newSegmentWaker.Assert()
  1883  	} else {
  1884  		// The queue is full, so we drop the segment.
  1885  		e.stack.Stats().DroppedPackets.Increment()
  1886  		s.decRef()
  1887  	}
  1888  }
  1889  
  1890  // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
  1891  func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
  1892  	switch typ {
  1893  	case stack.ControlPacketTooBig:
  1894  		e.sndBufMu.Lock()
  1895  		e.packetTooBigCount++
  1896  		if v := int(extra); v < e.sndMTU {
  1897  			e.sndMTU = v
  1898  		}
  1899  		e.sndBufMu.Unlock()
  1900  
  1901  		e.notifyProtocolGoroutine(notifyMTUChanged)
  1902  	}
  1903  }
  1904  
  1905  // updateSndBufferUsage is called by the protocol goroutine when room opens up
  1906  // in the send buffer. The number of newly available bytes is v.
  1907  func (e *endpoint) updateSndBufferUsage(v int) {
  1908  	e.sndBufMu.Lock()
  1909  	notify := e.sndBufUsed >= e.sndBufSize>>1
  1910  	e.sndBufUsed -= v
  1911  	// We only notify when there is half the sndBufSize available after
  1912  	// a full buffer event occurs. This ensures that we don't wake up
  1913  	// writers to queue just 1-2 segments and go back to sleep.
  1914  	notify = notify && e.sndBufUsed < e.sndBufSize>>1
  1915  	e.sndBufMu.Unlock()
  1916  
  1917  	if notify {
  1918  		e.waiterQueue.Notify(waiter.EventOut)
  1919  	}
  1920  }
  1921  
  1922  // readyToRead is called by the protocol goroutine when a new segment is ready
  1923  // to be read, or when the connection is closed for receiving (in which case
  1924  // s will be nil).
  1925  func (e *endpoint) readyToRead(s *segment) {
  1926  	e.rcvListMu.Lock()
  1927  	if s != nil {
  1928  		s.incRef()
  1929  		e.rcvBufUsed += s.data.Size()
  1930  		// Check if the receive window is now closed. If so make sure
  1931  		// we set the zero window before we deliver the segment to ensure
  1932  		// that a subsequent read of the segment will correctly trigger
  1933  		// a non-zero notification.
  1934  		if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
  1935  			e.zeroWindow = true
  1936  		}
  1937  		e.rcvList.PushBack(s)
  1938  	} else {
  1939  		e.rcvClosed = true
  1940  	}
  1941  	e.rcvListMu.Unlock()
  1942  
  1943  	e.waiterQueue.Notify(waiter.EventIn)
  1944  }
  1945  
  1946  // receiveBufferAvailableLocked calculates how many bytes are still available
  1947  // in the receive buffer.
  1948  // rcvListMu must be held when this function is called.
  1949  func (e *endpoint) receiveBufferAvailableLocked() int {
  1950  	// We may use more bytes than the buffer size when the receive buffer
  1951  	// shrinks.
  1952  	if e.rcvBufUsed >= e.rcvBufSize {
  1953  		return 0
  1954  	}
  1955  
  1956  	return e.rcvBufSize - e.rcvBufUsed
  1957  }
  1958  
  1959  // receiveBufferAvailable calculates how many bytes are still available in the
  1960  // receive buffer.
  1961  func (e *endpoint) receiveBufferAvailable() int {
  1962  	e.rcvListMu.Lock()
  1963  	available := e.receiveBufferAvailableLocked()
  1964  	e.rcvListMu.Unlock()
  1965  	return available
  1966  }
  1967  
  1968  func (e *endpoint) receiveBufferSize() int {
  1969  	e.rcvListMu.Lock()
  1970  	size := e.rcvBufSize
  1971  	e.rcvListMu.Unlock()
  1972  
  1973  	return size
  1974  }
  1975  
  1976  func (e *endpoint) maxReceiveBufferSize() int {
  1977  	var rs ReceiveBufferSizeOption
  1978  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
  1979  		// As a fallback return the hardcoded max buffer size.
  1980  		return MaxBufferSize
  1981  	}
  1982  	return rs.Max
  1983  }
  1984  
  1985  // rcvWndScaleForHandshake computes the receive window scale to offer to the
  1986  // peer when window scaling is enabled (true by default). If auto-tuning is
  1987  // disabled then the window scaling factor is based on the size of the
  1988  // receiveBuffer otherwise we use the max permissible receive buffer size to
  1989  // compute the scale.
  1990  func (e *endpoint) rcvWndScaleForHandshake() int {
  1991  	bufSizeForScale := e.receiveBufferSize()
  1992  
  1993  	e.rcvListMu.Lock()
  1994  	autoTuningDisabled := e.rcvAutoParams.disabled
  1995  	e.rcvListMu.Unlock()
  1996  	if autoTuningDisabled {
  1997  		return FindWndScale(seqnum.Size(bufSizeForScale))
  1998  	}
  1999  
  2000  	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
  2001  }
  2002  
  2003  // updateRecentTimestamp updates the recent timestamp using the algorithm
  2004  // described in https://tools.ietf.org/html/rfc7323#section-4.3
  2005  func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
  2006  	if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
  2007  		e.recentTS = tsVal
  2008  	}
  2009  }
  2010  
  2011  // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
  2012  // the SYN options indicate that timestamp option was negotiated. It also
  2013  // initializes the recentTS with the value provided in synOpts.TSval.
  2014  func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
  2015  	if synOpts.TS {
  2016  		e.sendTSOk = true
  2017  		e.recentTS = synOpts.TSVal
  2018  	}
  2019  }
  2020  
  2021  // timestamp returns the timestamp value to be used in the TSVal field of the
  2022  // timestamp option for outgoing TCP segments for a given endpoint.
  2023  func (e *endpoint) timestamp() uint32 {
  2024  	return tcpTimeStamp(e.tsOffset)
  2025  }
  2026  
  2027  // tcpTimeStamp returns a timestamp offset by the provided offset. This is
  2028  // not inlined above as it's used when SYN cookies are in use and endpoint
  2029  // is not created at the time when the SYN cookie is sent.
  2030  func tcpTimeStamp(offset uint32) uint32 {
  2031  	now := time.Now()
  2032  	return uint32(now.Unix()*1000+int64(now.Nanosecond()/1e6)) + offset
  2033  }
  2034  
  2035  // timeStampOffset returns a randomized timestamp offset to be used when sending
  2036  // timestamp values in a timestamp option for a TCP segment.
  2037  func timeStampOffset() uint32 {
  2038  	b := make([]byte, 4)
  2039  	if _, err := rand.Read(b); err != nil {
  2040  		panic(err)
  2041  	}
  2042  	// Initialize a random tsOffset that will be added to the recentTS
  2043  	// everytime the timestamp is sent when the Timestamp option is enabled.
  2044  	//
  2045  	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
  2046  	// why this is required.
  2047  	//
  2048  	// NOTE: This is not completely to spec as normally this should be
  2049  	// initialized in a manner analogous to how sequence numbers are
  2050  	// randomized per connection basis. But for now this is sufficient.
  2051  	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
  2052  }
  2053  
  2054  // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
  2055  // if the SYN options indicate that the SACK option was negotiated and the TCP
  2056  // stack is configured to enable TCP SACK option.
  2057  func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
  2058  	var v SACKEnabled
  2059  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
  2060  		// Stack doesn't support SACK. So just return.
  2061  		return
  2062  	}
  2063  	if bool(v) && synOpts.SACKPermitted {
  2064  		e.sackPermitted = true
  2065  	}
  2066  }
  2067  
  2068  // maxOptionSize return the maximum size of TCP options.
  2069  func (e *endpoint) maxOptionSize() (size int) {
  2070  	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
  2071  	options := e.makeOptions(maxSackBlocks[:])
  2072  	size = len(options)
  2073  	putOptions(options)
  2074  
  2075  	return size
  2076  }
  2077  
  2078  // completeState makes a full copy of the endpoint and returns it. This is used
  2079  // before invoking the probe. The state returned may not be fully consistent if
  2080  // there are intervening syscalls when the state is being copied.
  2081  func (e *endpoint) completeState() stack.TCPEndpointState {
  2082  	var s stack.TCPEndpointState
  2083  	s.SegTime = time.Now()
  2084  
  2085  	// Copy EndpointID.
  2086  	e.mu.Lock()
  2087  	s.ID = stack.TCPEndpointID(e.id)
  2088  	e.mu.Unlock()
  2089  
  2090  	// Copy endpoint rcv state.
  2091  	e.rcvListMu.Lock()
  2092  	s.RcvBufSize = e.rcvBufSize
  2093  	s.RcvBufUsed = e.rcvBufUsed
  2094  	s.RcvClosed = e.rcvClosed
  2095  	s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
  2096  	s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
  2097  	s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
  2098  	s.RcvAutoParams.RTT = e.rcvAutoParams.rtt
  2099  	s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
  2100  	s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
  2101  	s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
  2102  	e.rcvListMu.Unlock()
  2103  
  2104  	// Endpoint TCP Option state.
  2105  	s.SendTSOk = e.sendTSOk
  2106  	s.RecentTS = e.recentTS
  2107  	s.TSOffset = e.tsOffset
  2108  	s.SACKPermitted = e.sackPermitted
  2109  	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
  2110  	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
  2111  	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()
  2112  
  2113  	// Copy endpoint send state.
  2114  	e.sndBufMu.Lock()
  2115  	s.SndBufSize = e.sndBufSize
  2116  	s.SndBufUsed = e.sndBufUsed
  2117  	s.SndClosed = e.sndClosed
  2118  	s.SndBufInQueue = e.sndBufInQueue
  2119  	s.PacketTooBigCount = e.packetTooBigCount
  2120  	s.SndMTU = e.sndMTU
  2121  	e.sndBufMu.Unlock()
  2122  
  2123  	// Copy receiver state.
  2124  	s.Receiver = stack.TCPReceiverState{
  2125  		RcvNxt:         e.rcv.rcvNxt,
  2126  		RcvAcc:         e.rcv.rcvAcc,
  2127  		RcvWndScale:    e.rcv.rcvWndScale,
  2128  		PendingBufUsed: e.rcv.pendingBufUsed,
  2129  		PendingBufSize: e.rcv.pendingBufSize,
  2130  	}
  2131  
  2132  	// Copy sender state.
  2133  	s.Sender = stack.TCPSenderState{
  2134  		LastSendTime: e.snd.lastSendTime,
  2135  		DupAckCount:  e.snd.dupAckCount,
  2136  		FastRecovery: stack.TCPFastRecoveryState{
  2137  			Active:    e.snd.fr.active,
  2138  			First:     e.snd.fr.first,
  2139  			Last:      e.snd.fr.last,
  2140  			MaxCwnd:   e.snd.fr.maxCwnd,
  2141  			HighRxt:   e.snd.fr.highRxt,
  2142  			RescueRxt: e.snd.fr.rescueRxt,
  2143  		},
  2144  		SndCwnd:          e.snd.sndCwnd,
  2145  		Ssthresh:         e.snd.sndSsthresh,
  2146  		SndCAAckCount:    e.snd.sndCAAckCount,
  2147  		Outstanding:      e.snd.outstanding,
  2148  		SndWnd:           e.snd.sndWnd,
  2149  		SndUna:           e.snd.sndUna,
  2150  		SndNxt:           e.snd.sndNxt,
  2151  		RTTMeasureSeqNum: e.snd.rttMeasureSeqNum,
  2152  		RTTMeasureTime:   e.snd.rttMeasureTime,
  2153  		Closed:           e.snd.closed,
  2154  		RTO:              e.snd.rto,
  2155  		MaxPayloadSize:   e.snd.maxPayloadSize,
  2156  		SndWndScale:      e.snd.sndWndScale,
  2157  		MaxSentAck:       e.snd.maxSentAck,
  2158  	}
  2159  	e.snd.rtt.Lock()
  2160  	s.Sender.SRTT = e.snd.rtt.srtt
  2161  	s.Sender.SRTTInited = e.snd.rtt.srttInited
  2162  	e.snd.rtt.Unlock()
  2163  
  2164  	if cubic, ok := e.snd.cc.(*cubicState); ok {
  2165  		s.Sender.Cubic = stack.TCPCubicState{
  2166  			WMax:                    cubic.wMax,
  2167  			WLastMax:                cubic.wLastMax,
  2168  			T:                       cubic.t,
  2169  			TimeSinceLastCongestion: time.Since(cubic.t),
  2170  			C:                       cubic.c,
  2171  			K:                       cubic.k,
  2172  			Beta:                    cubic.beta,
  2173  			WC:                      cubic.wC,
  2174  			WEst:                    cubic.wEst,
  2175  		}
  2176  	}
  2177  	return s
  2178  }
  2179  
  2180  func (e *endpoint) initGSO() {
  2181  	if e.route.Capabilities()&stack.CapabilityGSO == 0 {
  2182  		return
  2183  	}
  2184  
  2185  	gso := &stack.GSO{}
  2186  	switch e.route.NetProto {
  2187  	case header.IPv4ProtocolNumber:
  2188  		gso.Type = stack.GSOTCPv4
  2189  		gso.L3HdrLen = header.IPv4MinimumSize
  2190  	case header.IPv6ProtocolNumber:
  2191  		gso.Type = stack.GSOTCPv6
  2192  		gso.L3HdrLen = header.IPv6MinimumSize
  2193  	default:
  2194  		panic(fmt.Sprintf("Unknown netProto: %v", e.netProto))
  2195  	}
  2196  	gso.NeedsCsum = true
  2197  	gso.CsumOffset = header.TCPChecksumOffset
  2198  	gso.MaxSize = e.route.GSOMaxSize()
  2199  	e.gso = gso
  2200  }
  2201  
  2202  // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
  2203  // state for diagnostics.
  2204  func (e *endpoint) State() uint32 {
  2205  	e.mu.Lock()
  2206  	defer e.mu.Unlock()
  2207  	return uint32(e.state)
  2208  }
  2209  
  2210  func mssForRoute(r *stack.Route) uint16 {
  2211  	return uint16(r.MTU() - header.TCPMinimumSize)
  2212  }