github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/tcpip/transport/tcp/connect.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"encoding/binary"
    19  	"fmt"
    20  	"math"
    21  	"time"
    22  
    23  	"github.com/MerlinKodo/gvisor/pkg/sync"
    24  	"github.com/MerlinKodo/gvisor/pkg/tcpip"
    25  	"github.com/MerlinKodo/gvisor/pkg/tcpip/checksum"
    26  	"github.com/MerlinKodo/gvisor/pkg/tcpip/hash/jenkins"
    27  	"github.com/MerlinKodo/gvisor/pkg/tcpip/header"
    28  	"github.com/MerlinKodo/gvisor/pkg/tcpip/seqnum"
    29  	"github.com/MerlinKodo/gvisor/pkg/tcpip/stack"
    30  	"github.com/MerlinKodo/gvisor/pkg/waiter"
    31  )
    32  
// InitialRTO is the initial retransmission timeout. It is also the first
// interval armed on the handshake's SYN/SYN-ACK retransmit backoff timer.
// https://github.com/torvalds/linux/blob/7c636d4d20f/include/net/tcp.h#L142
const InitialRTO = time.Second
    36  
// maxSegmentsPerWake is the maximum number of segments to process in the main
// protocol goroutine per wake-up. Yielding [after this number of segments are
// processed] allows other events to be processed as well (e.g., timeouts,
// resets, etc.). In this file it bounds the loop in handshake.processSegments.
const maxSegmentsPerWake = 100
    42  
// handshakeState identifies the stage a TCP 3-way handshake has reached.
type handshakeState int

// The following are the possible states of the TCP connection during a 3-way
// handshake. A depiction of the states and transitions can be found in RFC 793,
// page 23.
const (
	// handshakeSynSent is the state installed by resetState: our SYN is being
	// sent and the peer's SYN has not been processed yet.
	handshakeSynSent handshakeState = iota
	// handshakeSynRcvd means the peer's SYN has been seen and we are waiting
	// for our own SYN to be acknowledged.
	handshakeSynRcvd
	// handshakeCompleted means the handshake has finished; processSegments
	// stops consuming segments once this state is reached.
	handshakeCompleted
)
    53  
const (
	// maxOptionSize is the maximum space, in bytes, available for TCP options.
	// It is also the size of the scratch buffers handed out by optionPool.
	maxOptionSize = 40
)
    58  
// handshake holds the state used during a TCP 3-way handshake.
//
// NOTE: handshake.ep.mu is held during handshake processing. It is released if
// we are going to block and reacquired when we start processing an event.
//
// +stateify savable
type handshake struct {
	// ep is the endpoint performing the handshake. listenEP, when non-nil, is
	// the endpoint whose accept queue is checked before a handshake in the
	// SYN-RCVD state is completed. state is the current handshake state.
	// active is true for a handshake created by newHandshake and is cleared
	// by resetToSynRcvd (passive handshake). flags are the TCP flags placed
	// on the SYN/SYN-ACK segments we send. ackNum is the acknowledgment
	// number we send; once the peer's SYN has been seen it is the peer's
	// initial sequence number plus one.
	ep       *endpoint
	listenEP *endpoint
	state    handshakeState
	active   bool
	flags    header.TCPFlags
	ackNum   seqnum.Value

	// iss is the initial send sequence number, as defined in RFC 793.
	iss seqnum.Value

	// rcvWnd is the receive window, as defined in RFC 793.
	rcvWnd seqnum.Size

	// sndWnd is the send window, as defined in RFC 793.
	sndWnd seqnum.Size

	// mss is the maximum segment size received from the peer.
	mss uint16

	// sndWndScale is the send window scale, as defined in RFC 1323. A
	// negative value means no scaling is supported by the peer.
	sndWndScale int

	// rcvWndScale is the receive window scale, as defined in RFC 1323.
	rcvWndScale int

	// startTime is the time at which the first SYN/SYN-ACK was sent.
	startTime tcpip.MonotonicTime

	// deferAccept if non-zero will drop the final ACK for a passive
	// handshake till an ACK segment with data is received or the timeout is
	// hit.
	deferAccept time.Duration

	// acked is true if the final ACK for a 3-way handshake has
	// been received. This is required to stop retransmitting the
	// original SYN-ACK when deferAccept is enabled.
	acked bool

	// sendSYNOpts is the cached values for the SYN options to be sent.
	sendSYNOpts header.TCPSynOptions

	// sampleRTTWithTSOnly is true when the segment was retransmitted or we can't
	// tell; then RTT can only be sampled when the incoming segment has timestamp
	// options enabled.
	sampleRTTWithTSOnly bool

	// retransmitTimer is used to retransmit SYN/SYN-ACK with exponential backoff
	// till the handshake is either completed or times out.
	retransmitTimer *backoffTimer `state:"nosave"`
}
   117  
// maybeFailTimerHandler takes a handler function for a timer that may fail and
// returns a function that will invoke the provided handler with the endpoint
// mutex held. In addition the returned function will perform any cleanup that
// may be required if the timer handler returns an error and in case of no
// errors will notify the processor if there are pending segments that need to
// be processed.
//
// NOTE: e.mu is held for the duration of the call to f().
func maybeFailTimerHandler(e *endpoint, f func() tcpip.Error) func() {
	return func() {
		e.mu.Lock()
		if err := f(); err != nil {
			e.lastErrorMu.Lock()
			// If the handler timed out and we have a lastError recorded (maybe due
			// to an ICMP message received), promote it to be the hard error.
			if _, isTimeout := err.(*tcpip.ErrTimeout); e.lastError != nil && isTimeout {
				e.hardError = e.lastError
			} else {
				e.hardError = err
			}
			// The handler's own error always becomes the most recent error.
			e.lastError = err
			e.lastErrorMu.Unlock()
			e.cleanupLocked()
			e.setEndpointState(StateError)
			// Release e.mu before waking waiters.
			e.mu.Unlock()
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			return
		}
		// Pick the processor while e.mu is still held; the queueing itself is
		// done after the mutex has been released.
		processor := e.protocol.dispatcher.selectProcessor(e.ID)
		e.mu.Unlock()

		// notify processor if there are pending segments to be
		// processed.
		if !e.segmentQueue.empty() {
			processor.queueEndpoint(e)
		}
	}
}
   156  
   157  // timerHandler takes a handler function for a timer that never results in a
   158  // connection being aborted and returns a function that will invoke the provided
   159  // handler with the endpoint mutex held. In addition the returned function will
   160  // notify the processor if there are pending segments that need to be processed
   161  // once the handler function completes.
   162  //
   163  // NOTE: e.mu is held for the duration of the call to f()
   164  func timerHandler(e *endpoint, f func()) func() {
   165  	return func() {
   166  		e.mu.Lock()
   167  		f()
   168  		processor := e.protocol.dispatcher.selectProcessor(e.ID)
   169  		e.mu.Unlock()
   170  		// notify processor if there are pending segments to be
   171  		// processed.
   172  		if !e.segmentQueue.empty() {
   173  			processor.queueEndpoint(e)
   174  		}
   175  	}
   176  }
   177  
   178  // +checklocks:e.mu
   179  // +checklocksacquire:h.ep.mu
   180  func (e *endpoint) newHandshake() (h *handshake) {
   181  	h = &handshake{
   182  		ep:          e,
   183  		active:      true,
   184  		rcvWnd:      seqnum.Size(e.initialReceiveWindow()),
   185  		rcvWndScale: e.rcvWndScaleForHandshake(),
   186  	}
   187  	h.ep.AssertLockHeld(e)
   188  	h.resetState()
   189  	// Store reference to handshake state in endpoint.
   190  	e.h = h
   191  	// By the time handshake is created, e.ID is already initialized.
   192  	e.TSOffset = e.protocol.tsOffset(e.ID.LocalAddress, e.ID.RemoteAddress)
   193  	timer, err := newBackoffTimer(h.ep.stack.Clock(), InitialRTO, MaxRTO, maybeFailTimerHandler(e, h.retransmitHandlerLocked))
   194  	if err != nil {
   195  		panic(fmt.Sprintf("newBackOffTimer(_, %s, %s, _) failed: %s", InitialRTO, MaxRTO, err))
   196  	}
   197  	h.retransmitTimer = timer
   198  	return h
   199  }
   200  
// newPassiveHandshake creates a handshake via newHandshake and immediately
// moves it to the SYN-RCVD state using the supplied initial send sequence
// number (isn), the peer's initial sequence number (irs), the options parsed
// from the peer's SYN and the deferAccept duration.
//
// +checklocks:e.mu
// +checklocksacquire:h.ep.mu
func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) (h *handshake) {
	h = e.newHandshake()
	h.resetToSynRcvd(isn, irs, opts, deferAccept)
	return h
}
   208  
   209  // FindWndScale determines the window scale to use for the given maximum window
   210  // size.
   211  func FindWndScale(wnd seqnum.Size) int {
   212  	if wnd < 0x10000 {
   213  		return 0
   214  	}
   215  
   216  	max := seqnum.Size(math.MaxUint16)
   217  	s := 0
   218  	for wnd > max && s < header.MaxWndScale {
   219  		s++
   220  		max <<= 1
   221  	}
   222  
   223  	return s
   224  }
   225  
   226  // resetState resets the state of the handshake object such that it becomes
   227  // ready for a new 3-way handshake.
   228  func (h *handshake) resetState() {
   229  	h.state = handshakeSynSent
   230  	h.flags = header.TCPFlagSyn
   231  	h.ackNum = 0
   232  	h.mss = 0
   233  	h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.protocol.seqnumSecret)
   234  }
   235  
   236  // generateSecureISN generates a secure Initial Sequence number based on the
   237  // recommendation here https://tools.ietf.org/html/rfc6528#page-3.
   238  func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed uint32) seqnum.Value {
   239  	isnHasher := jenkins.Sum32(seed)
   240  	// Per hash.Hash.Writer:
   241  	//
   242  	// It never returns an error.
   243  	_, _ = isnHasher.Write(id.LocalAddress.AsSlice())
   244  	_, _ = isnHasher.Write(id.RemoteAddress.AsSlice())
   245  	portBuf := make([]byte, 2)
   246  	binary.LittleEndian.PutUint16(portBuf, id.LocalPort)
   247  	_, _ = isnHasher.Write(portBuf)
   248  	binary.LittleEndian.PutUint16(portBuf, id.RemotePort)
   249  	_, _ = isnHasher.Write(portBuf)
   250  	// The time period here is 64ns. This is similar to what linux uses
   251  	// generate a sequence number that overlaps less than one
   252  	// time per MSL (2 minutes).
   253  	//
   254  	// A 64ns clock ticks 10^9/64 = 15625000) times in a second.
   255  	// To wrap the whole 32 bit space would require
   256  	// 2^32/1562500 ~ 274 seconds.
   257  	//
   258  	// Which sort of guarantees that we won't reuse the ISN for a new
   259  	// connection for the same tuple for at least 274s.
   260  	isn := isnHasher.Sum32() + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6)
   261  	return seqnum.Value(isn)
   262  }
   263  
   264  // effectiveRcvWndScale returns the effective receive window scale to be used.
   265  // If the peer doesn't support window scaling, the effective rcv wnd scale is
   266  // zero; otherwise it's the value calculated based on the initial rcv wnd.
   267  func (h *handshake) effectiveRcvWndScale() uint8 {
   268  	if h.sndWndScale < 0 {
   269  		return 0
   270  	}
   271  	return uint8(h.rcvWndScale)
   272  }
   273  
   274  // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
   275  // state.
   276  // +checklocks:h.ep.mu
   277  func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) {
   278  	h.active = false
   279  	h.state = handshakeSynRcvd
   280  	h.flags = header.TCPFlagSyn | header.TCPFlagAck
   281  	h.iss = iss
   282  	h.ackNum = irs + 1
   283  	h.mss = opts.MSS
   284  	h.sndWndScale = opts.WS
   285  	h.deferAccept = deferAccept
   286  	h.ep.setEndpointState(StateSynRecv)
   287  }
   288  
   289  // checkAck checks if the ACK number, if present, of a segment received during
   290  // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in
   291  // response.
   292  func (h *handshake) checkAck(s *segment) bool {
   293  	if s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1 {
   294  		// RFC 793, page 72 (https://datatracker.ietf.org/doc/html/rfc793#page-72):
   295  		//   If the segment acknowledgment is not acceptable, form a reset segment,
   296  		//        <SEQ=SEG.ACK><CTL=RST>
   297  		//   and send it.
   298  		h.ep.sendEmptyRaw(header.TCPFlagRst, s.ackNumber, 0, 0)
   299  		return false
   300  	}
   301  
   302  	return true
   303  }
   304  
// synSentState handles a segment received when the TCP 3-way handshake is in
// the SYN-SENT state.
//
// A non-nil error is returned only for a RST that acknowledges our SYN;
// segments that are ignored or that advance the handshake yield nil.
//
// +checklocks:h.ep.mu
func (h *handshake) synSentState(s *segment) tcpip.Error {
	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
	// acceptable if the ack field acknowledges the SYN.
	if s.flags.Contains(header.TCPFlagRst) {
		if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
			// was acceptable then signal the user "error: connection reset", drop
			// the segment, enter CLOSED state, delete TCB, and return."
			// Although the RFC above calls out ECONNRESET, Linux actually returns
			// ECONNREFUSED here so we do as well.
			return &tcpip.ErrConnectionRefused{}
		}
		return nil
	}

	// checkAck has already sent a RST if the ACK number was unacceptable.
	if !h.checkAck(s) {
		return nil
	}

	// We are in the SYN-SENT state. We only care about segments that have
	// the SYN flag.
	if !s.flags.Contains(header.TCPFlagSyn) {
		return nil
	}

	// Parse the SYN options.
	rcvSynOpts := parseSynSegmentOptions(s)

	// Remember if the Timestamp option was negotiated.
	h.ep.maybeEnableTimestamp(rcvSynOpts)

	// Remember if the SACKPermitted option was negotiated.
	h.ep.maybeEnableSACKPermitted(rcvSynOpts)

	// Remember the sequence we'll ack from now on.
	h.ackNum = s.sequenceNumber + 1
	h.flags |= header.TCPFlagAck
	h.mss = rcvSynOpts.MSS
	h.sndWndScale = rcvSynOpts.WS

	// If this is a SYN ACK response, we only need to acknowledge the SYN
	// and the handshake is completed.
	if s.flags.Contains(header.TCPFlagAck) {
		h.state = handshakeCompleted
		h.transitionToStateEstablishedLocked(s)

		// The advertised window is scaled down by the effective receive window
		// scale.
		h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
		return nil
	}

	// A SYN segment was received, but no ACK in it. We acknowledge the SYN
	// but resend our own SYN and wait for it to be acknowledged in the
	// SYN-RCVD state.
	h.state = handshakeSynRcvd
	ttl := calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit)
	amss := h.ep.amss
	h.ep.setEndpointState(StateSynRecv)
	synOpts := header.TCPSynOptions{
		WS:    int(h.effectiveRcvWndScale()),
		TS:    rcvSynOpts.TS,
		TSVal: h.ep.tsValNow(),
		TSEcr: h.ep.recentTimestamp(),

		// We only send SACKPermitted if the other side indicated it
		// permits SACK. This is not explicitly defined in the RFC but
		// this is the behaviour implemented by Linux.
		SACKPermitted: rcvSynOpts.SACKPermitted,
		MSS:           amss,
	}
	// Fall back to the route's default TTL when none was computed.
	if ttl == 0 {
		ttl = h.ep.route.DefaultTTL()
	}
	h.ep.sendSynTCP(h.ep.route, tcpFields{
		id:     h.ep.TransportEndpointInfo.ID,
		ttl:    ttl,
		tos:    h.ep.sendTOS,
		flags:  h.flags,
		seq:    h.iss,
		ack:    h.ackNum,
		rcvWnd: h.rcvWnd,
	}, synOpts)
	return nil
}
   391  
// synRcvdState handles a segment received when the TCP 3-way handshake is in
// the SYN-RCVD state.
//
// It returns ErrConnectionRefused for an in-window RST and
// ErrInvalidEndpointState when a passive handshake receives a SYN with a
// different sequence number; every other path returns nil.
//
// +checklocks:h.ep.mu
func (h *handshake) synRcvdState(s *segment) tcpip.Error {
	if s.flags.Contains(header.TCPFlagRst) {
		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
		// is acceptable if the sequence number is in the window.
		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
			return &tcpip.ErrConnectionRefused{}
		}
		return nil
	}

	// checkAck has already sent a RST if the ACK number was unacceptable.
	if !h.checkAck(s) {
		return nil
	}

	// RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a
	// sequence number outside of the window causes an ACK with the proper seq
	// number and "After sending the acknowledgment, drop the unacceptable
	// segment and return."
	if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
		if h.ep.allowOutOfWindowAck() {
			h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
		}
		return nil
	}

	if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
		// We received two SYN segments with different sequence
		// numbers, so we reset this and restart the whole
		// process, except that we don't reset the timer.
		ack := s.sequenceNumber.Add(s.logicalLen())
		seq := seqnum.Value(0)
		if s.flags.Contains(header.TCPFlagAck) {
			seq = s.ackNumber
		}
		h.ep.sendEmptyRaw(header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)

		// Only an active handshake is restarted; a passive one fails here.
		if !h.active {
			return &tcpip.ErrInvalidEndpointState{}
		}

		h.resetState()
		synOpts := header.TCPSynOptions{
			WS:            h.rcvWndScale,
			TS:            h.ep.SendTSOk,
			TSVal:         h.ep.tsValNow(),
			TSEcr:         h.ep.recentTimestamp(),
			SACKPermitted: h.ep.SACKPermitted,
			MSS:           h.ep.amss,
		}
		h.ep.sendSynTCP(h.ep.route, tcpFields{
			id:     h.ep.TransportEndpointInfo.ID,
			ttl:    calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit),
			tos:    h.ep.sendTOS,
			flags:  h.flags,
			seq:    h.iss,
			ack:    h.ackNum,
			rcvWnd: h.rcvWnd,
		}, synOpts)
		return nil
	}

	// We have previously received (and acknowledged) the peer's SYN. If the
	// peer acknowledges our SYN, the handshake is completed.
	if s.flags.Contains(header.TCPFlagAck) {
		// If deferAccept is not zero and this is a bare ACK and the
		// timeout is not hit then drop the ACK.
		if h.deferAccept != 0 && s.payloadSize() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept {
			// Remember the ACK so the retransmit handler stops resending the
			// SYN-ACK until the deferAccept period has elapsed.
			h.acked = true
			h.ep.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// If the timestamp option is negotiated and the segment does
		// not carry a timestamp option then the segment must be dropped
		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
		if h.ep.SendTSOk && !s.parsedOptions.TS {
			h.ep.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Drop the ACK if the accept queue is full.
		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523
		// We could abort the connection as well with a tunable as in
		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788
		if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() {
			listenEP.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Update timestamp if required. See RFC7323, section-4.3.
		if h.ep.SendTSOk && s.parsedOptions.TS {
			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
		}

		h.state = handshakeCompleted
		h.transitionToStateEstablishedLocked(s)

		// Requeue the segment if the ACK completing the handshake has more info
		// to be processed by the newly established endpoint.
		if (s.flags.Contains(header.TCPFlagFin) || s.payloadSize() > 0) && h.ep.enqueueSegment(s) {
			h.ep.protocol.dispatcher.selectProcessor(h.ep.ID).queueEndpoint(h.ep)

		}
		return nil
	}

	// A segment without ACK that passed the checks above is ignored.
	return nil
}
   503  
   504  // +checklocks:h.ep.mu
   505  func (h *handshake) handleSegment(s *segment) tcpip.Error {
   506  	h.sndWnd = s.window
   507  	if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 {
   508  		h.sndWnd <<= uint8(h.sndWndScale)
   509  	}
   510  
   511  	switch h.state {
   512  	case handshakeSynRcvd:
   513  		return h.synRcvdState(s)
   514  	case handshakeSynSent:
   515  		return h.synSentState(s)
   516  	}
   517  	return nil
   518  }
   519  
   520  // processSegments goes through the segment queue and processes up to
   521  // maxSegmentsPerWake (if they're available).
   522  // +checklocks:h.ep.mu
   523  func (h *handshake) processSegments() tcpip.Error {
   524  	for i := 0; i < maxSegmentsPerWake; i++ {
   525  		s := h.ep.segmentQueue.dequeue()
   526  		if s == nil {
   527  			return nil
   528  		}
   529  
   530  		err := h.handleSegment(s)
   531  		s.DecRef()
   532  		if err != nil {
   533  			return err
   534  		}
   535  
   536  		// We stop processing packets once the handshake is completed,
   537  		// otherwise we may process packets meant to be processed by
   538  		// the main protocol goroutine.
   539  		if h.state == handshakeCompleted {
   540  			break
   541  		}
   542  	}
   543  
   544  	return nil
   545  }
   546  
   547  // start sends the first SYN/SYN-ACK. It does not block, even if link address
   548  // resolution is required.
   549  func (h *handshake) start() {
   550  	h.startTime = h.ep.stack.Clock().NowMonotonic()
   551  	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
   552  	var sackEnabled tcpip.TCPSACKEnabled
   553  	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
   554  		// If stack returned an error when checking for SACKEnabled
   555  		// status then just default to switching off SACK negotiation.
   556  		sackEnabled = false
   557  	}
   558  
   559  	synOpts := header.TCPSynOptions{
   560  		WS:            h.rcvWndScale,
   561  		TS:            true,
   562  		TSVal:         h.ep.tsValNow(),
   563  		TSEcr:         h.ep.recentTimestamp(),
   564  		SACKPermitted: bool(sackEnabled),
   565  		MSS:           h.ep.amss,
   566  	}
   567  
   568  	// start() is also called in a listen context so we want to make sure we only
   569  	// send the TS/SACK option when we received the TS/SACK in the initial SYN.
   570  	if h.state == handshakeSynRcvd {
   571  		synOpts.TS = h.ep.SendTSOk
   572  		synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled)
   573  		if h.sndWndScale < 0 {
   574  			// Disable window scaling if the peer did not send us
   575  			// the window scaling option.
   576  			synOpts.WS = -1
   577  		}
   578  	}
   579  
   580  	h.sendSYNOpts = synOpts
   581  	h.ep.sendSynTCP(h.ep.route, tcpFields{
   582  		id:     h.ep.TransportEndpointInfo.ID,
   583  		ttl:    calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit),
   584  		tos:    h.ep.sendTOS,
   585  		flags:  h.flags,
   586  		seq:    h.iss,
   587  		ack:    h.ackNum,
   588  		rcvWnd: h.rcvWnd,
   589  	}, synOpts)
   590  }
   591  
   592  // retransmitHandler handles retransmissions of un-acked SYNs.
   593  // +checklocks:h.ep.mu
   594  func (h *handshake) retransmitHandlerLocked() tcpip.Error {
   595  	e := h.ep
   596  	// If the endpoint has already transition out of a connecting state due
   597  	// to say an error (e.g) peer send RST or an ICMP error. Then just
   598  	// return. Any required cleanup should have been done when the RST/error
   599  	// was handled.
   600  	if !e.EndpointState().connecting() {
   601  		return nil
   602  	}
   603  
   604  	if err := h.retransmitTimer.reset(); err != nil {
   605  		return err
   606  	}
   607  
   608  	// Resend the SYN/SYN-ACK only if the following conditions hold.
   609  	//  - It's an active handshake (deferAccept does not apply)
   610  	//  - It's a passive handshake and we have not yet got the final-ACK.
   611  	//  - It's a passive handshake and we got an ACK but deferAccept is
   612  	//    enabled and we are now past the deferAccept duration.
   613  	// The last is required to provide a way for the peer to complete
   614  	// the connection with another ACK or data (as ACKs are never
   615  	// retransmitted on their own).
   616  	if h.active || !h.acked || h.deferAccept != 0 && e.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept {
   617  		e.sendSynTCP(e.route, tcpFields{
   618  			id:     e.TransportEndpointInfo.ID,
   619  			ttl:    calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit),
   620  			tos:    e.sendTOS,
   621  			flags:  h.flags,
   622  			seq:    h.iss,
   623  			ack:    h.ackNum,
   624  			rcvWnd: h.rcvWnd,
   625  		}, h.sendSYNOpts)
   626  		// If we have ever retransmitted the SYN-ACK or
   627  		// SYN segment, we should only measure RTT if
   628  		// TS option is present.
   629  		h.sampleRTTWithTSOnly = true
   630  	}
   631  	return nil
   632  }
   633  
// transitionToStateEstablishedLocked transitions the endpoint of the handshake
// to an established state given the last segment received from peer. It also
// initializes sender/receiver.
//
// +checklocks:h.ep.mu
func (h *handshake) transitionToStateEstablishedLocked(s *segment) {
	// Stop the SYN retransmissions now that handshake is complete.
	if h.retransmitTimer != nil {
		h.retransmitTimer.stop()
	}

	// Transfer handshake state to TCP connection. We disable
	// receive window scaling if the peer doesn't support it
	// (indicated by a negative send window scale).
	h.ep.snd = newSender(h.ep, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)

	now := h.ep.stack.Clock().NowMonotonic()

	// Prefer an RTT sample derived from the echoed timestamp. Fall back to the
	// time since the first SYN/SYN-ACK only when that is still allowed
	// (sampleRTTWithTSOnly is false) and no timestamp-based sample was obtained.
	var rtt time.Duration
	if h.ep.SendTSOk && s.parsedOptions.TSEcr != 0 {
		rtt = h.ep.elapsed(now, s.parsedOptions.TSEcr)
	}
	if !h.sampleRTTWithTSOnly && rtt == 0 {
		rtt = now.Sub(h.startTime)
	}

	if rtt > 0 {
		h.ep.snd.updateRTO(rtt)
	}

	// The receiver is installed with rcvQueueMu held.
	h.ep.rcvQueueMu.Lock()
	h.ep.rcv = newReceiver(h.ep, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale())
	// Bootstrap the auto tuning algorithm. Starting at zero will
	// result in a really large receive window after the first auto
	// tuning adjustment.
	h.ep.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd)
	h.ep.rcvQueueMu.Unlock()

	h.ep.setEndpointState(StateEstablished)

	// Completing the 3-way handshake is an indication that the route is valid
	// and the remote is reachable as the only way we can complete a handshake
	// is if our SYN reached the remote and their ACK reached us.
	h.ep.route.ConfirmReachable()

	// Tell waiters that the endpoint is connected and writable.
	h.ep.waiterQueue.Notify(waiter.WritableEvents)
}
   681  
// backoffTimer is a timer whose timeout doubles every time it is reset, up to
// a fixed maximum.
type backoffTimer struct {
	// timeout is the current timeout; reset doubles it. maxTimeout is the
	// limit beyond which reset reports a timeout error instead of re-arming.
	// t is the underlying timer obtained from the clock.
	timeout    time.Duration
	maxTimeout time.Duration
	t          tcpip.Timer
}
   687  
   688  func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) {
   689  	if timeout > maxTimeout {
   690  		return nil, &tcpip.ErrTimeout{}
   691  	}
   692  	bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout}
   693  	bt.t = clock.AfterFunc(timeout, f)
   694  	return bt, nil
   695  }
   696  
   697  func (bt *backoffTimer) reset() tcpip.Error {
   698  	bt.timeout *= 2
   699  	if bt.timeout > bt.maxTimeout {
   700  		return &tcpip.ErrTimeout{}
   701  	}
   702  	bt.t.Reset(bt.timeout)
   703  	return nil
   704  }
   705  
// stop stops the underlying timer.
func (bt *backoffTimer) stop() {
	bt.t.Stop()
}
   709  
   710  func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
   711  	synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck))
   712  	if synOpts.TS {
   713  		s.parsedOptions.TSVal = synOpts.TSVal
   714  		s.parsedOptions.TSEcr = synOpts.TSEcr
   715  	}
   716  	return synOpts
   717  }
   718  
// optionPool recycles maxOptionSize-byte arrays used as scratch space for
// encoding TCP options; see getOptions and putOptions.
var optionPool = sync.Pool{
	New: func() any {
		return &[maxOptionSize]byte{}
	},
}
   724  
   725  func getOptions() []byte {
   726  	return (*optionPool.Get().(*[maxOptionSize]byte))[:]
   727  }
   728  
   729  func putOptions(options []byte) {
   730  	// Reslice to full capacity.
   731  	optionPool.Put(optionsToArray(options))
   732  }
   733  
   734  func makeSynOptions(opts header.TCPSynOptions) []byte {
   735  	// Emulate linux option order. This is as follows:
   736  	//
   737  	// if md5: NOP NOP MD5SIG 18 md5sig(16)
   738  	// if mss: MSS 4 mss(2)
   739  	// if ts and sack_advertise:
   740  	//	SACK 2 TIMESTAMP 2 timestamp(8)
   741  	// elif ts: NOP NOP TIMESTAMP 10 timestamp(8)
   742  	// elif sack: NOP NOP SACK 2
   743  	// if wscale: NOP WINDOW 3 ws(1)
   744  	// if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8))
   745  	//	[for each block] start_seq(4) end_seq(4)
   746  	// if fastopen_cookie:
   747  	//	if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2)
   748  	// 	else: FASTOPEN (2 + len(cookie))
   749  	//	cookie(variable) [padding to four bytes]
   750  	//
   751  	options := getOptions()
   752  
   753  	// Always encode the mss.
   754  	offset := header.EncodeMSSOption(uint32(opts.MSS), options)
   755  
   756  	// Special ordering is required here. If both TS and SACK are enabled,
   757  	// then the SACK option precedes TS, with no padding. If they are
   758  	// enabled individually, then we see padding before the option.
   759  	if opts.TS && opts.SACKPermitted {
   760  		offset += header.EncodeSACKPermittedOption(options[offset:])
   761  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   762  	} else if opts.TS {
   763  		offset += header.EncodeNOP(options[offset:])
   764  		offset += header.EncodeNOP(options[offset:])
   765  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   766  	} else if opts.SACKPermitted {
   767  		offset += header.EncodeNOP(options[offset:])
   768  		offset += header.EncodeNOP(options[offset:])
   769  		offset += header.EncodeSACKPermittedOption(options[offset:])
   770  	}
   771  
   772  	// Initialize the WS option.
   773  	if opts.WS >= 0 {
   774  		offset += header.EncodeNOP(options[offset:])
   775  		offset += header.EncodeWSOption(opts.WS, options[offset:])
   776  	}
   777  
   778  	// Padding to the end; note that this never apply unless we add a
   779  	// fastopen option, we always expect the offset to remain the same.
   780  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   781  		panic("unexpected option encoding")
   782  	}
   783  
   784  	return options[:offset]
   785  }
   786  
// tcpFields is a struct to carry different parameters required by the
// send*TCP variant functions below.
type tcpFields struct {
	// id supplies the local and remote ports written into the TCP header.
	// flags, seq and ack are the header's flags, sequence number and
	// acknowledgment number. rcvWnd is written into the 16-bit window field.
	// opts holds the already encoded TCP options, copied right after the
	// fixed header. txHash is filled in from the endpoint by endpoint.sendTCP.
	// NOTE(review): ttl and tos are presumably applied at the network layer;
	// their use is not visible in this part of the file.
	id     stack.TransportEndpointID
	ttl    uint8
	tos    uint8
	flags  header.TCPFlags
	seq    seqnum.Value
	ack    seqnum.Value
	rcvWnd seqnum.Size
	opts   []byte
	txHash uint32
}
   800  
   801  func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error {
   802  	tf.opts = makeSynOptions(opts)
   803  	// We ignore SYN send errors and let the callers re-attempt send.
   804  	p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + len(tf.opts)})
   805  	defer p.DecRef()
   806  	if err := e.sendTCP(r, tf, p, stack.GSO{}); err != nil {
   807  		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
   808  	}
   809  	putOptions(tf.opts)
   810  	return nil
   811  }
   812  
   813  // This method takes ownership of pkt.
   814  func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, pkt stack.PacketBufferPtr, gso stack.GSO) tcpip.Error {
   815  	tf.txHash = e.txHash
   816  	if err := sendTCP(r, tf, pkt, gso, e.owner); err != nil {
   817  		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
   818  		return err
   819  	}
   820  	e.stats.SegmentsSent.Increment()
   821  	return nil
   822  }
   823  
   824  func buildTCPHdr(r *stack.Route, tf tcpFields, pkt stack.PacketBufferPtr, gso stack.GSO) {
   825  	optLen := len(tf.opts)
   826  	tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen))
   827  	pkt.TransportProtocolNumber = header.TCPProtocolNumber
   828  	tcp.Encode(&header.TCPFields{
   829  		SrcPort:    tf.id.LocalPort,
   830  		DstPort:    tf.id.RemotePort,
   831  		SeqNum:     uint32(tf.seq),
   832  		AckNum:     uint32(tf.ack),
   833  		DataOffset: uint8(header.TCPMinimumSize + optLen),
   834  		Flags:      tf.flags,
   835  		WindowSize: uint16(tf.rcvWnd),
   836  	})
   837  	copy(tcp[header.TCPMinimumSize:], tf.opts)
   838  
   839  	xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size()))
   840  	// Only calculate the checksum if offloading isn't supported.
   841  	if gso.Type != stack.GSONone && gso.NeedsCsum {
   842  		// This is called CHECKSUM_PARTIAL in the Linux kernel. We
   843  		// calculate a checksum of the pseudo-header and save it in the
   844  		// TCP header, then the kernel calculate a checksum of the
   845  		// header and data and get the right sum of the TCP packet.
   846  		tcp.SetChecksum(xsum)
   847  	} else if r.RequiresTXTransportChecksum() {
   848  		xsum = checksum.Combine(xsum, pkt.Data().Checksum())
   849  		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
   850  	}
   851  }
   852  
   853  func sendTCPBatch(r *stack.Route, tf tcpFields, pkt stack.PacketBufferPtr, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error {
   854  	optLen := len(tf.opts)
   855  	if tf.rcvWnd > math.MaxUint16 {
   856  		tf.rcvWnd = math.MaxUint16
   857  	}
   858  
   859  	mss := int(gso.MSS)
   860  	n := (pkt.Data().Size() + mss - 1) / mss
   861  
   862  	size := pkt.Data().Size()
   863  	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
   864  	for i := 0; i < n; i++ {
   865  		packetSize := mss
   866  		if packetSize > size {
   867  			packetSize = size
   868  		}
   869  		size -= packetSize
   870  
   871  		pkt := pkt
   872  		// No need to split the packet in the final iteration. The original
   873  		// packet already has the truncated data.
   874  		shouldSplitPacket := i != n-1
   875  		if shouldSplitPacket {
   876  			splitPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: hdrSize})
   877  			splitPkt.Data().ReadFromPacketData(pkt.Data(), packetSize)
   878  			pkt = splitPkt
   879  		}
   880  		pkt.Hash = tf.txHash
   881  		pkt.Owner = owner
   882  
   883  		buildTCPHdr(r, tf, pkt, gso)
   884  		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
   885  		pkt.GSOOptions = gso
   886  		if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil {
   887  			r.Stats().TCP.SegmentSendErrors.Increment()
   888  			if shouldSplitPacket {
   889  				pkt.DecRef()
   890  			}
   891  			return err
   892  		}
   893  		r.Stats().TCP.SegmentsSent.Increment()
   894  		if shouldSplitPacket {
   895  			pkt.DecRef()
   896  		}
   897  	}
   898  	return nil
   899  }
   900  
   901  // sendTCP sends a TCP segment with the provided options via the provided
   902  // network endpoint and under the provided identity. This method takes
   903  // ownership of pkt.
   904  func sendTCP(r *stack.Route, tf tcpFields, pkt stack.PacketBufferPtr, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error {
   905  	if tf.rcvWnd > math.MaxUint16 {
   906  		tf.rcvWnd = math.MaxUint16
   907  	}
   908  
   909  	if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOGvisor && int(gso.MSS) < pkt.Data().Size() {
   910  		return sendTCPBatch(r, tf, pkt, gso, owner)
   911  	}
   912  
   913  	pkt.GSOOptions = gso
   914  	pkt.Hash = tf.txHash
   915  	pkt.Owner = owner
   916  	buildTCPHdr(r, tf, pkt, gso)
   917  
   918  	if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil {
   919  		r.Stats().TCP.SegmentSendErrors.Increment()
   920  		return err
   921  	}
   922  	r.Stats().TCP.SegmentsSent.Increment()
   923  	if (tf.flags & header.TCPFlagRst) != 0 {
   924  		r.Stats().TCP.ResetsSent.Increment()
   925  	}
   926  	return nil
   927  }
   928  
   929  // makeOptions makes an options slice.
   930  func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
   931  	options := getOptions()
   932  	offset := 0
   933  
   934  	// N.B. the ordering here matches the ordering used by Linux internally
   935  	// and described in the raw makeOptions function. We don't include
   936  	// unnecessary cases here (post connection.)
   937  	if e.SendTSOk {
   938  		// Embed the timestamp if timestamp has been enabled.
   939  		//
   940  		// We only use the lower 32 bits of the unix time in
   941  		// milliseconds. This is similar to what Linux does where it
   942  		// uses the lower 32 bits of the jiffies value in the tsVal
   943  		// field of the timestamp option.
   944  		//
   945  		// Further, RFC7323 section-5.4 recommends millisecond
   946  		// resolution as the lowest recommended resolution for the
   947  		// timestamp clock.
   948  		//
   949  		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
   950  		offset += header.EncodeNOP(options[offset:])
   951  		offset += header.EncodeNOP(options[offset:])
   952  		offset += header.EncodeTSOption(e.tsValNow(), e.recentTimestamp(), options[offset:])
   953  	}
   954  	if e.SACKPermitted && len(sackBlocks) > 0 {
   955  		offset += header.EncodeNOP(options[offset:])
   956  		offset += header.EncodeNOP(options[offset:])
   957  		offset += header.EncodeSACKBlocks(sackBlocks, options[offset:])
   958  	}
   959  
   960  	// We expect the above to produce an aligned offset.
   961  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   962  		panic("unexpected option encoding")
   963  	}
   964  
   965  	return options[:offset]
   966  }
   967  
   968  // sendEmptyRaw sends a TCP segment with no payload to the endpoint's peer.
   969  func (e *endpoint) sendEmptyRaw(flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error {
   970  	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{})
   971  	defer pkt.DecRef()
   972  	return e.sendRaw(pkt, flags, seq, ack, rcvWnd)
   973  }
   974  
   975  // sendRaw sends a TCP segment to the endpoint's peer. This method takes
   976  // ownership of pkt. pkt must not have any headers set.
   977  func (e *endpoint) sendRaw(pkt stack.PacketBufferPtr, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error {
   978  	var sackBlocks []header.SACKBlock
   979  	if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) {
   980  		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
   981  	}
   982  	options := e.makeOptions(sackBlocks)
   983  	defer putOptions(options)
   984  	pkt.ReserveHeaderBytes(header.TCPMinimumSize + int(e.route.MaxHeaderLength()) + len(options))
   985  	return e.sendTCP(e.route, tcpFields{
   986  		id:     e.TransportEndpointInfo.ID,
   987  		ttl:    calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit),
   988  		tos:    e.sendTOS,
   989  		flags:  flags,
   990  		seq:    seq,
   991  		ack:    ack,
   992  		rcvWnd: rcvWnd,
   993  		opts:   options,
   994  	}, pkt, e.gso)
   995  }
   996  
   997  // +checklocks:e.mu
   998  // +checklocksalias:e.snd.ep.mu=e.mu
   999  func (e *endpoint) sendData(next *segment) {
  1000  	// Initialize the next segment to write if it's currently nil.
  1001  	if e.snd.writeNext == nil {
  1002  		if next == nil {
  1003  			return
  1004  		}
  1005  		e.snd.updateWriteNext(next)
  1006  	}
  1007  
  1008  	// Push out any new packets.
  1009  	e.snd.sendData()
  1010  }
  1011  
  1012  // resetConnectionLocked puts the endpoint in an error state with the given
  1013  // error code and sends a RST if and only if the error is not ErrConnectionReset
  1014  // indicating that the connection is being reset due to receiving a RST. This
  1015  // method must only be called from the protocol goroutine.
  1016  // +checklocks:e.mu
  1017  func (e *endpoint) resetConnectionLocked(err tcpip.Error) {
  1018  	// Only send a reset if the connection is being aborted for a reason
  1019  	// other than receiving a reset.
  1020  	e.hardError = err
  1021  	switch err.(type) {
  1022  	case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout:
  1023  	default:
  1024  		// The exact sequence number to be used for the RST is the same as the
  1025  		// one used by Linux. We need to handle the case of window being shrunk
  1026  		// which can cause sndNxt to be outside the acceptable window on the
  1027  		// receiver.
  1028  		//
  1029  		// See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more
  1030  		// information.
  1031  		sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd)
  1032  		resetSeqNum := sndWndEnd
  1033  		if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) {
  1034  			resetSeqNum = e.snd.SndNxt
  1035  		}
  1036  		e.sendEmptyRaw(header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0)
  1037  	}
  1038  	// Don't purge read queues here. If there's buffered data, it's still allowed
  1039  	// to be read.
  1040  	e.purgeWriteQueue()
  1041  	e.purgePendingRcvQueue()
  1042  	e.cleanupLocked()
  1043  	e.setEndpointState(StateError)
  1044  }
  1045  
  1046  // transitionToStateCloseLocked ensures that the endpoint is
  1047  // cleaned up from the transport demuxer, "before" moving to
  1048  // StateClose. This will ensure that no packet will be
  1049  // delivered to this endpoint from the demuxer when the endpoint
  1050  // is transitioned to StateClose.
  1051  // +checklocks:e.mu
  1052  func (e *endpoint) transitionToStateCloseLocked() {
  1053  	s := e.EndpointState()
  1054  	if s == StateClose {
  1055  		return
  1056  	}
  1057  
  1058  	if s.connected() {
  1059  		e.stack.Stats().TCP.EstablishedClosed.Increment()
  1060  	}
  1061  
  1062  	e.cleanupLocked()
  1063  	// Mark the endpoint as fully closed for reads/writes.
  1064  	e.setEndpointState(StateClose)
  1065  }
  1066  
  1067  // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
  1068  // segment to any other endpoint other than the current one. This is called
  1069  // only when the endpoint is in StateClose and we want to deliver the segment
  1070  // to any other listening endpoint. We reply with RST if we cannot find one.
  1071  func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
  1072  	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.pkt.NICID)
  1073  	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != (tcpip.Address{}) {
  1074  		// Dual-stack socket, try IPv4.
  1075  		ep = e.stack.FindTransportEndpoint(
  1076  			header.IPv4ProtocolNumber,
  1077  			e.TransProto,
  1078  			e.TransportEndpointInfo.ID,
  1079  			s.pkt.NICID,
  1080  		)
  1081  	}
  1082  	if ep == nil {
  1083  		if !s.flags.Contains(header.TCPFlagRst) {
  1084  			replyWithReset(e.stack, s, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit)
  1085  		}
  1086  		return
  1087  	}
  1088  
  1089  	if e == ep {
  1090  		panic(fmt.Sprintf("current endpoint not removed from demuxer, enqueing segments to itself, endpoint in state %v", e.EndpointState()))
  1091  	}
  1092  
  1093  	if ep := ep.(*endpoint); ep.enqueueSegment(s) {
  1094  		ep.notifyProcessor()
  1095  	}
  1096  }
  1097  
  1098  // Drain segment queue from the endpoint and try to re-match the segment to a
  1099  // different endpoint. This is used when the current endpoint is transitioned to
  1100  // StateClose and has been unregistered from the transport demuxer.
  1101  func (e *endpoint) drainClosingSegmentQueue() {
  1102  	for {
  1103  		s := e.segmentQueue.dequeue()
  1104  		if s == nil {
  1105  			break
  1106  		}
  1107  
  1108  		e.tryDeliverSegmentFromClosedEndpoint(s)
  1109  		s.DecRef()
  1110  	}
  1111  }
  1112  
  1113  // +checklocks:e.mu
  1114  func (e *endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) {
  1115  	if e.rcv.acceptable(s.sequenceNumber, 0) {
  1116  		// RFC 793, page 37 states that "in all states
  1117  		// except SYN-SENT, all reset (RST) segments are
  1118  		// validated by checking their SEQ-fields." So
  1119  		// we only process it if it's acceptable.
  1120  		switch e.EndpointState() {
  1121  		// In case of a RST in CLOSE-WAIT linux moves
  1122  		// the socket to closed state with an error set
  1123  		// to indicate EPIPE.
  1124  		//
  1125  		// Technically this seems to be at odds w/ RFC.
  1126  		// As per https://tools.ietf.org/html/rfc793#section-2.7
  1127  		// page 69 the behavior for a segment arriving
  1128  		// w/ RST bit set in CLOSE-WAIT is inlined below.
  1129  		//
  1130  		//  ESTABLISHED
  1131  		//  FIN-WAIT-1
  1132  		//  FIN-WAIT-2
  1133  		//  CLOSE-WAIT
  1134  
  1135  		//  If the RST bit is set then, any outstanding RECEIVEs and
  1136  		//  SEND should receive "reset" responses. All segment queues
  1137  		//  should be flushed.  Users should also receive an unsolicited
  1138  		//  general "connection reset" signal. Enter the CLOSED state,
  1139  		//  delete the TCB, and return.
  1140  		case StateCloseWait:
  1141  			e.transitionToStateCloseLocked()
  1142  			e.hardError = &tcpip.ErrAborted{}
  1143  			return false, nil
  1144  		default:
  1145  			// RFC 793, page 37 states that "in all states
  1146  			// except SYN-SENT, all reset (RST) segments are
  1147  			// validated by checking their SEQ-fields." So
  1148  			// we only process it if it's acceptable.
  1149  
  1150  			// Notify protocol goroutine. This is required when
  1151  			// handleSegment is invoked from the processor goroutine
  1152  			// rather than the worker goroutine.
  1153  			return false, &tcpip.ErrConnectionReset{}
  1154  		}
  1155  	}
  1156  	return true, nil
  1157  }
  1158  
  1159  // handleSegments processes all inbound segments.
  1160  //
  1161  // +checklocks:e.mu
  1162  // +checklocksalias:e.snd.ep.mu=e.mu
  1163  func (e *endpoint) handleSegmentsLocked() tcpip.Error {
  1164  	sndUna := e.snd.SndUna
  1165  	for i := 0; i < maxSegmentsPerWake; i++ {
  1166  		if state := e.EndpointState(); state.closed() || state == StateTimeWait || state == StateError {
  1167  			return nil
  1168  		}
  1169  		s := e.segmentQueue.dequeue()
  1170  		if s == nil {
  1171  			break
  1172  		}
  1173  		cont, err := e.handleSegmentLocked(s)
  1174  		s.DecRef()
  1175  		if err != nil {
  1176  			return err
  1177  		}
  1178  		if !cont {
  1179  			return nil
  1180  		}
  1181  	}
  1182  
  1183  	// The remote ACK-ing at least 1 byte is an indication that we have a
  1184  	// full-duplex connection to the remote as the only way we will receive an
  1185  	// ACK is if the remote received data that we previously sent.
  1186  	//
  1187  	// As of writing, Linux seems to only confirm a route as reachable when
  1188  	// forward progress is made which is indicated by an ACK that removes data
  1189  	// from the retransmit queue, i.e. sender makes forward progress.
  1190  	if sndUna.LessThan(e.snd.SndUna) {
  1191  		e.route.ConfirmReachable()
  1192  	}
  1193  
  1194  	// Send an ACK for all processed packets if needed.
  1195  	if e.rcv.RcvNxt != e.snd.MaxSentAck {
  1196  		e.snd.sendAck()
  1197  	}
  1198  
  1199  	e.resetKeepaliveTimer(true /* receivedData */)
  1200  
  1201  	return nil
  1202  }
  1203  
  1204  // +checklocks:e.mu
  1205  func (e *endpoint) probeSegmentLocked() {
  1206  	if fn := e.probe; fn != nil {
  1207  		var state stack.TCPEndpointState
  1208  		e.completeStateLocked(&state)
  1209  		fn(&state)
  1210  	}
  1211  }
  1212  
  1213  // handleSegment handles a given segment and notifies the worker goroutine if
  1214  // if the connection should be terminated.
  1215  //
  1216  // +checklocks:e.mu
  1217  // +checklocksalias:e.rcv.ep.mu=e.mu
  1218  // +checklocksalias:e.snd.ep.mu=e.mu
  1219  func (e *endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) {
  1220  	// Invoke the tcp probe if installed. The tcp probe function will update
  1221  	// the TCPEndpointState after the segment is processed.
  1222  	defer e.probeSegmentLocked()
  1223  
  1224  	if s.flags.Contains(header.TCPFlagRst) {
  1225  		if ok, err := e.handleReset(s); !ok {
  1226  			return false, err
  1227  		}
  1228  	} else if s.flags.Contains(header.TCPFlagSyn) {
  1229  		// See: https://tools.ietf.org/html/rfc5961#section-4.1
  1230  		//   1) If the SYN bit is set, irrespective of the sequence number, TCP
  1231  		//    MUST send an ACK (also referred to as challenge ACK) to the remote
  1232  		//    peer:
  1233  		//
  1234  		//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
  1235  		//
  1236  		//    After sending the acknowledgment, TCP MUST drop the unacceptable
  1237  		//    segment and stop processing further.
  1238  		//
  1239  		// By sending an ACK, the remote peer is challenged to confirm the loss
  1240  		// of the previous connection and the request to start a new connection.
  1241  		// A legitimate peer, after restart, would not have a TCB in the
  1242  		// synchronized state.  Thus, when the ACK arrives, the peer should send
  1243  		// a RST segment back with the sequence number derived from the ACK
  1244  		// field that caused the RST.
  1245  
  1246  		// This RST will confirm that the remote peer has indeed closed the
  1247  		// previous connection.  Upon receipt of a valid RST, the local TCP
  1248  		// endpoint MUST terminate its connection.  The local TCP endpoint
  1249  		// should then rely on SYN retransmission from the remote end to
  1250  		// re-establish the connection.
  1251  		e.snd.maybeSendOutOfWindowAck(s)
  1252  	} else if s.flags.Contains(header.TCPFlagAck) {
  1253  		// Patch the window size in the segment according to the
  1254  		// send window scale.
  1255  		s.window <<= e.snd.SndWndScale
  1256  
  1257  		// RFC 793, page 41 states that "once in the ESTABLISHED
  1258  		// state all segments must carry current acknowledgment
  1259  		// information."
  1260  		drop, err := e.rcv.handleRcvdSegment(s)
  1261  		if err != nil {
  1262  			return false, err
  1263  		}
  1264  		if drop {
  1265  			return true, nil
  1266  		}
  1267  
  1268  		// Now check if the received segment has caused us to transition
  1269  		// to a CLOSED state, if yes then terminate processing and do
  1270  		// not invoke the sender.
  1271  		state := e.EndpointState()
  1272  		if state == StateClose {
  1273  			// When we get into StateClose while processing from the queue,
  1274  			// return immediately and let the protocolMainloop handle it.
  1275  			//
  1276  			// We can reach StateClose only while processing a previous segment
  1277  			// or a notification from the protocolMainLoop (caller goroutine).
  1278  			// This means that with this return, the segment dequeue below can
  1279  			// never occur on a closed endpoint.
  1280  			return false, nil
  1281  		}
  1282  
  1283  		e.snd.handleRcvdSegment(s)
  1284  	}
  1285  
  1286  	return true, nil
  1287  }
  1288  
  1289  // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
  1290  // keepalive packets periodically when the connection is idle. If we don't hear
  1291  // from the other side after a number of tries, we terminate the connection.
  1292  // +checklocks:e.mu
  1293  // +checklocksalias:e.snd.ep.mu=e.mu
  1294  func (e *endpoint) keepaliveTimerExpired() tcpip.Error {
  1295  	userTimeout := e.userTimeout
  1296  
  1297  	e.keepalive.Lock()
  1298  	if !e.SocketOptions().GetKeepAlive() || e.keepalive.timer.isZero() || !e.keepalive.timer.checkExpiration() {
  1299  		e.keepalive.Unlock()
  1300  		return nil
  1301  	}
  1302  
  1303  	// If a userTimeout is set then abort the connection if it is
  1304  	// exceeded.
  1305  	if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 {
  1306  		e.keepalive.Unlock()
  1307  		e.stack.Stats().TCP.EstablishedTimedout.Increment()
  1308  		return &tcpip.ErrTimeout{}
  1309  	}
  1310  
  1311  	if e.keepalive.unacked >= e.keepalive.count {
  1312  		e.keepalive.Unlock()
  1313  		e.stack.Stats().TCP.EstablishedTimedout.Increment()
  1314  		return &tcpip.ErrTimeout{}
  1315  	}
  1316  
  1317  	// RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with
  1318  	// seg.seq = snd.nxt-1.
  1319  	e.keepalive.unacked++
  1320  	e.keepalive.Unlock()
  1321  	e.snd.sendEmptySegment(header.TCPFlagAck, e.snd.SndNxt-1)
  1322  	e.resetKeepaliveTimer(false)
  1323  	return nil
  1324  }
  1325  
  1326  // resetKeepaliveTimer restarts or stops the keepalive timer, depending on
  1327  // whether it is enabled for this endpoint.
  1328  func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
  1329  	e.keepalive.Lock()
  1330  	defer e.keepalive.Unlock()
  1331  	if e.keepalive.timer.isZero() {
  1332  		if state := e.EndpointState(); !state.closed() {
  1333  			panic(fmt.Sprintf("Unexpected state when the keepalive time is cleaned up, got %s, want %s or %s", state, StateClose, StateError))
  1334  		}
  1335  		return
  1336  	}
  1337  	if receivedData {
  1338  		e.keepalive.unacked = 0
  1339  	}
  1340  	// Start the keepalive timer IFF it's enabled and there is no pending
  1341  	// data to send.
  1342  	if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt {
  1343  		e.keepalive.timer.disable()
  1344  		return
  1345  	}
  1346  	if e.keepalive.unacked > 0 {
  1347  		e.keepalive.timer.enable(e.keepalive.interval)
  1348  	} else {
  1349  		e.keepalive.timer.enable(e.keepalive.idle)
  1350  	}
  1351  }
  1352  
  1353  // disableKeepaliveTimer stops the keepalive timer.
  1354  func (e *endpoint) disableKeepaliveTimer() {
  1355  	e.keepalive.Lock()
  1356  	e.keepalive.timer.disable()
  1357  	e.keepalive.Unlock()
  1358  }
  1359  
  1360  // finWait2TimerExpired is called when the FIN-WAIT-2 timeout is hit
  1361  // and the peer hasn't sent us a FIN.
  1362  func (e *endpoint) finWait2TimerExpired() {
  1363  	e.mu.Lock()
  1364  	e.transitionToStateCloseLocked()
  1365  	e.mu.Unlock()
  1366  	e.drainClosingSegmentQueue()
  1367  	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  1368  }
  1369  
  1370  // +checklocks:e.mu
  1371  func (e *endpoint) handshakeFailed(err tcpip.Error) {
  1372  	e.lastErrorMu.Lock()
  1373  	e.lastError = err
  1374  	e.lastErrorMu.Unlock()
  1375  	// handshakeFailed is also called from startHandshake when a listener
  1376  	// transitions out of Listen state by the time the SYN is processed. In
  1377  	// such cases the handshake is never initialized and the newly created
  1378  	// endpoint is closed right away.
  1379  	if e.h != nil && e.h.retransmitTimer != nil {
  1380  		e.h.retransmitTimer.stop()
  1381  	}
  1382  	e.hardError = err
  1383  	e.cleanupLocked()
  1384  	e.setEndpointState(StateError)
  1385  }
  1386  
  1387  // handleTimeWaitSegments processes segments received during TIME_WAIT
  1388  // state.
  1389  // +checklocks:e.mu
  1390  // +checklocksalias:e.rcv.ep.mu=e.mu
  1391  func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
  1392  	for i := 0; i < maxSegmentsPerWake; i++ {
  1393  		s := e.segmentQueue.dequeue()
  1394  		if s == nil {
  1395  			break
  1396  		}
  1397  		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
  1398  		if newSyn {
  1399  			info := e.TransportEndpointInfo
  1400  			newID := info.ID
  1401  			newID.RemoteAddress = tcpip.Address{}
  1402  			newID.RemotePort = 0
  1403  			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
  1404  			// If the local address is an IPv4 address then also
  1405  			// look for IPv6 dual stack endpoints that might be
  1406  			// listening on the local address.
  1407  			if newID.LocalAddress.To4() != (tcpip.Address{}) {
  1408  				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
  1409  			}
  1410  			for _, netProto := range netProtos {
  1411  				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.pkt.NICID); listenEP != nil {
  1412  					tcpEP := listenEP.(*endpoint)
  1413  					if EndpointState(tcpEP.State()) == StateListen {
  1414  						reuseTW = func() {
  1415  							if !tcpEP.enqueueSegment(s) {
  1416  								return
  1417  							}
  1418  							tcpEP.notifyProcessor()
  1419  							s.DecRef()
  1420  						}
  1421  						// We explicitly do not DecRef the segment as it's still valid and
  1422  						// being reflected to a listening endpoint.
  1423  						return false, reuseTW
  1424  					}
  1425  				}
  1426  			}
  1427  		}
  1428  		if extTW {
  1429  			extendTimeWait = true
  1430  		}
  1431  		s.DecRef()
  1432  	}
  1433  	return extendTimeWait, nil
  1434  }
  1435  
  1436  // +checklocks:e.mu
  1437  func (e *endpoint) getTimeWaitDuration() time.Duration {
  1438  	timeWaitDuration := DefaultTCPTimeWaitTimeout
  1439  
  1440  	// Get the stack wide configuration.
  1441  	var tcpTW tcpip.TCPTimeWaitTimeoutOption
  1442  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
  1443  		timeWaitDuration = time.Duration(tcpTW)
  1444  	}
  1445  	return timeWaitDuration
  1446  }
  1447  
  1448  // timeWaitTimerExpired is called when an endpoint completes the required time
  1449  // (typically 2 * MSL unless configured to something else at a stack level) in
  1450  // TIME-WAIT state.
  1451  func (e *endpoint) timeWaitTimerExpired() {
  1452  	e.mu.Lock()
  1453  	if e.EndpointState() != StateTimeWait {
  1454  		e.mu.Unlock()
  1455  		return
  1456  	}
  1457  	e.transitionToStateCloseLocked()
  1458  	e.mu.Unlock()
  1459  	e.drainClosingSegmentQueue()
  1460  	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  1461  }
  1462  
  1463  // notifyProcessor queues this endpoint for processing to its TCP processor.
  1464  func (e *endpoint) notifyProcessor() {
  1465  	// We use TryLock here to avoid deadlocks in cases where a listening endpoint that is being
  1466  	// closed tries to abort half completed connections which in turn try to queue any segments
  1467  	// queued to that endpoint back to the same listening endpoint (because it may have got
  1468  	// segments that matched its id but were either a RST or a new SYN which must be handled
  1469  	// by a listening endpoint). In such cases the Close() on the listening endpoint will handle
  1470  	// any queued segments after it releases the lock.
  1471  	if !e.mu.TryLock() {
  1472  		return
  1473  	}
  1474  	processor := e.protocol.dispatcher.selectProcessor(e.ID)
  1475  	e.mu.Unlock()
  1476  	processor.queueEndpoint(e)
  1477  }