gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/connect.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"crypto/sha256"
    19  	"encoding/binary"
    20  	"fmt"
    21  	"math"
    22  	"time"
    23  
    24  	"gvisor.dev/gvisor/pkg/sync"
    25  	"gvisor.dev/gvisor/pkg/tcpip"
    26  	"gvisor.dev/gvisor/pkg/tcpip/checksum"
    27  	"gvisor.dev/gvisor/pkg/tcpip/header"
    28  	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
    29  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    30  	"gvisor.dev/gvisor/pkg/waiter"
    31  )
    32  
// InitialRTO is the initial retransmission timeout. It is also used as the
// starting interval of the handshake's SYN/SYN-ACK retransmit timer.
// https://github.com/torvalds/linux/blob/7c636d4d20f/include/net/tcp.h#L142
const InitialRTO = time.Second

// maxSegmentsPerWake is the maximum number of segments to process in the main
// protocol goroutine per wake-up. Yielding [after this number of segments are
// processed] allows other events to be processed as well (e.g., timeouts,
// resets, etc.).
const maxSegmentsPerWake = 100

// handshakeState identifies how far a 3-way handshake has progressed.
type handshakeState int

// The following are the possible states of the TCP connection during a 3-way
// handshake. A depiction of the states and transitions can be found in RFC 793,
// page 23.
const (
	// handshakeSynSent is the state a freshly reset handshake starts in: our
	// SYN is outstanding and the peer's SYN has not been processed yet.
	handshakeSynSent handshakeState = iota
	// handshakeSynRcvd means the peer's SYN has been received and we are
	// waiting for our own SYN to be acknowledged.
	handshakeSynRcvd
	// handshakeCompleted means the endpoint has been moved to the established
	// state; no further segments are handled by the handshake.
	handshakeCompleted
)

const (
	// Maximum space available for options. This is also the size of the
	// scratch buffers handed out by optionPool.
	maxOptionSize = 40
)
    58  
// handshake holds the state used during a TCP 3-way handshake.
//
// NOTE: handshake.ep.mu is held during handshake processing. It is released if
// we are going to block and reacquired when we start processing an event.
//
// +stateify savable
type handshake struct {
	// ep is the endpoint this handshake is being performed for.
	ep *Endpoint
	// listenEP, when non-nil, is consulted in the SYN-RCVD state to validate
	// SYN cookies and to check whether the accept queue is full.
	listenEP *Endpoint
	// state is the current stage of the handshake.
	state handshakeState
	// active is true for handshakes created by newHandshake and is cleared
	// when the handshake is reset to the SYN-RCVD state (passive open).
	active bool
	// flags are the TCP flags placed on the SYN/SYN-ACK segments we send.
	flags header.TCPFlags
	// ackNum is the acknowledgment number we send; once the peer's SYN has
	// been seen it is the peer's initial sequence number plus one.
	ackNum seqnum.Value

	// iss is the initial send sequence number, as defined in RFC 793.
	iss seqnum.Value

	// rcvWnd is the receive window, as defined in RFC 793.
	rcvWnd seqnum.Size

	// sndWnd is the send window, as defined in RFC 793.
	sndWnd seqnum.Size

	// mss is the maximum segment size received from the peer.
	mss uint16

	// sndWndScale is the send window scale, as defined in RFC 1323. A
	// negative value means no scaling is supported by the peer.
	sndWndScale int

	// rcvWndScale is the receive window scale, as defined in RFC 1323.
	rcvWndScale int

	// startTime is the time at which the first SYN/SYN-ACK was sent.
	startTime tcpip.MonotonicTime

	// deferAccept if non-zero will drop the final ACK for a passive
	// handshake till an ACK segment with data is received or the timeout is
	// hit.
	deferAccept time.Duration

	// acked is true if the final ACK for a 3-way handshake has
	// been received. This is required to stop retransmitting the
	// original SYN-ACK when deferAccept is enabled.
	acked bool

	// sendSYNOpts is the cached values for the SYN options to be sent.
	sendSYNOpts header.TCPSynOptions

	// sampleRTTWithTSOnly is true when the segment was retransmitted or we can't
	// tell; then RTT can only be sampled when the incoming segment has timestamp
	// options enabled.
	sampleRTTWithTSOnly bool

	// retransmitTimer is used to retransmit SYN/SYN-ACK with exponential backoff
	// until the handshake either completes or times out.
	retransmitTimer *backoffTimer `state:"nosave"`
}
   117  
   118  // timerHandler takes a handler function for a timer and returns a function that
   119  // will invoke the provided handler with the endpoint mutex held. In addition
   120  // the returned function will perform any cleanup that may be required if the
   121  // timer handler returns an error. In the case of no errors it will notify the
   122  // processor if there are pending segments that need to be processed.
   123  //
   124  // NOTE: e.mu is held for the duration of the call to f().
   125  func timerHandler(e *Endpoint, f func() tcpip.Error) func() {
   126  	return func() {
   127  		e.mu.Lock()
   128  		if err := f(); err != nil {
   129  			e.lastErrorMu.Lock()
   130  			// If the handler timed out and we have a lastError recorded (maybe due
   131  			// to an ICMP message received), promote it to be the hard error.
   132  			if _, isTimeout := err.(*tcpip.ErrTimeout); e.lastError != nil && isTimeout {
   133  				e.hardError = e.lastError
   134  			} else {
   135  				e.hardError = err
   136  			}
   137  			e.lastError = err
   138  			e.lastErrorMu.Unlock()
   139  			e.cleanupLocked()
   140  			e.setEndpointState(StateError)
   141  			e.mu.Unlock()
   142  			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
   143  			return
   144  		}
   145  		processor := e.protocol.dispatcher.selectProcessor(e.ID)
   146  		e.mu.Unlock()
   147  
   148  		// notify processor if there are pending segments to be
   149  		// processed.
   150  		if !e.segmentQueue.empty() {
   151  			processor.queueEndpoint(e)
   152  		}
   153  	}
   154  }
   155  
   156  // +checklocks:e.mu
   157  // +checklocksacquire:h.ep.mu
   158  func (e *Endpoint) newHandshake() (h *handshake) {
   159  	h = &handshake{
   160  		ep:          e,
   161  		active:      true,
   162  		rcvWnd:      seqnum.Size(e.initialReceiveWindow()),
   163  		rcvWndScale: e.rcvWndScaleForHandshake(),
   164  	}
   165  	h.ep.AssertLockHeld(e)
   166  	h.resetState()
   167  	// Store reference to handshake state in endpoint.
   168  	e.h = h
   169  	// By the time handshake is created, e.ID is already initialized.
   170  	e.TSOffset = e.protocol.tsOffset(e.ID.LocalAddress, e.ID.RemoteAddress)
   171  	timer, err := newBackoffTimer(h.ep.stack.Clock(), InitialRTO, MaxRTO, timerHandler(e, h.retransmitHandlerLocked))
   172  	if err != nil {
   173  		panic(fmt.Sprintf("newBackOffTimer(_, %s, %s, _) failed: %s", InitialRTO, MaxRTO, err))
   174  	}
   175  	h.retransmitTimer = timer
   176  	return h
   177  }
   178  
// newPassiveHandshake creates a handshake for e via newHandshake and then
// immediately resets it to the SYN-RCVD state using the supplied initial
// send sequence number (isn), the peer's initial sequence number (irs), the
// options parsed from the peer's SYN and the deferAccept duration.
//
// +checklocks:e.mu
// +checklocksacquire:h.ep.mu
func (e *Endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) (h *handshake) {
	h = e.newHandshake()
	h.resetToSynRcvd(isn, irs, opts, deferAccept)
	return h
}
   186  
   187  // FindWndScale determines the window scale to use for the given maximum window
   188  // size.
   189  func FindWndScale(wnd seqnum.Size) int {
   190  	if wnd < 0x10000 {
   191  		return 0
   192  	}
   193  
   194  	max := seqnum.Size(math.MaxUint16)
   195  	s := 0
   196  	for wnd > max && s < header.MaxWndScale {
   197  		s++
   198  		max <<= 1
   199  	}
   200  
   201  	return s
   202  }
   203  
   204  // resetState resets the state of the handshake object such that it becomes
   205  // ready for a new 3-way handshake.
   206  func (h *handshake) resetState() {
   207  	h.state = handshakeSynSent
   208  	h.flags = header.TCPFlagSyn
   209  	h.ackNum = 0
   210  	h.mss = 0
   211  	h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.protocol.seqnumSecret)
   212  }
   213  
   214  // generateSecureISN generates a secure Initial Sequence number based on the
   215  // recommendation here https://tools.ietf.org/html/rfc6528#page-3.
   216  func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed [16]byte) seqnum.Value {
   217  	isnHasher := sha256.New()
   218  
   219  	// Per hash.Hash.Writer:
   220  	//
   221  	// It never returns an error.
   222  	_, _ = isnHasher.Write(seed[:])
   223  	_, _ = isnHasher.Write(id.LocalAddress.AsSlice())
   224  	_, _ = isnHasher.Write(id.RemoteAddress.AsSlice())
   225  	portBuf := make([]byte, 2)
   226  	binary.LittleEndian.PutUint16(portBuf, id.LocalPort)
   227  	_, _ = isnHasher.Write(portBuf)
   228  	binary.LittleEndian.PutUint16(portBuf, id.RemotePort)
   229  	_, _ = isnHasher.Write(portBuf)
   230  	// The time period here is 64ns. This is similar to what linux uses
   231  	// generate a sequence number that overlaps less than one
   232  	// time per MSL (2 minutes).
   233  	//
   234  	// A 64ns clock ticks 10^9/64 = 15625000) times in a second.
   235  	// To wrap the whole 32 bit space would require
   236  	// 2^32/1562500 ~ 274 seconds.
   237  	//
   238  	// Which sort of guarantees that we won't reuse the ISN for a new
   239  	// connection for the same tuple for at least 274s.
   240  	hash := binary.LittleEndian.Uint32(isnHasher.Sum(nil)[:4])
   241  	isn := hash + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6)
   242  	return seqnum.Value(isn)
   243  }
   244  
   245  // effectiveRcvWndScale returns the effective receive window scale to be used.
   246  // If the peer doesn't support window scaling, the effective rcv wnd scale is
   247  // zero; otherwise it's the value calculated based on the initial rcv wnd.
   248  func (h *handshake) effectiveRcvWndScale() uint8 {
   249  	if h.sndWndScale < 0 {
   250  		return 0
   251  	}
   252  	return uint8(h.rcvWndScale)
   253  }
   254  
   255  // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
   256  // state.
   257  // +checklocks:h.ep.mu
   258  func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) {
   259  	h.active = false
   260  	h.state = handshakeSynRcvd
   261  	h.flags = header.TCPFlagSyn | header.TCPFlagAck
   262  	h.iss = iss
   263  	h.ackNum = irs + 1
   264  	h.mss = opts.MSS
   265  	h.sndWndScale = opts.WS
   266  	h.deferAccept = deferAccept
   267  	h.ep.setEndpointState(StateSynRecv)
   268  }
   269  
   270  // checkAck checks if the ACK number, if present, of a segment received during
   271  // a TCP 3-way handshake is valid.
   272  func (h *handshake) checkAck(s *segment) bool {
   273  	return !(s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1)
   274  }
   275  
// synSentState handles a segment received when the TCP 3-way handshake is in
// the SYN-SENT state.
//
// It returns ErrConnectionRefused when the segment is a RST that acknowledges
// our SYN, and nil in every other case. A SYN-ACK completes the handshake; a
// bare SYN (simultaneous open) moves the handshake to the SYN-RCVD state and
// sends a SYN-ACK.
// +checklocks:h.ep.mu
func (h *handshake) synSentState(s *segment) tcpip.Error {
	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
	// acceptable if the ack field acknowledges the SYN.
	if s.flags.Contains(header.TCPFlagRst) {
		if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
			// was acceptable then signal the user "error: connection reset", drop
			// the segment, enter CLOSED state, delete TCB, and return."
			// Although the RFC above calls out ECONNRESET, Linux actually returns
			// ECONNREFUSED here so we do as well.
			return &tcpip.ErrConnectionRefused{}
		}
		// Any other reset is ignored.
		return nil
	}

	if !h.checkAck(s) {
		// RFC 793, page 72 (https://datatracker.ietf.org/doc/html/rfc793#page-72):
		//   If the segment acknowledgment is not acceptable, form a reset segment,
		//        <SEQ=SEG.ACK><CTL=RST>
		//   and send it.
		h.ep.sendEmptyRaw(header.TCPFlagRst, s.ackNumber, 0, 0)
		return nil
	}

	// We are in the SYN-SENT state. We only care about segments that have
	// the SYN flag.
	if !s.flags.Contains(header.TCPFlagSyn) {
		return nil
	}

	// Parse the SYN options.
	rcvSynOpts := parseSynSegmentOptions(s)

	// Remember if the Timestamp option was negotiated.
	h.ep.maybeEnableTimestamp(rcvSynOpts)

	// Remember if the SACKPermitted option was negotiated.
	h.ep.maybeEnableSACKPermitted(rcvSynOpts)

	// Remember the sequence we'll ack from now on.
	h.ackNum = s.sequenceNumber + 1
	h.flags |= header.TCPFlagAck
	h.mss = rcvSynOpts.MSS
	h.sndWndScale = rcvSynOpts.WS

	// If this is a SYN ACK response, we only need to acknowledge the SYN
	// and the handshake is completed.
	if s.flags.Contains(header.TCPFlagAck) {
		h.state = handshakeCompleted
		h.transitionToStateEstablishedLocked(s)

		// The window sent in the final ACK is already scaled down by the
		// effective receive window scale.
		h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
		return nil
	}

	// A SYN segment was received, but no ACK in it. We acknowledge the SYN
	// but resend our own SYN and wait for it to be acknowledged in the
	// SYN-RCVD state.
	h.state = handshakeSynRcvd
	ttl := calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit)
	amss := h.ep.amss
	h.ep.setEndpointState(StateSynRecv)
	synOpts := header.TCPSynOptions{
		WS:    int(h.effectiveRcvWndScale()),
		TS:    rcvSynOpts.TS,
		TSVal: h.ep.tsValNow(),
		TSEcr: h.ep.recentTimestamp(),

		// We only send SACKPermitted if the other side indicated it
		// permits SACK. This is not explicitly defined in the RFC but
		// this is the behaviour implemented by Linux.
		SACKPermitted: rcvSynOpts.SACKPermitted,
		MSS:           amss,
	}
	// A zero TTL from calculateTTL is replaced by the route's default.
	if ttl == 0 {
		ttl = h.ep.route.DefaultTTL()
	}
	h.ep.sendSynTCP(h.ep.route, tcpFields{
		id:     h.ep.TransportEndpointInfo.ID,
		ttl:    ttl,
		tos:    h.ep.sendTOS,
		flags:  h.flags,
		seq:    h.iss,
		ack:    h.ackNum,
		rcvWnd: h.rcvWnd,
	}, synOpts)
	return nil
}
   367  
// synRcvdState handles a segment received when the TCP 3-way handshake is in
// the SYN-RCVD state.
//
// It returns ErrConnectionRefused for an in-window RST and
// ErrInvalidEndpointState when a passive handshake sees a SYN whose sequence
// number differs from the one already acknowledged. In all other cases it
// returns nil, whether the segment was dropped, answered, or completed the
// handshake.
// +checklocks:h.ep.mu
func (h *handshake) synRcvdState(s *segment) tcpip.Error {
	if s.flags.Contains(header.TCPFlagRst) {
		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
		// is acceptable if the sequence number is in the window.
		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
			return &tcpip.ErrConnectionRefused{}
		}
		return nil
	}

	// It's possible that s is an ACK of a SYN cookie. This can happen if:
	//
	//   - We receive a SYN while under load and issue a SYN/ACK with
	//     cookie S.
	//   - We receive a retransmitted SYN while space exists in the SYN
	//     queue, and issue a SYN/ACK with seqnum S'.
	//   - We receive the ACK based on S.
	//
	// If we receive a SYN cookie ACK, just use the cookie seqnum.
	if !h.checkAck(s) && h.listenEP != nil {
		iss := s.ackNumber - 1
		data, ok := h.listenEP.listenCtx.isCookieValid(s.id, iss, s.sequenceNumber-1)
		if !ok || int(data) >= len(mssTable) {
			// This isn't a valid cookie.
			// RFC 793, page 72 (https://datatracker.ietf.org/doc/html/rfc793#page-72):
			//   If the segment acknowledgment is not acceptable, form a reset segment,
			//        <SEQ=SEG.ACK><CTL=RST>
			//   and send it.
			h.ep.sendEmptyRaw(header.TCPFlagRst, s.ackNumber, 0, 0)
			return nil
		}
		// This is a cookie that snuck its way in after we stopped using them.
		// The cookie data is an index into mssTable.
		h.mss = mssTable[data]
		h.iss = iss
	}

	// RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a
	// sequence number outside of the window causes an ACK with the proper seq
	// number and "After sending the acknowledgment, drop the unacceptable
	// segment and return."
	if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
		if h.ep.allowOutOfWindowAck() {
			h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
		}
		return nil
	}

	if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
		// We received two SYN segments with different sequence
		// numbers, so we reset this and restart the whole
		// process, except that we don't reset the timer.
		ack := s.sequenceNumber.Add(s.logicalLen())
		seq := seqnum.Value(0)
		if s.flags.Contains(header.TCPFlagAck) {
			seq = s.ackNumber
		}
		h.ep.sendEmptyRaw(header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)

		// A passive handshake cannot be restarted; report the failure.
		if !h.active {
			return &tcpip.ErrInvalidEndpointState{}
		}

		// Active handshake: start over with a fresh SYN.
		h.resetState()
		synOpts := header.TCPSynOptions{
			WS:            h.rcvWndScale,
			TS:            h.ep.SendTSOk,
			TSVal:         h.ep.tsValNow(),
			TSEcr:         h.ep.recentTimestamp(),
			SACKPermitted: h.ep.SACKPermitted,
			MSS:           h.ep.amss,
		}
		h.ep.sendSynTCP(h.ep.route, tcpFields{
			id:     h.ep.TransportEndpointInfo.ID,
			ttl:    calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit),
			tos:    h.ep.sendTOS,
			flags:  h.flags,
			seq:    h.iss,
			ack:    h.ackNum,
			rcvWnd: h.rcvWnd,
		}, synOpts)
		return nil
	}

	// We have previously received (and acknowledged) the peer's SYN. If the
	// peer acknowledges our SYN, the handshake is completed.
	if s.flags.Contains(header.TCPFlagAck) {
		// If deferAccept is not zero and this is a bare ACK and the
		// timeout is not hit then drop the ACK. The ACK is still remembered
		// via h.acked.
		if h.deferAccept != 0 && s.payloadSize() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept {
			h.acked = true
			h.ep.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// If the timestamp option is negotiated and the segment does
		// not carry a timestamp option then the segment must be dropped
		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
		if h.ep.SendTSOk && !s.parsedOptions.TS {
			h.ep.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Drop the ACK if the accept queue is full.
		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523
		// We could abort the connection as well with a tunable as in
		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788
		if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() {
			listenEP.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Update timestamp if required. See RFC7323, section-4.3.
		if h.ep.SendTSOk && s.parsedOptions.TS {
			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
		}

		h.state = handshakeCompleted
		h.transitionToStateEstablishedLocked(s)

		// Requeue the segment if the ACK completing the handshake has more info
		// to be processed by the newly established endpoint.
		if (s.flags.Contains(header.TCPFlagFin) || s.payloadSize() > 0) && h.ep.enqueueSegment(s) {
			h.ep.protocol.dispatcher.selectProcessor(h.ep.ID).queueEndpoint(h.ep)

		}
		return nil
	}

	// Segments without the ACK flag are ignored here.
	return nil
}
   501  
   502  // +checklocks:h.ep.mu
   503  func (h *handshake) handleSegment(s *segment) tcpip.Error {
   504  	h.sndWnd = s.window
   505  	if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 {
   506  		h.sndWnd <<= uint8(h.sndWndScale)
   507  	}
   508  
   509  	switch h.state {
   510  	case handshakeSynRcvd:
   511  		return h.synRcvdState(s)
   512  	case handshakeSynSent:
   513  		return h.synSentState(s)
   514  	}
   515  	return nil
   516  }
   517  
   518  // processSegments goes through the segment queue and processes up to
   519  // maxSegmentsPerWake (if they're available).
   520  // +checklocks:h.ep.mu
   521  func (h *handshake) processSegments() tcpip.Error {
   522  	for i := 0; i < maxSegmentsPerWake; i++ {
   523  		s := h.ep.segmentQueue.dequeue()
   524  		if s == nil {
   525  			return nil
   526  		}
   527  
   528  		err := h.handleSegment(s)
   529  		s.DecRef()
   530  		if err != nil {
   531  			return err
   532  		}
   533  
   534  		// We stop processing packets once the handshake is completed,
   535  		// otherwise we may process packets meant to be processed by
   536  		// the main protocol goroutine.
   537  		if h.state == handshakeCompleted {
   538  			break
   539  		}
   540  	}
   541  
   542  	return nil
   543  }
   544  
   545  // start sends the first SYN/SYN-ACK. It does not block, even if link address
   546  // resolution is required.
   547  func (h *handshake) start() {
   548  	h.startTime = h.ep.stack.Clock().NowMonotonic()
   549  	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
   550  	var sackEnabled tcpip.TCPSACKEnabled
   551  	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
   552  		// If stack returned an error when checking for SACKEnabled
   553  		// status then just default to switching off SACK negotiation.
   554  		sackEnabled = false
   555  	}
   556  
   557  	synOpts := header.TCPSynOptions{
   558  		WS:            h.rcvWndScale,
   559  		TS:            true,
   560  		TSVal:         h.ep.tsValNow(),
   561  		TSEcr:         h.ep.recentTimestamp(),
   562  		SACKPermitted: bool(sackEnabled),
   563  		MSS:           h.ep.amss,
   564  	}
   565  
   566  	// start() is also called in a listen context so we want to make sure we only
   567  	// send the TS/SACK option when we received the TS/SACK in the initial SYN.
   568  	if h.state == handshakeSynRcvd {
   569  		synOpts.TS = h.ep.SendTSOk
   570  		synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled)
   571  		if h.sndWndScale < 0 {
   572  			// Disable window scaling if the peer did not send us
   573  			// the window scaling option.
   574  			synOpts.WS = -1
   575  		}
   576  	}
   577  
   578  	h.sendSYNOpts = synOpts
   579  	h.ep.sendSynTCP(h.ep.route, tcpFields{
   580  		id:     h.ep.TransportEndpointInfo.ID,
   581  		ttl:    calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit),
   582  		tos:    h.ep.sendTOS,
   583  		flags:  h.flags,
   584  		seq:    h.iss,
   585  		ack:    h.ackNum,
   586  		rcvWnd: h.rcvWnd,
   587  	}, synOpts)
   588  }
   589  
   590  // retransmitHandler handles retransmissions of un-acked SYNs.
   591  // +checklocks:h.ep.mu
   592  func (h *handshake) retransmitHandlerLocked() tcpip.Error {
   593  	e := h.ep
   594  	// If the endpoint has already transition out of a connecting state due
   595  	// to say an error (e.g) peer send RST or an ICMP error. Then just
   596  	// return. Any required cleanup should have been done when the RST/error
   597  	// was handled.
   598  	if !e.EndpointState().connecting() {
   599  		return nil
   600  	}
   601  
   602  	if err := h.retransmitTimer.reset(); err != nil {
   603  		return err
   604  	}
   605  
   606  	// Resend the SYN/SYN-ACK only if the following conditions hold.
   607  	//  - It's an active handshake (deferAccept does not apply)
   608  	//  - It's a passive handshake and we have not yet got the final-ACK.
   609  	//  - It's a passive handshake and we got an ACK but deferAccept is
   610  	//    enabled and we are now past the deferAccept duration.
   611  	// The last is required to provide a way for the peer to complete
   612  	// the connection with another ACK or data (as ACKs are never
   613  	// retransmitted on their own).
   614  	if h.active || !h.acked || h.deferAccept != 0 && e.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept {
   615  		e.sendSynTCP(e.route, tcpFields{
   616  			id:     e.TransportEndpointInfo.ID,
   617  			ttl:    calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit),
   618  			tos:    e.sendTOS,
   619  			flags:  h.flags,
   620  			seq:    h.iss,
   621  			ack:    h.ackNum,
   622  			rcvWnd: h.rcvWnd,
   623  		}, h.sendSYNOpts)
   624  		// If we have ever retransmitted the SYN-ACK or
   625  		// SYN segment, we should only measure RTT if
   626  		// TS option is present.
   627  		h.sampleRTTWithTSOnly = true
   628  	}
   629  	return nil
   630  }
   631  
// transitionToStateEstablishedLocked transitions the endpoint of the handshake
// to an established state given the last segment received from peer. It also
// initializes sender/receiver, seeds the RTO from an RTT sample when one is
// available, confirms route reachability and notifies waiters that the
// endpoint is writable.
// +checklocks:h.ep.mu
func (h *handshake) transitionToStateEstablishedLocked(s *segment) {
	// Stop the SYN retransmissions now that handshake is complete.
	if h.retransmitTimer != nil {
		h.retransmitTimer.stop()
	}

	// Transfer handshake state to TCP connection. We disable
	// receive window scaling if the peer doesn't support it
	// (indicated by a negative send window scale).
	// h.ackNum-1 is the peer's initial sequence number.
	h.ep.snd = newSender(h.ep, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)

	now := h.ep.stack.Clock().NowMonotonic()

	// Prefer an RTT sample derived from the echoed timestamp. Fall back to
	// the time elapsed since the handshake started only when the SYN/SYN-ACK
	// was never retransmitted.
	var rtt time.Duration
	if h.ep.SendTSOk && s.parsedOptions.TSEcr != 0 {
		rtt = h.ep.elapsed(now, s.parsedOptions.TSEcr)
	}
	if !h.sampleRTTWithTSOnly && rtt == 0 {
		rtt = now.Sub(h.startTime)
	}

	if rtt > 0 {
		h.ep.snd.updateRTO(rtt)
	}

	h.ep.rcvQueueMu.Lock()
	h.ep.rcv = newReceiver(h.ep, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale())
	// Bootstrap the auto tuning algorithm. Starting at zero will
	// result in a really large receive window after the first auto
	// tuning adjustment.
	h.ep.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd)
	h.ep.rcvQueueMu.Unlock()

	h.ep.setEndpointState(StateEstablished)

	// Completing the 3-way handshake is an indication that the route is valid
	// and the remote is reachable as the only way we can complete a handshake
	// is if our SYN reached the remote and their ACK reached us.
	h.ep.route.ConfirmReachable()

	// Tell waiters that the endpoint is connected and writable.
	h.ep.waiterQueue.Notify(waiter.WritableEvents)
}
   679  
// backoffTimer is a timer whose interval doubles on every reset until it
// would exceed a maximum.
type backoffTimer struct {
	// timeout is the current interval; reset doubles it.
	timeout time.Duration
	// maxTimeout is the largest interval reset will arm the timer with.
	maxTimeout time.Duration
	// t is the underlying timer obtained from the clock.
	t tcpip.Timer
}
   685  
   686  func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) {
   687  	if timeout > maxTimeout {
   688  		return nil, &tcpip.ErrTimeout{}
   689  	}
   690  	bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout}
   691  	bt.t = clock.AfterFunc(timeout, f)
   692  	return bt, nil
   693  }
   694  
   695  func (bt *backoffTimer) reset() tcpip.Error {
   696  	bt.timeout *= 2
   697  	if bt.timeout > bt.maxTimeout {
   698  		return &tcpip.ErrTimeout{}
   699  	}
   700  	bt.t.Reset(bt.timeout)
   701  	return nil
   702  }
   703  
// stop stops the underlying timer. The current interval is left unchanged.
func (bt *backoffTimer) stop() {
	bt.t.Stop()
}
   707  
   708  func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
   709  	synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck))
   710  	if synOpts.TS {
   711  		s.parsedOptions.TSVal = synOpts.TSVal
   712  		s.parsedOptions.TSEcr = synOpts.TSEcr
   713  	}
   714  	return synOpts
   715  }
   716  
// optionPool recycles fixed-size arrays of maxOptionSize bytes that are used
// as scratch space when encoding TCP options.
var optionPool = sync.Pool{
	New: func() any {
		return &[maxOptionSize]byte{}
	},
}
   722  
   723  func getOptions() []byte {
   724  	return (*optionPool.Get().(*[maxOptionSize]byte))[:]
   725  }
   726  
// putOptions hands an options buffer back to optionPool.
//
// NOTE(review): optionsToArray is defined elsewhere; it presumably converts
// the (possibly shortened) slice back into the full-size array pointer that
// the pool stores, so options should originate from getOptions — confirm.
func putOptions(options []byte) {
	// Reslice to full capacity.
	optionPool.Put(optionsToArray(options))
}
   731  
   732  func makeSynOptions(opts header.TCPSynOptions) []byte {
   733  	// Emulate linux option order. This is as follows:
   734  	//
   735  	// if md5: NOP NOP MD5SIG 18 md5sig(16)
   736  	// if mss: MSS 4 mss(2)
   737  	// if ts and sack_advertise:
   738  	//	SACK 2 TIMESTAMP 2 timestamp(8)
   739  	// elif ts: NOP NOP TIMESTAMP 10 timestamp(8)
   740  	// elif sack: NOP NOP SACK 2
   741  	// if wscale: NOP WINDOW 3 ws(1)
   742  	// if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8))
   743  	//	[for each block] start_seq(4) end_seq(4)
   744  	// if fastopen_cookie:
   745  	//	if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2)
   746  	// 	else: FASTOPEN (2 + len(cookie))
   747  	//	cookie(variable) [padding to four bytes]
   748  	//
   749  	options := getOptions()
   750  
   751  	// Always encode the mss.
   752  	offset := header.EncodeMSSOption(uint32(opts.MSS), options)
   753  
   754  	// Special ordering is required here. If both TS and SACK are enabled,
   755  	// then the SACK option precedes TS, with no padding. If they are
   756  	// enabled individually, then we see padding before the option.
   757  	if opts.TS && opts.SACKPermitted {
   758  		offset += header.EncodeSACKPermittedOption(options[offset:])
   759  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   760  	} else if opts.TS {
   761  		offset += header.EncodeNOP(options[offset:])
   762  		offset += header.EncodeNOP(options[offset:])
   763  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   764  	} else if opts.SACKPermitted {
   765  		offset += header.EncodeNOP(options[offset:])
   766  		offset += header.EncodeNOP(options[offset:])
   767  		offset += header.EncodeSACKPermittedOption(options[offset:])
   768  	}
   769  
   770  	// Initialize the WS option.
   771  	if opts.WS >= 0 {
   772  		offset += header.EncodeNOP(options[offset:])
   773  		offset += header.EncodeWSOption(opts.WS, options[offset:])
   774  	}
   775  
   776  	// Padding to the end; note that this never apply unless we add a
   777  	// fastopen option, we always expect the offset to remain the same.
   778  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   779  		panic("unexpected option encoding")
   780  	}
   781  
   782  	return options[:offset]
   783  }
   784  
// tcpFields is a struct to carry different parameters required by the
// send*TCP variant functions below.
type tcpFields struct {
	// id supplies the local and remote ports that become the TCP source and
	// destination ports.
	id stack.TransportEndpointID
	// ttl and tos are the TTL/hop limit and TOS values chosen by the caller.
	// NOTE(review): how they reach the network header is not visible in this
	// part of the file.
	ttl uint8
	tos uint8
	// flags, seq and ack are encoded into the TCP header as-is.
	flags header.TCPFlags
	seq   seqnum.Value
	ack   seqnum.Value
	// rcvWnd is encoded into the 16-bit window field of the TCP header.
	rcvWnd seqnum.Size
	// opts holds the already-encoded TCP options, copied right after the
	// fixed-size header.
	opts []byte
	// txHash is filled in by Endpoint.sendTCP from the endpoint's txHash.
	txHash uint32
	// df is presumably the IP don't-fragment setting — TODO confirm; it is
	// not used in the visible part of this file.
	df bool
}
   799  
   800  func (e *Endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error {
   801  	tf.opts = makeSynOptions(opts)
   802  	// We ignore SYN send errors and let the callers re-attempt send.
   803  	p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + len(tf.opts)})
   804  	defer p.DecRef()
   805  	if err := e.sendTCP(r, tf, p, stack.GSO{}); err != nil {
   806  		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
   807  	}
   808  	putOptions(tf.opts)
   809  	return nil
   810  }
   811  
   812  // This method takes ownership of pkt.
   813  func (e *Endpoint) sendTCP(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) tcpip.Error {
   814  	tf.txHash = e.txHash
   815  	if err := sendTCP(r, tf, pkt, gso, e.owner); err != nil {
   816  		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
   817  		return err
   818  	}
   819  	e.stats.SegmentsSent.Increment()
   820  	return nil
   821  }
   822  
   823  func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) {
   824  	optLen := len(tf.opts)
   825  	tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen))
   826  	pkt.TransportProtocolNumber = header.TCPProtocolNumber
   827  	tcp.Encode(&header.TCPFields{
   828  		SrcPort:    tf.id.LocalPort,
   829  		DstPort:    tf.id.RemotePort,
   830  		SeqNum:     uint32(tf.seq),
   831  		AckNum:     uint32(tf.ack),
   832  		DataOffset: uint8(header.TCPMinimumSize + optLen),
   833  		Flags:      tf.flags,
   834  		WindowSize: uint16(tf.rcvWnd),
   835  	})
   836  	copy(tcp[header.TCPMinimumSize:], tf.opts)
   837  
   838  	xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size()))
   839  	// Only calculate the checksum if offloading isn't supported.
   840  	if gso.Type != stack.GSONone && gso.NeedsCsum {
   841  		// This is called CHECKSUM_PARTIAL in the Linux kernel. We
   842  		// calculate a checksum of the pseudo-header and save it in the
   843  		// TCP header, then the kernel calculate a checksum of the
   844  		// header and data and get the right sum of the TCP packet.
   845  		tcp.SetChecksum(xsum)
   846  	} else if r.RequiresTXTransportChecksum() {
   847  		xsum = checksum.Combine(xsum, pkt.Data().Checksum())
   848  		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
   849  	}
   850  }
   851  
   852  func sendTCPBatch(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error {
   853  	optLen := len(tf.opts)
   854  	if tf.rcvWnd > math.MaxUint16 {
   855  		tf.rcvWnd = math.MaxUint16
   856  	}
   857  
   858  	mss := int(gso.MSS)
   859  	n := (pkt.Data().Size() + mss - 1) / mss
   860  
   861  	size := pkt.Data().Size()
   862  	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
   863  	for i := 0; i < n; i++ {
   864  		packetSize := mss
   865  		if packetSize > size {
   866  			packetSize = size
   867  		}
   868  		size -= packetSize
   869  
   870  		pkt := pkt
   871  		// No need to split the packet in the final iteration. The original
   872  		// packet already has the truncated data.
   873  		shouldSplitPacket := i != n-1
   874  		if shouldSplitPacket {
   875  			splitPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: hdrSize})
   876  			splitPkt.Data().ReadFromPacketData(pkt.Data(), packetSize)
   877  			pkt = splitPkt
   878  		}
   879  		pkt.Hash = tf.txHash
   880  		pkt.Owner = owner
   881  
   882  		buildTCPHdr(r, tf, pkt, gso)
   883  		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
   884  		pkt.GSOOptions = gso
   885  		if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos, DF: tf.df}, pkt); err != nil {
   886  			r.Stats().TCP.SegmentSendErrors.Increment()
   887  			if shouldSplitPacket {
   888  				pkt.DecRef()
   889  			}
   890  			return err
   891  		}
   892  		r.Stats().TCP.SegmentsSent.Increment()
   893  		if shouldSplitPacket {
   894  			pkt.DecRef()
   895  		}
   896  	}
   897  	return nil
   898  }
   899  
   900  // sendTCP sends a TCP segment with the provided options via the provided
   901  // network endpoint and under the provided identity. This method takes
   902  // ownership of pkt.
   903  func sendTCP(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error {
   904  	if tf.rcvWnd > math.MaxUint16 {
   905  		tf.rcvWnd = math.MaxUint16
   906  	}
   907  
   908  	if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOGvisor && int(gso.MSS) < pkt.Data().Size() {
   909  		return sendTCPBatch(r, tf, pkt, gso, owner)
   910  	}
   911  
   912  	pkt.GSOOptions = gso
   913  	pkt.Hash = tf.txHash
   914  	pkt.Owner = owner
   915  	buildTCPHdr(r, tf, pkt, gso)
   916  
   917  	if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos, DF: tf.df}, pkt); err != nil {
   918  		r.Stats().TCP.SegmentSendErrors.Increment()
   919  		return err
   920  	}
   921  	r.Stats().TCP.SegmentsSent.Increment()
   922  	if (tf.flags & header.TCPFlagRst) != 0 {
   923  		r.Stats().TCP.ResetsSent.Increment()
   924  	}
   925  	return nil
   926  }
   927  
   928  // makeOptions makes an options slice.
   929  func (e *Endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
   930  	options := getOptions()
   931  	offset := 0
   932  
   933  	// N.B. the ordering here matches the ordering used by Linux internally
   934  	// and described in the raw makeOptions function. We don't include
   935  	// unnecessary cases here (post connection.)
   936  	if e.SendTSOk {
   937  		// Embed the timestamp if timestamp has been enabled.
   938  		//
   939  		// We only use the lower 32 bits of the unix time in
   940  		// milliseconds. This is similar to what Linux does where it
   941  		// uses the lower 32 bits of the jiffies value in the tsVal
   942  		// field of the timestamp option.
   943  		//
   944  		// Further, RFC7323 section-5.4 recommends millisecond
   945  		// resolution as the lowest recommended resolution for the
   946  		// timestamp clock.
   947  		//
   948  		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
   949  		offset += header.EncodeNOP(options[offset:])
   950  		offset += header.EncodeNOP(options[offset:])
   951  		offset += header.EncodeTSOption(e.tsValNow(), e.recentTimestamp(), options[offset:])
   952  	}
   953  	if e.SACKPermitted && len(sackBlocks) > 0 {
   954  		offset += header.EncodeNOP(options[offset:])
   955  		offset += header.EncodeNOP(options[offset:])
   956  		offset += header.EncodeSACKBlocks(sackBlocks, options[offset:])
   957  	}
   958  
   959  	// We expect the above to produce an aligned offset.
   960  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   961  		panic("unexpected option encoding")
   962  	}
   963  
   964  	return options[:offset]
   965  }
   966  
   967  // sendEmptyRaw sends a TCP segment with no payload to the endpoint's peer.
   968  //
   969  // +checklocks:e.mu
   970  // +checklocksalias:e.snd.ep.mu=e.mu
   971  func (e *Endpoint) sendEmptyRaw(flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error {
   972  	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{})
   973  	defer pkt.DecRef()
   974  	return e.sendRaw(pkt, flags, seq, ack, rcvWnd)
   975  }
   976  
   977  // sendRaw sends a TCP segment to the endpoint's peer. This method takes
   978  // ownership of pkt. pkt must not have any headers set.
   979  //
   980  // +checklocks:e.mu
   981  // +checklocksalias:e.snd.ep.mu=e.mu
   982  func (e *Endpoint) sendRaw(pkt *stack.PacketBuffer, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error {
   983  	var sackBlocks []header.SACKBlock
   984  	if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) {
   985  		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
   986  	}
   987  	options := e.makeOptions(sackBlocks)
   988  	defer putOptions(options)
   989  	pkt.ReserveHeaderBytes(header.TCPMinimumSize + int(e.route.MaxHeaderLength()) + len(options))
   990  	return e.sendTCP(e.route, tcpFields{
   991  		id:     e.TransportEndpointInfo.ID,
   992  		ttl:    calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit),
   993  		tos:    e.sendTOS,
   994  		flags:  flags,
   995  		seq:    seq,
   996  		ack:    ack,
   997  		rcvWnd: rcvWnd,
   998  		opts:   options,
   999  		df:     e.pmtud == tcpip.PMTUDiscoveryWant || e.pmtud == tcpip.PMTUDiscoveryDo,
  1000  	}, pkt, e.gso)
  1001  }
  1002  
  1003  // +checklocks:e.mu
  1004  // +checklocksalias:e.snd.ep.mu=e.mu
  1005  func (e *Endpoint) sendData(next *segment) {
  1006  	// Initialize the next segment to write if it's currently nil.
  1007  	if e.snd.writeNext == nil {
  1008  		if next == nil {
  1009  			return
  1010  		}
  1011  		e.snd.updateWriteNext(next)
  1012  	}
  1013  
  1014  	// Push out any new packets.
  1015  	e.snd.sendData()
  1016  }
  1017  
  1018  // resetConnectionLocked puts the endpoint in an error state with the given
  1019  // error code and sends a RST if and only if the error is not ErrConnectionReset
  1020  // indicating that the connection is being reset due to receiving a RST. This
  1021  // method must only be called from the protocol goroutine.
  1022  // +checklocks:e.mu
  1023  func (e *Endpoint) resetConnectionLocked(err tcpip.Error) {
  1024  	// Only send a reset if the connection is being aborted for a reason
  1025  	// other than receiving a reset.
  1026  	e.hardError = err
  1027  	switch err.(type) {
  1028  	case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout:
  1029  	default:
  1030  		// The exact sequence number to be used for the RST is the same as the
  1031  		// one used by Linux. We need to handle the case of window being shrunk
  1032  		// which can cause sndNxt to be outside the acceptable window on the
  1033  		// receiver.
  1034  		//
  1035  		// See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more
  1036  		// information.
  1037  		sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd)
  1038  		resetSeqNum := sndWndEnd
  1039  		if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) {
  1040  			resetSeqNum = e.snd.SndNxt
  1041  		}
  1042  		e.sendEmptyRaw(header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0)
  1043  	}
  1044  	// Don't purge read queues here. If there's buffered data, it's still allowed
  1045  	// to be read.
  1046  	e.purgeWriteQueue()
  1047  	e.purgePendingRcvQueue()
  1048  	e.cleanupLocked()
  1049  	e.setEndpointState(StateError)
  1050  }
  1051  
  1052  // transitionToStateCloseLocked ensures that the endpoint is
  1053  // cleaned up from the transport demuxer, "before" moving to
  1054  // StateClose. This will ensure that no packet will be
  1055  // delivered to this endpoint from the demuxer when the endpoint
  1056  // is transitioned to StateClose.
  1057  // +checklocks:e.mu
  1058  func (e *Endpoint) transitionToStateCloseLocked() {
  1059  	s := e.EndpointState()
  1060  	if s == StateClose {
  1061  		return
  1062  	}
  1063  
  1064  	if s.connected() {
  1065  		e.stack.Stats().TCP.EstablishedClosed.Increment()
  1066  	}
  1067  
  1068  	e.cleanupLocked()
  1069  	// Mark the endpoint as fully closed for reads/writes.
  1070  	e.setEndpointState(StateClose)
  1071  }
  1072  
  1073  // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
  1074  // segment to any other endpoint other than the current one. This is called
  1075  // only when the endpoint is in StateClose and we want to deliver the segment
  1076  // to any other listening endpoint. We reply with RST if we cannot find one.
  1077  func (e *Endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
  1078  	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.pkt.NICID)
  1079  	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != (tcpip.Address{}) {
  1080  		// Dual-stack socket, try IPv4.
  1081  		ep = e.stack.FindTransportEndpoint(
  1082  			header.IPv4ProtocolNumber,
  1083  			e.TransProto,
  1084  			e.TransportEndpointInfo.ID,
  1085  			s.pkt.NICID,
  1086  		)
  1087  	}
  1088  	if ep == nil {
  1089  		if !s.flags.Contains(header.TCPFlagRst) {
  1090  			replyWithReset(e.stack, s, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit)
  1091  		}
  1092  		return
  1093  	}
  1094  
  1095  	if e == ep {
  1096  		panic(fmt.Sprintf("current endpoint not removed from demuxer, enqueuing segments to itself, endpoint in state %v", e.EndpointState()))
  1097  	}
  1098  
  1099  	if ep := ep.(*Endpoint); ep.enqueueSegment(s) {
  1100  		ep.notifyProcessor()
  1101  	}
  1102  }
  1103  
  1104  // Drain segment queue from the endpoint and try to re-match the segment to a
  1105  // different endpoint. This is used when the current endpoint is transitioned to
  1106  // StateClose and has been unregistered from the transport demuxer.
  1107  func (e *Endpoint) drainClosingSegmentQueue() {
  1108  	for {
  1109  		s := e.segmentQueue.dequeue()
  1110  		if s == nil {
  1111  			break
  1112  		}
  1113  
  1114  		e.tryDeliverSegmentFromClosedEndpoint(s)
  1115  		s.DecRef()
  1116  	}
  1117  }
  1118  
  1119  // +checklocks:e.mu
  1120  func (e *Endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) {
  1121  	if e.rcv.acceptable(s.sequenceNumber, 0) {
  1122  		// RFC 793, page 37 states that "in all states
  1123  		// except SYN-SENT, all reset (RST) segments are
  1124  		// validated by checking their SEQ-fields." So
  1125  		// we only process it if it's acceptable.
  1126  		switch e.EndpointState() {
  1127  		// In case of a RST in CLOSE-WAIT linux moves
  1128  		// the socket to closed state with an error set
  1129  		// to indicate EPIPE.
  1130  		//
  1131  		// Technically this seems to be at odds w/ RFC.
  1132  		// As per https://tools.ietf.org/html/rfc793#section-2.7
  1133  		// page 69 the behavior for a segment arriving
  1134  		// w/ RST bit set in CLOSE-WAIT is inlined below.
  1135  		//
  1136  		//  ESTABLISHED
  1137  		//  FIN-WAIT-1
  1138  		//  FIN-WAIT-2
  1139  		//  CLOSE-WAIT
  1140  
  1141  		//  If the RST bit is set then, any outstanding RECEIVEs and
  1142  		//  SEND should receive "reset" responses. All segment queues
  1143  		//  should be flushed.  Users should also receive an unsolicited
  1144  		//  general "connection reset" signal. Enter the CLOSED state,
  1145  		//  delete the TCB, and return.
  1146  		case StateCloseWait:
  1147  			e.transitionToStateCloseLocked()
  1148  			e.hardError = &tcpip.ErrAborted{}
  1149  			return false, nil
  1150  		default:
  1151  			// RFC 793, page 37 states that "in all states
  1152  			// except SYN-SENT, all reset (RST) segments are
  1153  			// validated by checking their SEQ-fields." So
  1154  			// we only process it if it's acceptable.
  1155  
  1156  			// Notify protocol goroutine. This is required when
  1157  			// handleSegment is invoked from the processor goroutine
  1158  			// rather than the worker goroutine.
  1159  			return false, &tcpip.ErrConnectionReset{}
  1160  		}
  1161  	}
  1162  	return true, nil
  1163  }
  1164  
  1165  // handleSegments processes all inbound segments.
  1166  //
  1167  // +checklocks:e.mu
  1168  // +checklocksalias:e.snd.ep.mu=e.mu
  1169  func (e *Endpoint) handleSegmentsLocked() tcpip.Error {
  1170  	sndUna := e.snd.SndUna
  1171  	for i := 0; i < maxSegmentsPerWake; i++ {
  1172  		if state := e.EndpointState(); state.closed() || state == StateTimeWait || state == StateError {
  1173  			return nil
  1174  		}
  1175  		s := e.segmentQueue.dequeue()
  1176  		if s == nil {
  1177  			break
  1178  		}
  1179  		cont, err := e.handleSegmentLocked(s)
  1180  		s.DecRef()
  1181  		if err != nil {
  1182  			return err
  1183  		}
  1184  		if !cont {
  1185  			return nil
  1186  		}
  1187  	}
  1188  
  1189  	// The remote ACK-ing at least 1 byte is an indication that we have a
  1190  	// full-duplex connection to the remote as the only way we will receive an
  1191  	// ACK is if the remote received data that we previously sent.
  1192  	//
  1193  	// As of writing, Linux seems to only confirm a route as reachable when
  1194  	// forward progress is made which is indicated by an ACK that removes data
  1195  	// from the retransmit queue, i.e. sender makes forward progress.
  1196  	if sndUna.LessThan(e.snd.SndUna) {
  1197  		e.route.ConfirmReachable()
  1198  	}
  1199  
  1200  	// Send an ACK for all processed packets if needed.
  1201  	if e.rcv.RcvNxt != e.snd.MaxSentAck {
  1202  		e.snd.sendAck()
  1203  	}
  1204  
  1205  	e.resetKeepaliveTimer(true /* receivedData */)
  1206  
  1207  	return nil
  1208  }
  1209  
  1210  // +checklocks:e.mu
  1211  func (e *Endpoint) probeSegmentLocked() {
  1212  	if fn := e.probe; fn != nil {
  1213  		var state stack.TCPEndpointState
  1214  		e.completeStateLocked(&state)
  1215  		fn(&state)
  1216  	}
  1217  }
  1218  
  1219  // handleSegment handles a given segment and notifies the worker goroutine if
  1220  // if the connection should be terminated.
  1221  //
  1222  // +checklocks:e.mu
  1223  // +checklocksalias:e.rcv.ep.mu=e.mu
  1224  // +checklocksalias:e.snd.ep.mu=e.mu
  1225  func (e *Endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) {
  1226  	// Invoke the tcp probe if installed. The tcp probe function will update
  1227  	// the TCPEndpointState after the segment is processed.
  1228  	defer e.probeSegmentLocked()
  1229  
  1230  	if s.flags.Contains(header.TCPFlagRst) {
  1231  		if ok, err := e.handleReset(s); !ok {
  1232  			return false, err
  1233  		}
  1234  	} else if s.flags.Contains(header.TCPFlagSyn) {
  1235  		// See: https://tools.ietf.org/html/rfc5961#section-4.1
  1236  		//   1) If the SYN bit is set, irrespective of the sequence number, TCP
  1237  		//    MUST send an ACK (also referred to as challenge ACK) to the remote
  1238  		//    peer:
  1239  		//
  1240  		//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
  1241  		//
  1242  		//    After sending the acknowledgment, TCP MUST drop the unacceptable
  1243  		//    segment and stop processing further.
  1244  		//
  1245  		// By sending an ACK, the remote peer is challenged to confirm the loss
  1246  		// of the previous connection and the request to start a new connection.
  1247  		// A legitimate peer, after restart, would not have a TCB in the
  1248  		// synchronized state.  Thus, when the ACK arrives, the peer should send
  1249  		// a RST segment back with the sequence number derived from the ACK
  1250  		// field that caused the RST.
  1251  
  1252  		// This RST will confirm that the remote peer has indeed closed the
  1253  		// previous connection.  Upon receipt of a valid RST, the local TCP
  1254  		// endpoint MUST terminate its connection.  The local TCP endpoint
  1255  		// should then rely on SYN retransmission from the remote end to
  1256  		// re-establish the connection.
  1257  		e.snd.maybeSendOutOfWindowAck(s)
  1258  	} else if s.flags.Contains(header.TCPFlagAck) {
  1259  		// Patch the window size in the segment according to the
  1260  		// send window scale.
  1261  		s.window <<= e.snd.SndWndScale
  1262  
  1263  		// RFC 793, page 41 states that "once in the ESTABLISHED
  1264  		// state all segments must carry current acknowledgment
  1265  		// information."
  1266  		drop, err := e.rcv.handleRcvdSegment(s)
  1267  		if err != nil {
  1268  			return false, err
  1269  		}
  1270  		if drop {
  1271  			return true, nil
  1272  		}
  1273  
  1274  		// Now check if the received segment has caused us to transition
  1275  		// to a CLOSED state, if yes then terminate processing and do
  1276  		// not invoke the sender.
  1277  		state := e.EndpointState()
  1278  		if state == StateClose {
  1279  			// When we get into StateClose while processing from the queue,
  1280  			// return immediately and let the protocolMainloop handle it.
  1281  			//
  1282  			// We can reach StateClose only while processing a previous segment
  1283  			// or a notification from the protocolMainLoop (caller goroutine).
  1284  			// This means that with this return, the segment dequeue below can
  1285  			// never occur on a closed endpoint.
  1286  			return false, nil
  1287  		}
  1288  
  1289  		e.snd.handleRcvdSegment(s)
  1290  	}
  1291  
  1292  	return true, nil
  1293  }
  1294  
  1295  // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
  1296  // keepalive packets periodically when the connection is idle. If we don't hear
  1297  // from the other side after a number of tries, we terminate the connection.
  1298  // +checklocks:e.mu
  1299  // +checklocksalias:e.snd.ep.mu=e.mu
  1300  func (e *Endpoint) keepaliveTimerExpired() tcpip.Error {
  1301  	userTimeout := e.userTimeout
  1302  
  1303  	// If the route is not ready or already cleaned up, then we don't need to
  1304  	// send keepalives.
  1305  	if e.route == nil {
  1306  		return nil
  1307  	}
  1308  	e.keepalive.Lock()
  1309  	if !e.SocketOptions().GetKeepAlive() || e.keepalive.timer.isUninitialized() || !e.keepalive.timer.checkExpiration() {
  1310  		e.keepalive.Unlock()
  1311  		return nil
  1312  	}
  1313  
  1314  	// If a userTimeout is set then abort the connection if it is
  1315  	// exceeded.
  1316  	if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 {
  1317  		e.keepalive.Unlock()
  1318  		e.stack.Stats().TCP.EstablishedTimedout.Increment()
  1319  		return &tcpip.ErrTimeout{}
  1320  	}
  1321  
  1322  	if e.keepalive.unacked >= e.keepalive.count {
  1323  		e.keepalive.Unlock()
  1324  		e.stack.Stats().TCP.EstablishedTimedout.Increment()
  1325  		return &tcpip.ErrTimeout{}
  1326  	}
  1327  
  1328  	// RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with
  1329  	// seg.seq = snd.nxt-1.
  1330  	e.keepalive.unacked++
  1331  	e.keepalive.Unlock()
  1332  	e.snd.sendEmptySegment(header.TCPFlagAck, e.snd.SndNxt-1)
  1333  	e.resetKeepaliveTimer(false)
  1334  	return nil
  1335  }
  1336  
  1337  // resetKeepaliveTimer restarts or stops the keepalive timer, depending on
  1338  // whether it is enabled for this endpoint.
  1339  func (e *Endpoint) resetKeepaliveTimer(receivedData bool) {
  1340  	e.keepalive.Lock()
  1341  	defer e.keepalive.Unlock()
  1342  	if e.keepalive.timer.isUninitialized() {
  1343  		if state := e.EndpointState(); !state.closed() {
  1344  			panic(fmt.Sprintf("Unexpected state when the keepalive time is cleaned up, got %s, want %s or %s", state, StateClose, StateError))
  1345  		}
  1346  		return
  1347  	}
  1348  	if receivedData {
  1349  		e.keepalive.unacked = 0
  1350  	}
  1351  	// Start the keepalive timer IFF it's enabled and there is no pending
  1352  	// data to send.
  1353  	if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt {
  1354  		e.keepalive.timer.disable()
  1355  		return
  1356  	}
  1357  	if e.keepalive.unacked > 0 {
  1358  		e.keepalive.timer.enable(e.keepalive.interval)
  1359  	} else {
  1360  		e.keepalive.timer.enable(e.keepalive.idle)
  1361  	}
  1362  }
  1363  
  1364  // disableKeepaliveTimer stops the keepalive timer.
  1365  func (e *Endpoint) disableKeepaliveTimer() {
  1366  	e.keepalive.Lock()
  1367  	e.keepalive.timer.disable()
  1368  	e.keepalive.Unlock()
  1369  }
  1370  
  1371  // finWait2TimerExpired is called when the FIN-WAIT-2 timeout is hit
  1372  // and the peer hasn't sent us a FIN.
  1373  func (e *Endpoint) finWait2TimerExpired() {
  1374  	e.mu.Lock()
  1375  	e.transitionToStateCloseLocked()
  1376  	e.mu.Unlock()
  1377  	e.drainClosingSegmentQueue()
  1378  	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  1379  }
  1380  
  1381  // +checklocks:e.mu
  1382  func (e *Endpoint) handshakeFailed(err tcpip.Error) {
  1383  	e.lastErrorMu.Lock()
  1384  	e.lastError = err
  1385  	e.lastErrorMu.Unlock()
  1386  	// handshakeFailed is also called from startHandshake when a listener
  1387  	// transitions out of Listen state by the time the SYN is processed. In
  1388  	// such cases the handshake is never initialized and the newly created
  1389  	// endpoint is closed right away.
  1390  	if e.h != nil && e.h.retransmitTimer != nil {
  1391  		e.h.retransmitTimer.stop()
  1392  	}
  1393  	e.hardError = err
  1394  	e.cleanupLocked()
  1395  	e.setEndpointState(StateError)
  1396  }
  1397  
  1398  // handleTimeWaitSegments processes segments received during TIME_WAIT
  1399  // state.
  1400  // +checklocks:e.mu
  1401  // +checklocksalias:e.rcv.ep.mu=e.mu
  1402  func (e *Endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
  1403  	for i := 0; i < maxSegmentsPerWake; i++ {
  1404  		s := e.segmentQueue.dequeue()
  1405  		if s == nil {
  1406  			break
  1407  		}
  1408  		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
  1409  		if newSyn {
  1410  			info := e.TransportEndpointInfo
  1411  			newID := info.ID
  1412  			newID.RemoteAddress = tcpip.Address{}
  1413  			newID.RemotePort = 0
  1414  			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
  1415  			// If the local address is an IPv4 address then also
  1416  			// look for IPv6 dual stack endpoints that might be
  1417  			// listening on the local address.
  1418  			if newID.LocalAddress.To4() != (tcpip.Address{}) {
  1419  				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
  1420  			}
  1421  			for _, netProto := range netProtos {
  1422  				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.pkt.NICID); listenEP != nil {
  1423  					tcpEP := listenEP.(*Endpoint)
  1424  					if EndpointState(tcpEP.State()) == StateListen {
  1425  						reuseTW = func() {
  1426  							if !tcpEP.enqueueSegment(s) {
  1427  								return
  1428  							}
  1429  							tcpEP.notifyProcessor()
  1430  							s.DecRef()
  1431  						}
  1432  						// We explicitly do not DecRef the segment as it's still valid and
  1433  						// being reflected to a listening endpoint.
  1434  						return false, reuseTW
  1435  					}
  1436  				}
  1437  			}
  1438  		}
  1439  		if extTW {
  1440  			extendTimeWait = true
  1441  		}
  1442  		s.DecRef()
  1443  	}
  1444  	return extendTimeWait, nil
  1445  }
  1446  
  1447  // +checklocks:e.mu
  1448  func (e *Endpoint) getTimeWaitDuration() time.Duration {
  1449  	timeWaitDuration := DefaultTCPTimeWaitTimeout
  1450  
  1451  	// Get the stack wide configuration.
  1452  	var tcpTW tcpip.TCPTimeWaitTimeoutOption
  1453  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
  1454  		timeWaitDuration = time.Duration(tcpTW)
  1455  	}
  1456  	return timeWaitDuration
  1457  }
  1458  
  1459  // timeWaitTimerExpired is called when an endpoint completes the required time
  1460  // (typically 2 * MSL unless configured to something else at a stack level) in
  1461  // TIME-WAIT state.
  1462  func (e *Endpoint) timeWaitTimerExpired() {
  1463  	e.mu.Lock()
  1464  	if e.EndpointState() != StateTimeWait {
  1465  		e.mu.Unlock()
  1466  		return
  1467  	}
  1468  	e.transitionToStateCloseLocked()
  1469  	e.mu.Unlock()
  1470  	e.drainClosingSegmentQueue()
  1471  	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
  1472  }
  1473  
  1474  // notifyProcessor queues this endpoint for processing to its TCP processor.
  1475  func (e *Endpoint) notifyProcessor() {
  1476  	// We use TryLock here to avoid deadlocks in cases where a listening endpoint that is being
  1477  	// closed tries to abort half completed connections which in turn try to queue any segments
  1478  	// queued to that endpoint back to the same listening endpoint (because it may have got
  1479  	// segments that matched its id but were either a RST or a new SYN which must be handled
  1480  	// by a listening endpoint). In such cases the Close() on the listening endpoint will handle
  1481  	// any queued segments after it releases the lock.
  1482  	if !e.mu.TryLock() {
  1483  		return
  1484  	}
  1485  	processor := e.protocol.dispatcher.selectProcessor(e.ID)
  1486  	e.mu.Unlock()
  1487  	processor.queueEndpoint(e)
  1488  }