inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/connect.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"encoding/binary"
    19  	"math"
    20  	"time"
    21  
    22  	"inet.af/netstack/sleep"
    23  	"inet.af/netstack/sync"
    24  	"inet.af/netstack/tcpip"
    25  	"inet.af/netstack/tcpip/buffer"
    26  	"inet.af/netstack/tcpip/hash/jenkins"
    27  	"inet.af/netstack/tcpip/header"
    28  	"inet.af/netstack/tcpip/seqnum"
    29  	"inet.af/netstack/tcpip/stack"
    30  	"inet.af/netstack/waiter"
    31  )
    32  
    33  // InitialRTO is the initial retransmission timeout.
    34  // https://github.com/torvalds/linux/blob/7c636d4d20f/include/net/tcp.h#L142
    35  const InitialRTO = time.Second
    36  
    37  // maxSegmentsPerWake is the maximum number of segments to process in the main
    38  // protocol goroutine per wake-up. Yielding [after this number of segments are
    39  // processed] allows other events to be processed as well (e.g., timeouts,
    40  // resets, etc.).
    41  const maxSegmentsPerWake = 100
    42  
    43  type handshakeState int
    44  
    45  // The following are the possible states of the TCP connection during a 3-way
    46  // handshake. A depiction of the states and transitions can be found in RFC 793,
    47  // page 23.
    48  const (
    49  	handshakeSynSent handshakeState = iota
    50  	handshakeSynRcvd
    51  	handshakeCompleted
    52  )
    53  
    54  const (
    55  	// Maximum space available for options.
    56  	maxOptionSize = 40
    57  )
    58  
// handshake holds the state used during a TCP 3-way handshake.
//
// The leading, uncommented fields are:
//   - ep: the endpoint the handshake is being performed for.
//   - listenEP: when non-nil, the endpoint whose accept queue is checked
//     before a passive handshake is allowed to complete.
//   - state: the current stage of the handshake.
//   - active: true for a handshake we initiated; cleared when the handshake
//     is reset to the SYN-RCVD state.
//   - flags: the TCP flags carried by the SYN/SYN-ACK we send.
//   - ackNum: the acknowledgement number we send, i.e. the peer's initial
//     sequence number plus one once it is known.
//
// NOTE: handshake.ep.mu is held during handshake processing. It is released if
// we are going to block and reacquired when we start processing an event.
type handshake struct {
	ep       *endpoint
	listenEP *endpoint
	state    handshakeState
	active   bool
	flags    header.TCPFlags
	ackNum   seqnum.Value

	// iss is the initial send sequence number, as defined in RFC 793.
	iss seqnum.Value

	// rcvWnd is the receive window, as defined in RFC 793.
	rcvWnd seqnum.Size

	// sndWnd is the send window, as defined in RFC 793.
	sndWnd seqnum.Size

	// mss is the maximum segment size received from the peer.
	mss uint16

	// sndWndScale is the send window scale, as defined in RFC 1323. A
	// negative value means no scaling is supported by the peer.
	sndWndScale int

	// rcvWndScale is the receive window scale, as defined in RFC 1323.
	rcvWndScale int

	// startTime is the time at which the first SYN/SYN-ACK was sent.
	startTime tcpip.MonotonicTime

	// deferAccept, if non-zero, causes the final ACK of a passive
	// handshake to be dropped until an ACK segment with data is received
	// or the timeout is hit.
	deferAccept time.Duration

	// acked is true if the final ACK for a 3-way handshake has been
	// received. This is required to stop retransmitting the original
	// SYN-ACK when deferAccept is enabled.
	acked bool

	// sendSYNOpts is the cached values for the SYN options to be sent.
	sendSYNOpts header.TCPSynOptions

	// sampleRTTWithTSOnly is true when the segment was retransmitted or we can't
	// tell; then RTT can only be sampled when the incoming segment has timestamp
	// options enabled.
	sampleRTTWithTSOnly bool
}
   111  
   112  func (e *endpoint) newHandshake() *handshake {
   113  	h := &handshake{
   114  		ep:          e,
   115  		active:      true,
   116  		rcvWnd:      seqnum.Size(e.initialReceiveWindow()),
   117  		rcvWndScale: e.rcvWndScaleForHandshake(),
   118  	}
   119  	h.resetState()
   120  	// Store reference to handshake state in endpoint.
   121  	e.h = h
   122  	// By the time handshake is created, e.ID is already initialized.
   123  	e.TSOffset = e.protocol.tsOffset(e.ID.LocalAddress, e.ID.RemoteAddress)
   124  	return h
   125  }
   126  
   127  func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) *handshake {
   128  	h := e.newHandshake()
   129  	h.resetToSynRcvd(isn, irs, opts, deferAccept)
   130  	return h
   131  }
   132  
   133  // FindWndScale determines the window scale to use for the given maximum window
   134  // size.
   135  func FindWndScale(wnd seqnum.Size) int {
   136  	if wnd < 0x10000 {
   137  		return 0
   138  	}
   139  
   140  	max := seqnum.Size(math.MaxUint16)
   141  	s := 0
   142  	for wnd > max && s < header.MaxWndScale {
   143  		s++
   144  		max <<= 1
   145  	}
   146  
   147  	return s
   148  }
   149  
   150  // resetState resets the state of the handshake object such that it becomes
   151  // ready for a new 3-way handshake.
   152  func (h *handshake) resetState() {
   153  	h.state = handshakeSynSent
   154  	h.flags = header.TCPFlagSyn
   155  	h.ackNum = 0
   156  	h.mss = 0
   157  	h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.protocol.seqnumSecret)
   158  }
   159  
   160  // generateSecureISN generates a secure Initial Sequence number based on the
   161  // recommendation here https://tools.ietf.org/html/rfc6528#page-3.
   162  func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed uint32) seqnum.Value {
   163  	isnHasher := jenkins.Sum32(seed)
   164  	// Per hash.Hash.Writer:
   165  	//
   166  	// It never returns an error.
   167  	_, _ = isnHasher.Write([]byte(id.LocalAddress))
   168  	_, _ = isnHasher.Write([]byte(id.RemoteAddress))
   169  	portBuf := make([]byte, 2)
   170  	binary.LittleEndian.PutUint16(portBuf, id.LocalPort)
   171  	_, _ = isnHasher.Write(portBuf)
   172  	binary.LittleEndian.PutUint16(portBuf, id.RemotePort)
   173  	_, _ = isnHasher.Write(portBuf)
   174  	// The time period here is 64ns. This is similar to what linux uses
   175  	// generate a sequence number that overlaps less than one
   176  	// time per MSL (2 minutes).
   177  	//
   178  	// A 64ns clock ticks 10^9/64 = 15625000) times in a second.
   179  	// To wrap the whole 32 bit space would require
   180  	// 2^32/1562500 ~ 274 seconds.
   181  	//
   182  	// Which sort of guarantees that we won't reuse the ISN for a new
   183  	// connection for the same tuple for at least 274s.
   184  	isn := isnHasher.Sum32() + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6)
   185  	return seqnum.Value(isn)
   186  }
   187  
   188  // effectiveRcvWndScale returns the effective receive window scale to be used.
   189  // If the peer doesn't support window scaling, the effective rcv wnd scale is
   190  // zero; otherwise it's the value calculated based on the initial rcv wnd.
   191  func (h *handshake) effectiveRcvWndScale() uint8 {
   192  	if h.sndWndScale < 0 {
   193  		return 0
   194  	}
   195  	return uint8(h.rcvWndScale)
   196  }
   197  
   198  // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
   199  // state.
   200  func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) {
   201  	h.active = false
   202  	h.state = handshakeSynRcvd
   203  	h.flags = header.TCPFlagSyn | header.TCPFlagAck
   204  	h.iss = iss
   205  	h.ackNum = irs + 1
   206  	h.mss = opts.MSS
   207  	h.sndWndScale = opts.WS
   208  	h.deferAccept = deferAccept
   209  	h.ep.setEndpointState(StateSynRecv)
   210  }
   211  
   212  // checkAck checks if the ACK number, if present, of a segment received during
   213  // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in
   214  // response.
   215  func (h *handshake) checkAck(s *segment) bool {
   216  	if s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1 {
   217  		// RFC 793, page 36, states that a reset must be generated when
   218  		// the connection is in any non-synchronized state and an
   219  		// incoming segment acknowledges something not yet sent. The
   220  		// connection remains in the same state.
   221  		ack := s.sequenceNumber.Add(s.logicalLen())
   222  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0)
   223  		return false
   224  	}
   225  
   226  	return true
   227  }
   228  
   229  // synSentState handles a segment received when the TCP 3-way handshake is in
   230  // the SYN-SENT state.
   231  func (h *handshake) synSentState(s *segment) tcpip.Error {
   232  	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
   233  	// acceptable if the ack field acknowledges the SYN.
   234  	if s.flags.Contains(header.TCPFlagRst) {
   235  		if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
   236  			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
   237  			// was acceptable then signal the user "error: connection reset", drop
   238  			// the segment, enter CLOSED state, delete TCB, and return."
   239  			h.ep.workerCleanup = true
   240  			// Although the RFC above calls out ECONNRESET, Linux actually returns
   241  			// ECONNREFUSED here so we do as well.
   242  			return &tcpip.ErrConnectionRefused{}
   243  		}
   244  		return nil
   245  	}
   246  
   247  	if !h.checkAck(s) {
   248  		return nil
   249  	}
   250  
   251  	// We are in the SYN-SENT state. We only care about segments that have
   252  	// the SYN flag.
   253  	if !s.flags.Contains(header.TCPFlagSyn) {
   254  		return nil
   255  	}
   256  
   257  	// Parse the SYN options.
   258  	rcvSynOpts := parseSynSegmentOptions(s)
   259  
   260  	// Remember if the Timestamp option was negotiated.
   261  	h.ep.maybeEnableTimestamp(rcvSynOpts)
   262  
   263  	// Remember if the SACKPermitted option was negotiated.
   264  	h.ep.maybeEnableSACKPermitted(rcvSynOpts)
   265  
   266  	// Remember the sequence we'll ack from now on.
   267  	h.ackNum = s.sequenceNumber + 1
   268  	h.flags |= header.TCPFlagAck
   269  	h.mss = rcvSynOpts.MSS
   270  	h.sndWndScale = rcvSynOpts.WS
   271  
   272  	// If this is a SYN ACK response, we only need to acknowledge the SYN
   273  	// and the handshake is completed.
   274  	if s.flags.Contains(header.TCPFlagAck) {
   275  		h.state = handshakeCompleted
   276  		h.transitionToStateEstablishedLocked(s)
   277  
   278  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
   279  		return nil
   280  	}
   281  
   282  	// A SYN segment was received, but no ACK in it. We acknowledge the SYN
   283  	// but resend our own SYN and wait for it to be acknowledged in the
   284  	// SYN-RCVD state.
   285  	h.state = handshakeSynRcvd
   286  	ttl := h.ep.ttl
   287  	amss := h.ep.amss
   288  	h.ep.setEndpointState(StateSynRecv)
   289  	synOpts := header.TCPSynOptions{
   290  		WS:    int(h.effectiveRcvWndScale()),
   291  		TS:    rcvSynOpts.TS,
   292  		TSVal: h.ep.tsValNow(),
   293  		TSEcr: h.ep.recentTimestamp(),
   294  
   295  		// We only send SACKPermitted if the other side indicated it
   296  		// permits SACK. This is not explicitly defined in the RFC but
   297  		// this is the behaviour implemented by Linux.
   298  		SACKPermitted: rcvSynOpts.SACKPermitted,
   299  		MSS:           amss,
   300  	}
   301  	if ttl == 0 {
   302  		ttl = h.ep.route.DefaultTTL()
   303  	}
   304  	h.ep.sendSynTCP(h.ep.route, tcpFields{
   305  		id:     h.ep.TransportEndpointInfo.ID,
   306  		ttl:    ttl,
   307  		tos:    h.ep.sendTOS,
   308  		flags:  h.flags,
   309  		seq:    h.iss,
   310  		ack:    h.ackNum,
   311  		rcvWnd: h.rcvWnd,
   312  	}, synOpts)
   313  	return nil
   314  }
   315  
   316  // synRcvdState handles a segment received when the TCP 3-way handshake is in
   317  // the SYN-RCVD state.
   318  func (h *handshake) synRcvdState(s *segment) tcpip.Error {
   319  	if s.flags.Contains(header.TCPFlagRst) {
   320  		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
   321  		// is acceptable if the sequence number is in the window.
   322  		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
   323  			return &tcpip.ErrConnectionRefused{}
   324  		}
   325  		return nil
   326  	}
   327  
   328  	if !h.checkAck(s) {
   329  		return nil
   330  	}
   331  
   332  	// RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a
   333  	// sequence number outside of the window causes an ACK with the proper seq
   334  	// number and "After sending the acknowledgment, drop the unacceptable
   335  	// segment and return."
   336  	if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
   337  		if h.ep.allowOutOfWindowAck() {
   338  			h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
   339  		}
   340  		return nil
   341  	}
   342  
   343  	if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
   344  		// We received two SYN segments with different sequence
   345  		// numbers, so we reset this and restart the whole
   346  		// process, except that we don't reset the timer.
   347  		ack := s.sequenceNumber.Add(s.logicalLen())
   348  		seq := seqnum.Value(0)
   349  		if s.flags.Contains(header.TCPFlagAck) {
   350  			seq = s.ackNumber
   351  		}
   352  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)
   353  
   354  		if !h.active {
   355  			return &tcpip.ErrInvalidEndpointState{}
   356  		}
   357  
   358  		h.resetState()
   359  		synOpts := header.TCPSynOptions{
   360  			WS:            h.rcvWndScale,
   361  			TS:            h.ep.SendTSOk,
   362  			TSVal:         h.ep.tsValNow(),
   363  			TSEcr:         h.ep.recentTimestamp(),
   364  			SACKPermitted: h.ep.SACKPermitted,
   365  			MSS:           h.ep.amss,
   366  		}
   367  		h.ep.sendSynTCP(h.ep.route, tcpFields{
   368  			id:     h.ep.TransportEndpointInfo.ID,
   369  			ttl:    h.ep.ttl,
   370  			tos:    h.ep.sendTOS,
   371  			flags:  h.flags,
   372  			seq:    h.iss,
   373  			ack:    h.ackNum,
   374  			rcvWnd: h.rcvWnd,
   375  		}, synOpts)
   376  		return nil
   377  	}
   378  
   379  	// We have previously received (and acknowledged) the peer's SYN. If the
   380  	// peer acknowledges our SYN, the handshake is completed.
   381  	if s.flags.Contains(header.TCPFlagAck) {
   382  		// If deferAccept is not zero and this is a bare ACK and the
   383  		// timeout is not hit then drop the ACK.
   384  		if h.deferAccept != 0 && s.data.Size() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept {
   385  			h.acked = true
   386  			h.ep.stack.Stats().DroppedPackets.Increment()
   387  			return nil
   388  		}
   389  
   390  		// If the timestamp option is negotiated and the segment does
   391  		// not carry a timestamp option then the segment must be dropped
   392  		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
   393  		if h.ep.SendTSOk && !s.parsedOptions.TS {
   394  			h.ep.stack.Stats().DroppedPackets.Increment()
   395  			return nil
   396  		}
   397  
   398  		// Drop the ACK if the accept queue is full.
   399  		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523
   400  		// We could abort the connection as well with a tunable as in
   401  		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788
   402  		if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() {
   403  			listenEP.stack.Stats().DroppedPackets.Increment()
   404  			return nil
   405  		}
   406  
   407  		// Update timestamp if required. See RFC7323, section-4.3.
   408  		if h.ep.SendTSOk && s.parsedOptions.TS {
   409  			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
   410  		}
   411  
   412  		h.state = handshakeCompleted
   413  
   414  		h.transitionToStateEstablishedLocked(s)
   415  
   416  		// Requeue the segment if the ACK completing the handshake has more info
   417  		// to be procesed by the newly established endpoint.
   418  		if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && h.ep.enqueueSegment(s) {
   419  			s.incRef()
   420  			h.ep.newSegmentWaker.Assert()
   421  		}
   422  		return nil
   423  	}
   424  
   425  	return nil
   426  }
   427  
   428  func (h *handshake) handleSegment(s *segment) tcpip.Error {
   429  	h.sndWnd = s.window
   430  	if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 {
   431  		h.sndWnd <<= uint8(h.sndWndScale)
   432  	}
   433  
   434  	switch h.state {
   435  	case handshakeSynRcvd:
   436  		return h.synRcvdState(s)
   437  	case handshakeSynSent:
   438  		return h.synSentState(s)
   439  	}
   440  	return nil
   441  }
   442  
   443  // processSegments goes through the segment queue and processes up to
   444  // maxSegmentsPerWake (if they're available).
   445  func (h *handshake) processSegments() tcpip.Error {
   446  	for i := 0; i < maxSegmentsPerWake; i++ {
   447  		s := h.ep.segmentQueue.dequeue()
   448  		if s == nil {
   449  			return nil
   450  		}
   451  
   452  		err := h.handleSegment(s)
   453  		s.decRef()
   454  		if err != nil {
   455  			return err
   456  		}
   457  
   458  		// We stop processing packets once the handshake is completed,
   459  		// otherwise we may process packets meant to be processed by
   460  		// the main protocol goroutine.
   461  		if h.state == handshakeCompleted {
   462  			break
   463  		}
   464  	}
   465  
   466  	// If the queue is not empty, make sure we'll wake up in the next
   467  	// iteration.
   468  	if !h.ep.segmentQueue.empty() {
   469  		h.ep.newSegmentWaker.Assert()
   470  	}
   471  
   472  	return nil
   473  }
   474  
   475  // start sends the first SYN/SYN-ACK. It does not block, even if link address
   476  // resolution is required.
   477  func (h *handshake) start() {
   478  	h.startTime = h.ep.stack.Clock().NowMonotonic()
   479  	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
   480  	var sackEnabled tcpip.TCPSACKEnabled
   481  	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
   482  		// If stack returned an error when checking for SACKEnabled
   483  		// status then just default to switching off SACK negotiation.
   484  		sackEnabled = false
   485  	}
   486  
   487  	synOpts := header.TCPSynOptions{
   488  		WS:            h.rcvWndScale,
   489  		TS:            true,
   490  		TSVal:         h.ep.tsValNow(),
   491  		TSEcr:         h.ep.recentTimestamp(),
   492  		SACKPermitted: bool(sackEnabled),
   493  		MSS:           h.ep.amss,
   494  	}
   495  
   496  	// start() is also called in a listen context so we want to make sure we only
   497  	// send the TS/SACK option when we received the TS/SACK in the initial SYN.
   498  	if h.state == handshakeSynRcvd {
   499  		synOpts.TS = h.ep.SendTSOk
   500  		synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled)
   501  		if h.sndWndScale < 0 {
   502  			// Disable window scaling if the peer did not send us
   503  			// the window scaling option.
   504  			synOpts.WS = -1
   505  		}
   506  	}
   507  
   508  	h.sendSYNOpts = synOpts
   509  	h.ep.sendSynTCP(h.ep.route, tcpFields{
   510  		id:     h.ep.TransportEndpointInfo.ID,
   511  		ttl:    h.ep.ttl,
   512  		tos:    h.ep.sendTOS,
   513  		flags:  h.flags,
   514  		seq:    h.iss,
   515  		ack:    h.ackNum,
   516  		rcvWnd: h.rcvWnd,
   517  	}, synOpts)
   518  }
   519  
// complete completes the TCP 3-way handshake initiated by h.start().
//
// It loops until h.state becomes handshakeCompleted, blocking on three
// wakers: the resend timer, the endpoint's notification waker and the
// endpoint's new-segment waker. h.ep.mu is released only while blocked (and
// while waiting for an undrain) and is held again whenever complete returns.
//
// It returns nil once the handshake has completed. Otherwise it returns the
// error that ended the handshake: the resend timer's error once the backoff
// exceeds MaxRTO, ErrAborted on a close/abort notification,
// ErrConnectionReset on a shutdown notification, the endpoint's last error
// (or ErrConnectionAborted if it was cleared) on an error notification, or
// any error produced while handling a segment.
// +checklocks:h.ep.mu
func (h *handshake) complete() tcpip.Error {
	// Set up the wakers.
	var s sleep.Sleeper
	resendWaker := sleep.Waker{}
	s.AddWaker(&resendWaker)
	s.AddWaker(&h.ep.notificationWaker)
	s.AddWaker(&h.ep.newSegmentWaker)
	defer s.Done()

	// Initialize the resend timer. It asserts resendWaker when it fires.
	timer, err := newBackoffTimer(h.ep.stack.Clock(), InitialRTO, MaxRTO, resendWaker.Assert)
	if err != nil {
		return err
	}
	defer timer.stop()
	for h.state != handshakeCompleted {
		// Unlock before blocking, and reacquire again afterwards (h.ep.mu is held
		// throughout handshake processing).
		h.ep.mu.Unlock()
		w := s.Fetch(true /* block */)
		h.ep.mu.Lock()
		switch w {
		case &resendWaker:
			// Re-arm the timer with a doubled timeout; this fails once the
			// timeout would exceed MaxRTO, which ends the handshake.
			if err := timer.reset(); err != nil {
				return err
			}
			// Resend the SYN/SYN-ACK only if the following conditions hold.
			//  - It's an active handshake (deferAccept does not apply)
			//  - It's a passive handshake and we have not yet got the final-ACK.
			//  - It's a passive handshake and we got an ACK but deferAccept is
			//    enabled and we are now past the deferAccept duration.
			// The last is required to provide a way for the peer to complete
			// the connection with another ACK or data (as ACKs are never
			// retransmitted on their own).
			if h.active || !h.acked || h.deferAccept != 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept {
				h.ep.sendSynTCP(h.ep.route, tcpFields{
					id:     h.ep.TransportEndpointInfo.ID,
					ttl:    h.ep.ttl,
					tos:    h.ep.sendTOS,
					flags:  h.flags,
					seq:    h.iss,
					ack:    h.ackNum,
					rcvWnd: h.rcvWnd,
				}, h.sendSYNOpts)
				// If we have ever retransmitted the SYN-ACK or
				// SYN segment, we should only measure RTT if
				// TS option is present.
				h.sampleRTTWithTSOnly = true
			}

		case &h.ep.notificationWaker:
			n := h.ep.fetchNotifications()
			if (n&notifyClose)|(n&notifyAbort) != 0 {
				return &tcpip.ErrAborted{}
			}
			if n&notifyShutdown != 0 {
				return &tcpip.ErrConnectionReset{}
			}
			if n&notifyDrain != 0 {
				// Handle everything already queued before reporting
				// the drain as done.
				for !h.ep.segmentQueue.empty() {
					s := h.ep.segmentQueue.dequeue()
					err := h.handleSegment(s)
					s.decRef()
					if err != nil {
						return err
					}
					if h.state == handshakeCompleted {
						return nil
					}
				}
				// Signal that the drain is done, then wait (without
				// holding the lock) for the undrain signal.
				close(h.ep.drainDone)
				h.ep.mu.Unlock()
				<-h.ep.undrain
				h.ep.mu.Lock()
			}
			// Check for any ICMP errors notified to us.
			if n&notifyError != 0 {
				if err := h.ep.lastErrorLocked(); err != nil {
					return err
				}
				// Flag the handshake failure as aborted if the lastError is
				// cleared because of a socket layer call.
				return &tcpip.ErrConnectionAborted{}
			}
		case &h.ep.newSegmentWaker:
			if err := h.processSegments(); err != nil {
				return err
			}
		}
	}

	return nil
}
   615  
   616  // transitionToStateEstablisedLocked transitions the endpoint of the handshake
   617  // to an established state given the last segment received from peer. It also
   618  // initializes sender/receiver.
   619  func (h *handshake) transitionToStateEstablishedLocked(s *segment) {
   620  	// Transfer handshake state to TCP connection. We disable
   621  	// receive window scaling if the peer doesn't support it
   622  	// (indicated by a negative send window scale).
   623  	h.ep.snd = newSender(h.ep, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
   624  
   625  	now := h.ep.stack.Clock().NowMonotonic()
   626  
   627  	var rtt time.Duration
   628  	if h.ep.SendTSOk && s.parsedOptions.TSEcr != 0 {
   629  		rtt = h.ep.elapsed(now, s.parsedOptions.TSEcr)
   630  	}
   631  	if !h.sampleRTTWithTSOnly && rtt == 0 {
   632  		rtt = now.Sub(h.startTime)
   633  	}
   634  
   635  	if rtt > 0 {
   636  		h.ep.snd.updateRTO(rtt)
   637  	}
   638  
   639  	h.ep.rcvQueueInfo.rcvQueueMu.Lock()
   640  	h.ep.rcv = newReceiver(h.ep, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale())
   641  	// Bootstrap the auto tuning algorithm. Starting at zero will
   642  	// result in a really large receive window after the first auto
   643  	// tuning adjustment.
   644  	h.ep.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd)
   645  	h.ep.rcvQueueInfo.rcvQueueMu.Unlock()
   646  
   647  	h.ep.setEndpointState(StateEstablished)
   648  }
   649  
// backoffTimer wraps a timer whose timeout is doubled on every reset, failing
// once the timeout would exceed a maximum.
//
// timeout is the current timeout value (doubled by each reset), maxTimeout is
// the largest timeout the timer may be armed with, and t is the underlying
// timer.
type backoffTimer struct {
	timeout    time.Duration
	maxTimeout time.Duration
	t          tcpip.Timer
}
   655  
   656  func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) {
   657  	if timeout > maxTimeout {
   658  		return nil, &tcpip.ErrTimeout{}
   659  	}
   660  	bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout}
   661  	bt.t = clock.AfterFunc(timeout, f)
   662  	return bt, nil
   663  }
   664  
   665  func (bt *backoffTimer) reset() tcpip.Error {
   666  	bt.timeout *= 2
   667  	if bt.timeout > bt.maxTimeout {
   668  		return &tcpip.ErrTimeout{}
   669  	}
   670  	bt.t.Reset(bt.timeout)
   671  	return nil
   672  }
   673  
// stop stops the underlying timer.
func (bt *backoffTimer) stop() {
	bt.t.Stop()
}
   677  
   678  func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
   679  	synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck))
   680  	if synOpts.TS {
   681  		s.parsedOptions.TSVal = synOpts.TSVal
   682  		s.parsedOptions.TSEcr = synOpts.TSEcr
   683  	}
   684  	return synOpts
   685  }
   686  
   687  var optionPool = sync.Pool{
   688  	New: func() interface{} {
   689  		return &[maxOptionSize]byte{}
   690  	},
   691  }
   692  
   693  func getOptions() []byte {
   694  	return (*optionPool.Get().(*[maxOptionSize]byte))[:]
   695  }
   696  
   697  func putOptions(options []byte) {
   698  	// Reslice to full capacity.
   699  	optionPool.Put(optionsToArray(options))
   700  }
   701  
   702  func makeSynOptions(opts header.TCPSynOptions) []byte {
   703  	// Emulate linux option order. This is as follows:
   704  	//
   705  	// if md5: NOP NOP MD5SIG 18 md5sig(16)
   706  	// if mss: MSS 4 mss(2)
   707  	// if ts and sack_advertise:
   708  	//	SACK 2 TIMESTAMP 2 timestamp(8)
   709  	// elif ts: NOP NOP TIMESTAMP 10 timestamp(8)
   710  	// elif sack: NOP NOP SACK 2
   711  	// if wscale: NOP WINDOW 3 ws(1)
   712  	// if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8))
   713  	//	[for each block] start_seq(4) end_seq(4)
   714  	// if fastopen_cookie:
   715  	//	if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2)
   716  	// 	else: FASTOPEN (2 + len(cookie))
   717  	//	cookie(variable) [padding to four bytes]
   718  	//
   719  	options := getOptions()
   720  
   721  	// Always encode the mss.
   722  	offset := header.EncodeMSSOption(uint32(opts.MSS), options)
   723  
   724  	// Special ordering is required here. If both TS and SACK are enabled,
   725  	// then the SACK option precedes TS, with no padding. If they are
   726  	// enabled individually, then we see padding before the option.
   727  	if opts.TS && opts.SACKPermitted {
   728  		offset += header.EncodeSACKPermittedOption(options[offset:])
   729  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   730  	} else if opts.TS {
   731  		offset += header.EncodeNOP(options[offset:])
   732  		offset += header.EncodeNOP(options[offset:])
   733  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   734  	} else if opts.SACKPermitted {
   735  		offset += header.EncodeNOP(options[offset:])
   736  		offset += header.EncodeNOP(options[offset:])
   737  		offset += header.EncodeSACKPermittedOption(options[offset:])
   738  	}
   739  
   740  	// Initialize the WS option.
   741  	if opts.WS >= 0 {
   742  		offset += header.EncodeNOP(options[offset:])
   743  		offset += header.EncodeWSOption(opts.WS, options[offset:])
   744  	}
   745  
   746  	// Padding to the end; note that this never apply unless we add a
   747  	// fastopen option, we always expect the offset to remain the same.
   748  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   749  		panic("unexpected option encoding")
   750  	}
   751  
   752  	return options[:offset]
   753  }
   754  
// tcpFields is a struct to carry different parameters required by the
// send*TCP variant functions below.
//
// id supplies the local and remote ports written into the TCP header; ttl and
// tos are passed to the network layer (a zero ttl is replaced by the route's
// default); flags, seq, ack and rcvWnd are encoded into the TCP header; opts
// holds the already-encoded TCP options copied after the fixed header; txHash
// is the hash assigned to outgoing packets.
type tcpFields struct {
	id     stack.TransportEndpointID
	ttl    uint8
	tos    uint8
	flags  header.TCPFlags
	seq    seqnum.Value
	ack    seqnum.Value
	rcvWnd seqnum.Size
	opts   []byte
	txHash uint32
}
   768  
   769  func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error {
   770  	tf.opts = makeSynOptions(opts)
   771  	// We ignore SYN send errors and let the callers re-attempt send.
   772  	if err := e.sendTCP(r, tf, buffer.VectorisedView{}, stack.GSO{}); err != nil {
   773  		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
   774  	}
   775  	putOptions(tf.opts)
   776  	return nil
   777  }
   778  
   779  func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO) tcpip.Error {
   780  	tf.txHash = e.txHash
   781  	if err := sendTCP(r, tf, data, gso, e.owner); err != nil {
   782  		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
   783  		return err
   784  	}
   785  	e.stats.SegmentsSent.Increment()
   786  	return nil
   787  }
   788  
   789  func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) {
   790  	optLen := len(tf.opts)
   791  	tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen))
   792  	pkt.TransportProtocolNumber = header.TCPProtocolNumber
   793  	tcp.Encode(&header.TCPFields{
   794  		SrcPort:    tf.id.LocalPort,
   795  		DstPort:    tf.id.RemotePort,
   796  		SeqNum:     uint32(tf.seq),
   797  		AckNum:     uint32(tf.ack),
   798  		DataOffset: uint8(header.TCPMinimumSize + optLen),
   799  		Flags:      tf.flags,
   800  		WindowSize: uint16(tf.rcvWnd),
   801  	})
   802  	copy(tcp[header.TCPMinimumSize:], tf.opts)
   803  
   804  	xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size()))
   805  	// Only calculate the checksum if offloading isn't supported.
   806  	if gso.Type != stack.GSONone && gso.NeedsCsum {
   807  		// This is called CHECKSUM_PARTIAL in the Linux kernel. We
   808  		// calculate a checksum of the pseudo-header and save it in the
   809  		// TCP header, then the kernel calculate a checksum of the
   810  		// header and data and get the right sum of the TCP packet.
   811  		tcp.SetChecksum(xsum)
   812  	} else if r.RequiresTXTransportChecksum() {
   813  		xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum())
   814  		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
   815  	}
   816  }
   817  
   818  func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error {
   819  	// We need to shallow clone the VectorisedView here as ReadToView will
   820  	// split the VectorisedView and Trim underlying views as it splits. Not
   821  	// doing the clone here will cause the underlying views of data itself
   822  	// to be altered.
   823  	data = data.Clone(nil)
   824  
   825  	optLen := len(tf.opts)
   826  	if tf.rcvWnd > math.MaxUint16 {
   827  		tf.rcvWnd = math.MaxUint16
   828  	}
   829  
   830  	mss := int(gso.MSS)
   831  	n := (data.Size() + mss - 1) / mss
   832  
   833  	size := data.Size()
   834  	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
   835  	var pkts stack.PacketBufferList
   836  	for i := 0; i < n; i++ {
   837  		packetSize := mss
   838  		if packetSize > size {
   839  			packetSize = size
   840  		}
   841  		size -= packetSize
   842  		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   843  			ReserveHeaderBytes: hdrSize,
   844  		})
   845  		pkt.Hash = tf.txHash
   846  		pkt.Owner = owner
   847  		pkt.Data().ReadFromVV(&data, packetSize)
   848  		buildTCPHdr(r, tf, pkt, gso)
   849  		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
   850  		pkt.GSOOptions = gso
   851  		pkts.PushBack(pkt)
   852  	}
   853  	defer pkts.DecRef()
   854  
   855  	if tf.ttl == 0 {
   856  		tf.ttl = r.DefaultTTL()
   857  	}
   858  	sent, err := r.WritePackets(pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos})
   859  	if err != nil {
   860  		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
   861  	}
   862  	r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent))
   863  	return err
   864  }
   865  
   866  // sendTCP sends a TCP segment with the provided options via the provided
   867  // network endpoint and under the provided identity.
   868  func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error {
   869  	optLen := len(tf.opts)
   870  	if tf.rcvWnd > math.MaxUint16 {
   871  		tf.rcvWnd = math.MaxUint16
   872  	}
   873  
   874  	if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
   875  		return sendTCPBatch(r, tf, data, gso, owner)
   876  	}
   877  
   878  	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   879  		ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen,
   880  		Data:               data,
   881  	})
   882  	defer pkt.DecRef()
   883  	pkt.GSOOptions = gso
   884  	pkt.Hash = tf.txHash
   885  	pkt.Owner = owner
   886  	buildTCPHdr(r, tf, pkt, gso)
   887  
   888  	if tf.ttl == 0 {
   889  		tf.ttl = r.DefaultTTL()
   890  	}
   891  	if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil {
   892  		r.Stats().TCP.SegmentSendErrors.Increment()
   893  		return err
   894  	}
   895  	r.Stats().TCP.SegmentsSent.Increment()
   896  	if (tf.flags & header.TCPFlagRst) != 0 {
   897  		r.Stats().TCP.ResetsSent.Increment()
   898  	}
   899  	return nil
   900  }
   901  
   902  // makeOptions makes an options slice.
   903  func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
   904  	options := getOptions()
   905  	offset := 0
   906  
   907  	// N.B. the ordering here matches the ordering used by Linux internally
   908  	// and described in the raw makeOptions function. We don't include
   909  	// unnecessary cases here (post connection.)
   910  	if e.SendTSOk {
   911  		// Embed the timestamp if timestamp has been enabled.
   912  		//
   913  		// We only use the lower 32 bits of the unix time in
   914  		// milliseconds. This is similar to what Linux does where it
   915  		// uses the lower 32 bits of the jiffies value in the tsVal
   916  		// field of the timestamp option.
   917  		//
   918  		// Further, RFC7323 section-5.4 recommends millisecond
   919  		// resolution as the lowest recommended resolution for the
   920  		// timestamp clock.
   921  		//
   922  		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
   923  		offset += header.EncodeNOP(options[offset:])
   924  		offset += header.EncodeNOP(options[offset:])
   925  		offset += header.EncodeTSOption(e.tsValNow(), e.recentTimestamp(), options[offset:])
   926  	}
   927  	if e.SACKPermitted && len(sackBlocks) > 0 {
   928  		offset += header.EncodeNOP(options[offset:])
   929  		offset += header.EncodeNOP(options[offset:])
   930  		offset += header.EncodeSACKBlocks(sackBlocks, options[offset:])
   931  	}
   932  
   933  	// We expect the above to produce an aligned offset.
   934  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   935  		panic("unexpected option encoding")
   936  	}
   937  
   938  	return options[:offset]
   939  }
   940  
   941  // sendRaw sends a TCP segment to the endpoint's peer.
   942  func (e *endpoint) sendRaw(data buffer.VectorisedView, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error {
   943  	var sackBlocks []header.SACKBlock
   944  	if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) {
   945  		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
   946  	}
   947  	options := e.makeOptions(sackBlocks)
   948  	err := e.sendTCP(e.route, tcpFields{
   949  		id:     e.TransportEndpointInfo.ID,
   950  		ttl:    e.ttl,
   951  		tos:    e.sendTOS,
   952  		flags:  flags,
   953  		seq:    seq,
   954  		ack:    ack,
   955  		rcvWnd: rcvWnd,
   956  		opts:   options,
   957  	}, data, e.gso)
   958  	putOptions(options)
   959  	return err
   960  }
   961  
   962  // Precondition: e.mu must be locked.
   963  func (e *endpoint) sendData(next *segment) {
   964  	// Initialize the next segment to write if it's currently nil.
   965  	if e.snd.writeNext == nil {
   966  		if next == nil {
   967  			return
   968  		}
   969  		e.snd.writeNext = next
   970  	}
   971  
   972  	// Push out any new packets.
   973  	e.snd.sendData()
   974  }
   975  
   976  // resetConnectionLocked puts the endpoint in an error state with the given
   977  // error code and sends a RST if and only if the error is not ErrConnectionReset
   978  // indicating that the connection is being reset due to receiving a RST. This
   979  // method must only be called from the protocol goroutine.
   980  func (e *endpoint) resetConnectionLocked(err tcpip.Error) {
   981  	// Only send a reset if the connection is being aborted for a reason
   982  	// other than receiving a reset.
   983  	e.setEndpointState(StateError)
   984  	e.hardError = err
   985  	switch err.(type) {
   986  	case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout:
   987  	default:
   988  		// The exact sequence number to be used for the RST is the same as the
   989  		// one used by Linux. We need to handle the case of window being shrunk
   990  		// which can cause sndNxt to be outside the acceptable window on the
   991  		// receiver.
   992  		//
   993  		// See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more
   994  		// information.
   995  		sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd)
   996  		resetSeqNum := sndWndEnd
   997  		if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) {
   998  			resetSeqNum = e.snd.SndNxt
   999  		}
  1000  		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0)
  1001  	}
  1002  }
  1003  
  1004  // completeWorkerLocked is called by the worker goroutine when it's about to
  1005  // exit.
  1006  func (e *endpoint) completeWorkerLocked() {
  1007  	// Worker is terminating(either due to moving to
  1008  	// CLOSED or ERROR state, ensure we release all
  1009  	// registrations port reservations even if the socket
  1010  	// itself is not yet closed by the application.
  1011  	e.workerRunning = false
  1012  	if e.workerCleanup {
  1013  		e.cleanupLocked()
  1014  	}
  1015  }
  1016  
  1017  // transitionToStateCloseLocked ensures that the endpoint is
  1018  // cleaned up from the transport demuxer, "before" moving to
  1019  // StateClose. This will ensure that no packet will be
  1020  // delivered to this endpoint from the demuxer when the endpoint
  1021  // is transitioned to StateClose.
  1022  func (e *endpoint) transitionToStateCloseLocked() {
  1023  	s := e.EndpointState()
  1024  	if s == StateClose {
  1025  		return
  1026  	}
  1027  
  1028  	if s.connected() {
  1029  		e.stack.Stats().TCP.CurrentConnected.Decrement()
  1030  		e.stack.Stats().TCP.EstablishedClosed.Increment()
  1031  	}
  1032  
  1033  	// Mark the endpoint as fully closed for reads/writes.
  1034  	e.cleanupLocked()
  1035  	e.setEndpointState(StateClose)
  1036  }
  1037  
  1038  // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
  1039  // segment to any other endpoint other than the current one. This is called
  1040  // only when the endpoint is in StateClose and we want to deliver the segment
  1041  // to any other listening endpoint. We reply with RST if we cannot find one.
  1042  func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
  1043  	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.nicID)
  1044  	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
  1045  		// Dual-stack socket, try IPv4.
  1046  		ep = e.stack.FindTransportEndpoint(
  1047  			header.IPv4ProtocolNumber,
  1048  			e.TransProto,
  1049  			e.TransportEndpointInfo.ID,
  1050  			s.nicID,
  1051  		)
  1052  	}
  1053  	if ep == nil {
  1054  		replyWithReset(e.stack, s, stack.DefaultTOS, 0 /* ttl */)
  1055  		s.decRef()
  1056  		return
  1057  	}
  1058  
  1059  	if e == ep {
  1060  		panic("current endpoint not removed from demuxer, enqueing segments to itself")
  1061  	}
  1062  
  1063  	if ep := ep.(*endpoint); ep.enqueueSegment(s) {
  1064  		ep.newSegmentWaker.Assert()
  1065  	}
  1066  }
  1067  
  1068  // Drain segment queue from the endpoint and try to re-match the segment to a
  1069  // different endpoint. This is used when the current endpoint is transitioned to
  1070  // StateClose and has been unregistered from the transport demuxer.
  1071  func (e *endpoint) drainClosingSegmentQueue() {
  1072  	for {
  1073  		s := e.segmentQueue.dequeue()
  1074  		if s == nil {
  1075  			break
  1076  		}
  1077  
  1078  		e.tryDeliverSegmentFromClosedEndpoint(s)
  1079  	}
  1080  }
  1081  
  1082  func (e *endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) {
  1083  	if e.rcv.acceptable(s.sequenceNumber, 0) {
  1084  		// RFC 793, page 37 states that "in all states
  1085  		// except SYN-SENT, all reset (RST) segments are
  1086  		// validated by checking their SEQ-fields." So
  1087  		// we only process it if it's acceptable.
  1088  		switch e.EndpointState() {
  1089  		// In case of a RST in CLOSE-WAIT linux moves
  1090  		// the socket to closed state with an error set
  1091  		// to indicate EPIPE.
  1092  		//
  1093  		// Technically this seems to be at odds w/ RFC.
  1094  		// As per https://tools.ietf.org/html/rfc793#section-2.7
  1095  		// page 69 the behavior for a segment arriving
  1096  		// w/ RST bit set in CLOSE-WAIT is inlined below.
  1097  		//
  1098  		//  ESTABLISHED
  1099  		//  FIN-WAIT-1
  1100  		//  FIN-WAIT-2
  1101  		//  CLOSE-WAIT
  1102  
  1103  		//  If the RST bit is set then, any outstanding RECEIVEs and
  1104  		//  SEND should receive "reset" responses. All segment queues
  1105  		//  should be flushed.  Users should also receive an unsolicited
  1106  		//  general "connection reset" signal. Enter the CLOSED state,
  1107  		//  delete the TCB, and return.
  1108  		case StateCloseWait:
  1109  			e.transitionToStateCloseLocked()
  1110  			e.hardError = &tcpip.ErrAborted{}
  1111  			e.notifyProtocolGoroutine(notifyTickleWorker)
  1112  			return false, nil
  1113  		default:
  1114  			// RFC 793, page 37 states that "in all states
  1115  			// except SYN-SENT, all reset (RST) segments are
  1116  			// validated by checking their SEQ-fields." So
  1117  			// we only process it if it's acceptable.
  1118  
  1119  			// Notify protocol goroutine. This is required when
  1120  			// handleSegment is invoked from the processor goroutine
  1121  			// rather than the worker goroutine.
  1122  			e.notifyProtocolGoroutine(notifyResetByPeer)
  1123  			return false, &tcpip.ErrConnectionReset{}
  1124  		}
  1125  	}
  1126  	return true, nil
  1127  }
  1128  
  1129  // handleSegments processes all inbound segments.
  1130  //
  1131  // Precondition: e.mu must be held.
  1132  func (e *endpoint) handleSegmentsLocked(fastPath bool) tcpip.Error {
  1133  	checkRequeue := true
  1134  	for i := 0; i < maxSegmentsPerWake; i++ {
  1135  		if state := e.EndpointState(); state.closed() || state == StateTimeWait {
  1136  			return nil
  1137  		}
  1138  		s := e.segmentQueue.dequeue()
  1139  		if s == nil {
  1140  			checkRequeue = false
  1141  			break
  1142  		}
  1143  
  1144  		cont, err := e.handleSegmentLocked(s)
  1145  		s.decRef()
  1146  		if err != nil {
  1147  			return err
  1148  		}
  1149  		if !cont {
  1150  			return nil
  1151  		}
  1152  	}
  1153  
  1154  	// When fastPath is true we don't want to wake up the worker
  1155  	// goroutine. If the endpoint has more segments to process the
  1156  	// dispatcher will call handleSegments again anyway.
  1157  	if !fastPath && checkRequeue && !e.segmentQueue.empty() {
  1158  		e.newSegmentWaker.Assert()
  1159  	}
  1160  
  1161  	// Send an ACK for all processed packets if needed.
  1162  	if e.rcv.RcvNxt != e.snd.MaxSentAck {
  1163  		e.snd.sendAck()
  1164  	}
  1165  
  1166  	e.resetKeepaliveTimer(true /* receivedData */)
  1167  
  1168  	return nil
  1169  }
  1170  
  1171  // Precondition: e.mu must be held.
  1172  func (e *endpoint) probeSegmentLocked() {
  1173  	if fn := e.probe; fn != nil {
  1174  		fn(e.completeStateLocked())
  1175  	}
  1176  }
  1177  
  1178  // handleSegment handles a given segment and notifies the worker goroutine if
  1179  // if the connection should be terminated.
  1180  //
  1181  // Precondition: e.mu must be held.
  1182  func (e *endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) {
  1183  	// Invoke the tcp probe if installed. The tcp probe function will update
  1184  	// the TCPEndpointState after the segment is processed.
  1185  	defer e.probeSegmentLocked()
  1186  
  1187  	if s.flags.Contains(header.TCPFlagRst) {
  1188  		if ok, err := e.handleReset(s); !ok {
  1189  			return false, err
  1190  		}
  1191  	} else if s.flags.Contains(header.TCPFlagSyn) {
  1192  		// See: https://tools.ietf.org/html/rfc5961#section-4.1
  1193  		//   1) If the SYN bit is set, irrespective of the sequence number, TCP
  1194  		//    MUST send an ACK (also referred to as challenge ACK) to the remote
  1195  		//    peer:
  1196  		//
  1197  		//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
  1198  		//
  1199  		//    After sending the acknowledgment, TCP MUST drop the unacceptable
  1200  		//    segment and stop processing further.
  1201  		//
  1202  		// By sending an ACK, the remote peer is challenged to confirm the loss
  1203  		// of the previous connection and the request to start a new connection.
  1204  		// A legitimate peer, after restart, would not have a TCB in the
  1205  		// synchronized state.  Thus, when the ACK arrives, the peer should send
  1206  		// a RST segment back with the sequence number derived from the ACK
  1207  		// field that caused the RST.
  1208  
  1209  		// This RST will confirm that the remote peer has indeed closed the
  1210  		// previous connection.  Upon receipt of a valid RST, the local TCP
  1211  		// endpoint MUST terminate its connection.  The local TCP endpoint
  1212  		// should then rely on SYN retransmission from the remote end to
  1213  		// re-establish the connection.
  1214  		e.snd.maybeSendOutOfWindowAck(s)
  1215  	} else if s.flags.Contains(header.TCPFlagAck) {
  1216  		// Patch the window size in the segment according to the
  1217  		// send window scale.
  1218  		s.window <<= e.snd.SndWndScale
  1219  
  1220  		// RFC 793, page 41 states that "once in the ESTABLISHED
  1221  		// state all segments must carry current acknowledgment
  1222  		// information."
  1223  		drop, err := e.rcv.handleRcvdSegment(s)
  1224  		if err != nil {
  1225  			return false, err
  1226  		}
  1227  		if drop {
  1228  			return true, nil
  1229  		}
  1230  
  1231  		// Now check if the received segment has caused us to transition
  1232  		// to a CLOSED state, if yes then terminate processing and do
  1233  		// not invoke the sender.
  1234  		state := e.EndpointState()
  1235  		if state == StateClose {
  1236  			// When we get into StateClose while processing from the queue,
  1237  			// return immediately and let the protocolMainloop handle it.
  1238  			//
  1239  			// We can reach StateClose only while processing a previous segment
  1240  			// or a notification from the protocolMainLoop (caller goroutine).
  1241  			// This means that with this return, the segment dequeue below can
  1242  			// never occur on a closed endpoint.
  1243  			return false, nil
  1244  		}
  1245  
  1246  		e.snd.handleRcvdSegment(s)
  1247  	}
  1248  
  1249  	return true, nil
  1250  }
  1251  
  1252  // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
  1253  // keepalive packets periodically when the connection is idle. If we don't hear
  1254  // from the other side after a number of tries, we terminate the connection.
  1255  func (e *endpoint) keepaliveTimerExpired() tcpip.Error {
  1256  	userTimeout := e.userTimeout
  1257  
  1258  	e.keepalive.Lock()
  1259  	if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() {
  1260  		e.keepalive.Unlock()
  1261  		return nil
  1262  	}
  1263  
  1264  	// If a userTimeout is set then abort the connection if it is
  1265  	// exceeded.
  1266  	if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 {
  1267  		e.keepalive.Unlock()
  1268  		e.stack.Stats().TCP.EstablishedTimedout.Increment()
  1269  		return &tcpip.ErrTimeout{}
  1270  	}
  1271  
  1272  	if e.keepalive.unacked >= e.keepalive.count {
  1273  		e.keepalive.Unlock()
  1274  		e.stack.Stats().TCP.EstablishedTimedout.Increment()
  1275  		return &tcpip.ErrTimeout{}
  1276  	}
  1277  
  1278  	// RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with
  1279  	// seg.seq = snd.nxt-1.
  1280  	e.keepalive.unacked++
  1281  	e.keepalive.Unlock()
  1282  	e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.SndNxt-1)
  1283  	e.resetKeepaliveTimer(false)
  1284  	return nil
  1285  }
  1286  
  1287  // resetKeepaliveTimer restarts or stops the keepalive timer, depending on
  1288  // whether it is enabled for this endpoint.
  1289  func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
  1290  	e.keepalive.Lock()
  1291  	if receivedData {
  1292  		e.keepalive.unacked = 0
  1293  	}
  1294  	// Start the keepalive timer IFF it's enabled and there is no pending
  1295  	// data to send.
  1296  	if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt {
  1297  		e.keepalive.timer.disable()
  1298  		e.keepalive.Unlock()
  1299  		return
  1300  	}
  1301  	if e.keepalive.unacked > 0 {
  1302  		e.keepalive.timer.enable(e.keepalive.interval)
  1303  	} else {
  1304  		e.keepalive.timer.enable(e.keepalive.idle)
  1305  	}
  1306  	e.keepalive.Unlock()
  1307  }
  1308  
  1309  // disableKeepaliveTimer stops the keepalive timer.
  1310  func (e *endpoint) disableKeepaliveTimer() {
  1311  	e.keepalive.Lock()
  1312  	e.keepalive.timer.disable()
  1313  	e.keepalive.Unlock()
  1314  }
  1315  
// protocolMainLoopDone is called at the end of protocolMainLoop. It tears
// down the sender's timers, stops closeTimer if one was created, completes
// the worker, releases e.mu and finally drains the segment queue and wakes
// all waiters.
//
// e.mu must be held on entry; it is unlocked by this function.
// +checklocksrelease:e.mu
func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer) {
	// The sender may be nil (it is nil-checked here), in which case there
	// are no sender timers to clean up.
	if e.snd != nil {
		e.snd.resendTimer.cleanup()
		e.snd.probeTimer.cleanup()
		e.snd.reorderTimer.cleanup()
	}

	if closeTimer != nil {
		closeTimer.Stop()
	}

	e.completeWorkerLocked()

	// Signal drain completion if a drain channel is installed.
	if e.drainDone != nil {
		close(e.drainDone)
	}

	e.mu.Unlock()

	// Done after releasing e.mu: re-route or reset any segments still
	// sitting in this endpoint's queue.
	e.drainClosingSegmentQueue()

	// When the protocol loop exits we should wake up our waiters.
	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
}
  1342  
// handleWakeup handles a wakeup event while connected.
//
// w is the waker that fired; it is matched against the endpoint's own wakers
// and closeWaker. closeTimer is passed by pointer so that a FIN_WAIT2 timer
// created here is stored in the caller's variable. A non-nil return value is
// the error with which the caller should abort the connection.
//
// It panics if w is not one of the known wakers.
//
// +checklocks:e.mu
func (e *endpoint) handleWakeup(w, closeWaker *sleep.Waker, closeTimer *tcpip.Timer) tcpip.Error {
	switch w {
	case &e.sndQueueInfo.sndWaker:
		e.sendData(nil /* next */)
	case &e.newSegmentWaker:
		return e.handleSegmentsLocked(false /* fastPath */)
	case &e.snd.resendWaker:
		// A false return from retransmitTimerExpired is treated as the
		// connection having timed out.
		if !e.snd.retransmitTimerExpired() {
			e.stack.Stats().TCP.EstablishedTimedout.Increment()
			return &tcpip.ErrTimeout{}
		}
	case closeWaker:
		// The TCP-FIN-WAIT2 timeout was hit, so the socket is being
		// closed. Just mark the socket as closed.
		e.transitionToStateCloseLocked()
		e.workerCleanup = true
	case &e.snd.probeWaker:
		return e.snd.probeTimerExpired()
	case &e.keepalive.waker:
		return e.keepaliveTimerExpired()
	case &e.notificationWaker:
		// n is a bitmask of pending notifications; each bit is checked
		// independently below.
		n := e.fetchNotifications()
		if n&notifyNonZeroReceiveWindow != 0 {
			e.rcv.nonZeroWindow()
		}

		if n&notifyMTUChanged != 0 {
			// Snapshot and reset the packet-too-big count under the send
			// queue lock, then apply the new MTU outside of it.
			e.sndQueueInfo.sndQueueMu.Lock()
			count := e.sndQueueInfo.PacketTooBigCount
			e.sndQueueInfo.PacketTooBigCount = 0
			mtu := e.sndQueueInfo.SndMTU
			e.sndQueueInfo.sndQueueMu.Unlock()

			e.snd.updateMaxPayloadSize(mtu, count)
		}

		if n&notifyReset != 0 || n&notifyAbort != 0 {
			return &tcpip.ErrConnectionAborted{}
		}

		if n&notifyResetByPeer != 0 {
			return &tcpip.ErrConnectionReset{}
		}

		if n&notifyClose != 0 && e.closed {
			switch e.EndpointState() {
			case StateEstablished:
				// Perform full shutdown if the endpoint is
				// still established. This can occur when
				// notifyClose was asserted just before
				// becoming established.
				e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
			case StateFinWait2:
				// The socket has been closed and we are in
				// FIN_WAIT2 so start the FIN_WAIT2 timer,
				// unless one is already running. When it
				// fires it asserts closeWaker.
				if *closeTimer == nil {
					*closeTimer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, closeWaker.Assert)
				}
			}
		}

		if n&notifyKeepaliveChanged != 0 {
			// The timer could fire in background when the endpoint
			// is drained. That's OK. See above.
			e.resetKeepaliveTimer(true)
		}

		if n&notifyDrain != 0 {
			// Process everything that is queued before signalling
			// that the drain is complete.
			for !e.segmentQueue.empty() {
				if err := e.handleSegmentsLocked(false /* fastPath */); err != nil {
					return err
				}
			}
			if !e.EndpointState().closed() {
				// Only block the worker if the endpoint
				// is not in closed state or error state.
				// e.mu is released while waiting on
				// e.undrain and re-acquired afterwards.
				close(e.drainDone)
				e.mu.Unlock()
				<-e.undrain
				e.mu.Lock()
			}
		}

		// N.B. notifyTickleWorker may be set, but there is no action
		// to take in this case.
	case &e.snd.reorderWaker:
		return e.snd.rc.reorderTimerExpired()
	default:
		panic("unknown waker") // Shouldn't happen.
	}
	return nil
}
  1439  
// protocolMainLoop is the main loop of the TCP protocol. It runs in its own
// goroutine and is responsible for sending segments and handling received
// segments.
//
// If handshake is true the 3-way handshake is completed first; a handshake
// failure puts the endpoint in StateError and ends the loop. wakerInitDone,
// when non-nil, is closed once all wakers have been registered with the
// sleeper.
//
// e.mu is acquired on entry and is released on every exit path via
// protocolMainLoopDone.
func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) {
	var (
		// closeTimer is the FIN_WAIT2 timer; it is created lazily by
		// handleWakeup and asserts closeWaker when it fires.
		closeTimer tcpip.Timer
		closeWaker sleep.Waker
	)

	e.mu.Lock()
	if handshake {
		if err := e.h.complete(); err != nil { // +checklocksforce
			e.lastErrorMu.Lock()
			e.lastError = err
			e.lastErrorMu.Unlock()

			e.setEndpointState(StateError)
			e.hardError = err

			e.workerCleanup = true
			e.protocolMainLoopDone(closeTimer)
			return
		}
	}

	// Reaching this point means that we successfully completed the 3-way
	// handshake with our peer. The current endpoint state could be any state
	// post ESTABLISHED, including CLOSED or ERROR if the endpoint processes a
	// RST from the peer via the dispatcher fast path, before the loop is
	// started.
	if s := e.EndpointState(); !s.connected() {
		switch s {
		case StateClose, StateError:
			// If the endpoint is in CLOSED/ERROR state, sender state has to be
			// initialized if the endpoint was previously established.
			// A nil sender falls through to the panic below.
			if e.snd != nil {
				break
			}
			fallthrough
		default:
			panic("endpoint was not established, current state " + s.String())
		}
	}

	// Completing the 3-way handshake is an indication that the route is valid
	// and the remote is reachable as the only way we can complete a handshake
	// is if our SYN reached the remote and their ACK reached us.
	e.route.ConfirmReachable()

	// If a drain was requested before the loop started, acknowledge it and
	// wait (without holding e.mu) until the endpoint is undrained.
	drained := e.drainDone != nil
	if drained {
		close(e.drainDone)
		e.mu.Unlock()
		<-e.undrain
		e.mu.Lock()
	}

	// Add all wakers.
	var s sleep.Sleeper
	s.AddWaker(&e.sndQueueInfo.sndWaker)
	s.AddWaker(&e.newSegmentWaker)
	s.AddWaker(&e.snd.resendWaker)
	s.AddWaker(&e.snd.probeWaker)
	s.AddWaker(&closeWaker)
	s.AddWaker(&e.keepalive.waker)
	s.AddWaker(&e.notificationWaker)
	s.AddWaker(&e.snd.reorderWaker)

	// Notify the caller that the waker initialization is complete and the
	// endpoint is ready.
	if wakerInitDone != nil {
		close(wakerInitDone)
	}

	// Tell waiters that the endpoint is connected and writable.
	e.waiterQueue.Notify(waiter.WritableEvents)

	// The following assertions and notifications are needed for restored
	// endpoints. Fresh newly created endpoints have empty states and should
	// not invoke any.
	if !e.segmentQueue.empty() {
		e.newSegmentWaker.Assert()
	}

	e.rcvQueueInfo.rcvQueueMu.Lock()
	if !e.rcvQueueInfo.rcvQueue.Empty() {
		e.waiterQueue.Notify(waiter.ReadableEvents)
	}
	e.rcvQueueInfo.rcvQueueMu.Unlock()

	if e.workerCleanup {
		e.notifyProtocolGoroutine(notifyClose)
	}

	// cleanupOnError is run on every error exit from the loop below. A nil
	// err means the endpoint is already in an error state and no reset
	// needs to be issued.
	cleanupOnError := func(err tcpip.Error) {
		e.stack.Stats().TCP.CurrentConnected.Decrement()
		e.workerCleanup = true
		if err != nil {
			e.resetConnectionLocked(err)
		}
	}

	// Main loop. Handle segments until both send and receive ends of the
	// connection have completed.
loop:
	for {
		switch e.EndpointState() {
		case StateTimeWait, StateClose, StateError:
			break loop
		}

		// Block for the next event without holding e.mu.
		e.mu.Unlock()
		w := s.Fetch(true /* block */)
		e.mu.Lock()

		// We need to double check here because the notification may be
		// stale by the time we got around to processing it.
		switch e.EndpointState() {
		case StateError:
			// If the endpoint has already transitioned to an ERROR
			// state just pass nil here as any reset that may need
			// to be sent etc should already have been done and we
			// just want to terminate the loop and cleanup the
			// endpoint.
			cleanupOnError(nil)
			e.protocolMainLoopDone(closeTimer)
			return
		case StateTimeWait:
			fallthrough
		case StateClose:
			break loop
		default:
			if err := e.handleWakeup(w, &closeWaker, &closeTimer); err != nil {
				cleanupOnError(err)
				e.protocolMainLoopDone(closeTimer)
				return
			}
		}
	}

	// reuseTW, when set by doTimeWait, is run after the endpoint has been
	// closed and the loop torn down (see the end of this function).
	var reuseTW func()
	if e.EndpointState() == StateTimeWait {
		// Disable close timer as we now entering real TIME_WAIT.
		if closeTimer != nil {
			closeTimer.Stop()
		}
		// Mark the current sleeper done so as to free all associated
		// wakers.
		s.Done()
		// Wake up any waiters before we enter TIME_WAIT.
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
		e.workerCleanup = true
		reuseTW = e.doTimeWait()
	}

	// Handle any StateError transition from StateTimeWait.
	if e.EndpointState() == StateError {
		cleanupOnError(nil)
		e.protocolMainLoopDone(closeTimer)
		return
	}

	e.transitionToStateCloseLocked()

	e.protocolMainLoopDone(closeTimer)

	// A new SYN was received during TIME_WAIT and we need to abort
	// the timewait and redirect the segment to the listener queue
	if reuseTW != nil {
		reuseTW()
	}
}
  1612  
  1613  // handleTimeWaitSegments processes segments received during TIME_WAIT
  1614  // state.
  1615  func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
  1616  	checkRequeue := true
  1617  	for i := 0; i < maxSegmentsPerWake; i++ {
  1618  		s := e.segmentQueue.dequeue()
  1619  		if s == nil {
  1620  			checkRequeue = false
  1621  			break
  1622  		}
  1623  		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
  1624  		if newSyn {
  1625  			info := e.TransportEndpointInfo
  1626  			newID := info.ID
  1627  			newID.RemoteAddress = ""
  1628  			newID.RemotePort = 0
  1629  			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
  1630  			// If the local address is an IPv4 address then also
  1631  			// look for IPv6 dual stack endpoints that might be
  1632  			// listening on the local address.
  1633  			if newID.LocalAddress.To4() != "" {
  1634  				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
  1635  			}
  1636  			for _, netProto := range netProtos {
  1637  				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.nicID); listenEP != nil {
  1638  					tcpEP := listenEP.(*endpoint)
  1639  					if EndpointState(tcpEP.State()) == StateListen {
  1640  						reuseTW = func() {
  1641  							if !tcpEP.enqueueSegment(s) {
  1642  								s.decRef()
  1643  								return
  1644  							}
  1645  							tcpEP.newSegmentWaker.Assert()
  1646  						}
  1647  						// We explicitly do not decRef
  1648  						// the segment as it's still
  1649  						// valid and being reflected to
  1650  						// a listening endpoint.
  1651  						return false, reuseTW
  1652  					}
  1653  				}
  1654  			}
  1655  		}
  1656  		if extTW {
  1657  			extendTimeWait = true
  1658  		}
  1659  		s.decRef()
  1660  	}
  1661  	if checkRequeue && !e.segmentQueue.empty() {
  1662  		e.newSegmentWaker.Assert()
  1663  	}
  1664  	return extendTimeWait, nil
  1665  }
  1666  
  1667  // doTimeWait is responsible for handling the TCP behaviour once a socket
  1668  // enters the TIME_WAIT state. Optionally it can return a closure that
  1669  // should be executed after releasing the endpoint registrations. This is
  1670  // done in cases where a new SYN is received during TIME_WAIT that carries
  1671  // a sequence number larger than one see on the connection.
  1672  // +checklocks:e.mu
  1673  func (e *endpoint) doTimeWait() (twReuse func()) {
  1674  	// Trigger a 2 * MSL time wait state. During this period
  1675  	// we will drop all incoming segments.
  1676  	// NOTE: On Linux this is not configurable and is fixed at 60 seconds.
  1677  	timeWaitDuration := DefaultTCPTimeWaitTimeout
  1678  
  1679  	// Get the stack wide configuration.
  1680  	var tcpTW tcpip.TCPTimeWaitTimeoutOption
  1681  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
  1682  		timeWaitDuration = time.Duration(tcpTW)
  1683  	}
  1684  
  1685  	var s sleep.Sleeper
  1686  	defer s.Done()
  1687  	s.AddWaker(&e.newSegmentWaker)
  1688  	s.AddWaker(&e.notificationWaker)
  1689  
  1690  	var timeWaitWaker sleep.Waker
  1691  	s.AddWaker(&timeWaitWaker)
  1692  	timeWaitTimer := e.stack.Clock().AfterFunc(timeWaitDuration, timeWaitWaker.Assert)
  1693  	defer timeWaitTimer.Stop()
  1694  
  1695  	for {
  1696  		e.mu.Unlock()
  1697  		w := s.Fetch(true /* block */)
  1698  		e.mu.Lock()
  1699  		switch w {
  1700  		case &e.newSegmentWaker:
  1701  			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
  1702  			if reuseTW != nil {
  1703  				return reuseTW
  1704  			}
  1705  			if extendTimeWait {
  1706  				timeWaitTimer.Reset(timeWaitDuration)
  1707  			}
  1708  		case &e.notificationWaker:
  1709  			n := e.fetchNotifications()
  1710  			if n&notifyAbort != 0 {
  1711  				return nil
  1712  			}
  1713  			if n&notifyDrain != 0 {
  1714  				for !e.segmentQueue.empty() {
  1715  					// Ignore extending TIME_WAIT during a
  1716  					// save. For sockets in TIME_WAIT we just
  1717  					// terminate the TIME_WAIT early.
  1718  					e.handleTimeWaitSegments()
  1719  				}
  1720  				close(e.drainDone)
  1721  				e.mu.Unlock()
  1722  				<-e.undrain
  1723  				e.mu.Lock()
  1724  				return nil
  1725  			}
  1726  		case &timeWaitWaker:
  1727  			return nil
  1728  		}
  1729  	}
  1730  }