github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/connect.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"encoding/binary"
    19  	"math"
    20  	"time"
    21  
    22  	"github.com/SagerNet/gvisor/pkg/sleep"
    23  	"github.com/SagerNet/gvisor/pkg/sync"
    24  	"github.com/SagerNet/gvisor/pkg/tcpip"
    25  	"github.com/SagerNet/gvisor/pkg/tcpip/buffer"
    26  	"github.com/SagerNet/gvisor/pkg/tcpip/hash/jenkins"
    27  	"github.com/SagerNet/gvisor/pkg/tcpip/header"
    28  	"github.com/SagerNet/gvisor/pkg/tcpip/seqnum"
    29  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    30  	"github.com/SagerNet/gvisor/pkg/waiter"
    31  )
    32  
    33  // maxSegmentsPerWake is the maximum number of segments to process in the main
    34  // protocol goroutine per wake-up. Yielding [after this number of segments are
    35  // processed] allows other events to be processed as well (e.g., timeouts,
    36  // resets, etc.).
    37  const maxSegmentsPerWake = 100
    38  
    39  type handshakeState int
    40  
    41  // The following are the possible states of the TCP connection during a 3-way
    42  // handshake. A depiction of the states and transitions can be found in RFC 793,
    43  // page 23.
    44  const (
    45  	handshakeSynSent handshakeState = iota
    46  	handshakeSynRcvd
    47  	handshakeCompleted
    48  )
    49  
    50  // The following are used to set up sleepers.
    51  const (
    52  	wakerForNotification = iota
    53  	wakerForNewSegment
    54  	wakerForResend
    55  )
    56  
    57  const (
    58  	// Maximum space available for options.
    59  	maxOptionSize = 40
    60  )
    61  
// handshake holds the state used during a TCP 3-way handshake.
//
// NOTE: handshake.ep.mu is held during handshake processing. It is released if
// we are going to block and reacquired when we start processing an event.
type handshake struct {
	// ep is the endpoint on whose behalf the handshake runs. listenEP, when
	// non-nil, is the endpoint whose accept queue is checked before a
	// handshake in the SYN-RCVD state is allowed to complete. state is the
	// current handshake phase. active is true for handshakes created by
	// newHandshake and is cleared by resetToSynRcvd. flags and ackNum are
	// the TCP flags and acknowledgment number placed on the SYN/SYN-ACK
	// segments that are sent.
	ep       *endpoint
	listenEP *endpoint
	state    handshakeState
	active   bool
	flags    header.TCPFlags
	ackNum   seqnum.Value

	// iss is the initial send sequence number, as defined in RFC 793.
	iss seqnum.Value

	// rcvWnd is the receive window, as defined in RFC 793.
	rcvWnd seqnum.Size

	// sndWnd is the send window, as defined in RFC 793.
	sndWnd seqnum.Size

	// mss is the maximum segment size received from the peer.
	mss uint16

	// sndWndScale is the send window scale, as defined in RFC 1323. A
	// negative value means no scaling is supported by the peer.
	sndWndScale int

	// rcvWndScale is the receive window scale, as defined in RFC 1323.
	rcvWndScale int

	// startTime is the time at which the first SYN/SYN-ACK was sent.
	startTime tcpip.MonotonicTime

	// deferAccept if non-zero will drop the final ACK for a passive
	// handshake till an ACK segment with data is received or the timeout is
	// hit.
	deferAccept time.Duration

	// acked is true if the final ACK for a 3-way handshake has been
	// received. This is required to stop retransmitting the original
	// SYN-ACK when deferAccept is enabled.
	acked bool

	// sendSYNOpts is the cached values for the SYN options to be sent.
	sendSYNOpts header.TCPSynOptions
}
   109  
   110  func (e *endpoint) newHandshake() *handshake {
   111  	h := &handshake{
   112  		ep:          e,
   113  		active:      true,
   114  		rcvWnd:      seqnum.Size(e.initialReceiveWindow()),
   115  		rcvWndScale: e.rcvWndScaleForHandshake(),
   116  	}
   117  	h.resetState()
   118  	// Store reference to handshake state in endpoint.
   119  	e.h = h
   120  	return h
   121  }
   122  
   123  func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) *handshake {
   124  	h := e.newHandshake()
   125  	h.resetToSynRcvd(isn, irs, opts, deferAccept)
   126  	return h
   127  }
   128  
   129  // FindWndScale determines the window scale to use for the given maximum window
   130  // size.
   131  func FindWndScale(wnd seqnum.Size) int {
   132  	if wnd < 0x10000 {
   133  		return 0
   134  	}
   135  
   136  	max := seqnum.Size(math.MaxUint16)
   137  	s := 0
   138  	for wnd > max && s < header.MaxWndScale {
   139  		s++
   140  		max <<= 1
   141  	}
   142  
   143  	return s
   144  }
   145  
   146  // resetState resets the state of the handshake object such that it becomes
   147  // ready for a new 3-way handshake.
   148  func (h *handshake) resetState() {
   149  	h.state = handshakeSynSent
   150  	h.flags = header.TCPFlagSyn
   151  	h.ackNum = 0
   152  	h.mss = 0
   153  	h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.stack.Seed())
   154  }
   155  
   156  // generateSecureISN generates a secure Initial Sequence number based on the
   157  // recommendation here https://tools.ietf.org/html/rfc6528#page-3.
   158  func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed uint32) seqnum.Value {
   159  	isnHasher := jenkins.Sum32(seed)
   160  	isnHasher.Write([]byte(id.LocalAddress))
   161  	isnHasher.Write([]byte(id.RemoteAddress))
   162  	portBuf := make([]byte, 2)
   163  	binary.LittleEndian.PutUint16(portBuf, id.LocalPort)
   164  	isnHasher.Write(portBuf)
   165  	binary.LittleEndian.PutUint16(portBuf, id.RemotePort)
   166  	isnHasher.Write(portBuf)
   167  	// The time period here is 64ns. This is similar to what linux uses
   168  	// generate a sequence number that overlaps less than one
   169  	// time per MSL (2 minutes).
   170  	//
   171  	// A 64ns clock ticks 10^9/64 = 15625000) times in a second.
   172  	// To wrap the whole 32 bit space would require
   173  	// 2^32/1562500 ~ 274 seconds.
   174  	//
   175  	// Which sort of guarantees that we won't reuse the ISN for a new
   176  	// connection for the same tuple for at least 274s.
   177  	isn := isnHasher.Sum32() + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6)
   178  	return seqnum.Value(isn)
   179  }
   180  
   181  // effectiveRcvWndScale returns the effective receive window scale to be used.
   182  // If the peer doesn't support window scaling, the effective rcv wnd scale is
   183  // zero; otherwise it's the value calculated based on the initial rcv wnd.
   184  func (h *handshake) effectiveRcvWndScale() uint8 {
   185  	if h.sndWndScale < 0 {
   186  		return 0
   187  	}
   188  	return uint8(h.rcvWndScale)
   189  }
   190  
   191  // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
   192  // state.
   193  func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) {
   194  	h.active = false
   195  	h.state = handshakeSynRcvd
   196  	h.flags = header.TCPFlagSyn | header.TCPFlagAck
   197  	h.iss = iss
   198  	h.ackNum = irs + 1
   199  	h.mss = opts.MSS
   200  	h.sndWndScale = opts.WS
   201  	h.deferAccept = deferAccept
   202  	h.ep.setEndpointState(StateSynRecv)
   203  }
   204  
   205  // checkAck checks if the ACK number, if present, of a segment received during
   206  // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in
   207  // response.
   208  func (h *handshake) checkAck(s *segment) bool {
   209  	if s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1 {
   210  		// RFC 793, page 36, states that a reset must be generated when
   211  		// the connection is in any non-synchronized state and an
   212  		// incoming segment acknowledges something not yet sent. The
   213  		// connection remains in the same state.
   214  		ack := s.sequenceNumber.Add(s.logicalLen())
   215  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0)
   216  		return false
   217  	}
   218  
   219  	return true
   220  }
   221  
   222  // synSentState handles a segment received when the TCP 3-way handshake is in
   223  // the SYN-SENT state.
   224  func (h *handshake) synSentState(s *segment) tcpip.Error {
   225  	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
   226  	// acceptable if the ack field acknowledges the SYN.
   227  	if s.flags.Contains(header.TCPFlagRst) {
   228  		if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
   229  			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
   230  			// was acceptable then signal the user "error: connection reset", drop
   231  			// the segment, enter CLOSED state, delete TCB, and return."
   232  			h.ep.workerCleanup = true
   233  			// Although the RFC above calls out ECONNRESET, Linux actually returns
   234  			// ECONNREFUSED here so we do as well.
   235  			return &tcpip.ErrConnectionRefused{}
   236  		}
   237  		return nil
   238  	}
   239  
   240  	if !h.checkAck(s) {
   241  		return nil
   242  	}
   243  
   244  	// We are in the SYN-SENT state. We only care about segments that have
   245  	// the SYN flag.
   246  	if !s.flags.Contains(header.TCPFlagSyn) {
   247  		return nil
   248  	}
   249  
   250  	// Parse the SYN options.
   251  	rcvSynOpts := parseSynSegmentOptions(s)
   252  
   253  	// Remember if the Timestamp option was negotiated.
   254  	h.ep.maybeEnableTimestamp(&rcvSynOpts)
   255  
   256  	// Remember if the SACKPermitted option was negotiated.
   257  	h.ep.maybeEnableSACKPermitted(&rcvSynOpts)
   258  
   259  	// Remember the sequence we'll ack from now on.
   260  	h.ackNum = s.sequenceNumber + 1
   261  	h.flags |= header.TCPFlagAck
   262  	h.mss = rcvSynOpts.MSS
   263  	h.sndWndScale = rcvSynOpts.WS
   264  
   265  	// If this is a SYN ACK response, we only need to acknowledge the SYN
   266  	// and the handshake is completed.
   267  	if s.flags.Contains(header.TCPFlagAck) {
   268  		h.state = handshakeCompleted
   269  
   270  		h.ep.transitionToStateEstablishedLocked(h)
   271  
   272  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
   273  		return nil
   274  	}
   275  
   276  	// A SYN segment was received, but no ACK in it. We acknowledge the SYN
   277  	// but resend our own SYN and wait for it to be acknowledged in the
   278  	// SYN-RCVD state.
   279  	h.state = handshakeSynRcvd
   280  	ttl := h.ep.ttl
   281  	amss := h.ep.amss
   282  	h.ep.setEndpointState(StateSynRecv)
   283  	synOpts := header.TCPSynOptions{
   284  		WS:    int(h.effectiveRcvWndScale()),
   285  		TS:    rcvSynOpts.TS,
   286  		TSVal: h.ep.timestamp(),
   287  		TSEcr: h.ep.recentTimestamp(),
   288  
   289  		// We only send SACKPermitted if the other side indicated it
   290  		// permits SACK. This is not explicitly defined in the RFC but
   291  		// this is the behaviour implemented by Linux.
   292  		SACKPermitted: rcvSynOpts.SACKPermitted,
   293  		MSS:           amss,
   294  	}
   295  	if ttl == 0 {
   296  		ttl = h.ep.route.DefaultTTL()
   297  	}
   298  	h.ep.sendSynTCP(h.ep.route, tcpFields{
   299  		id:     h.ep.TransportEndpointInfo.ID,
   300  		ttl:    ttl,
   301  		tos:    h.ep.sendTOS,
   302  		flags:  h.flags,
   303  		seq:    h.iss,
   304  		ack:    h.ackNum,
   305  		rcvWnd: h.rcvWnd,
   306  	}, synOpts)
   307  	return nil
   308  }
   309  
   310  // synRcvdState handles a segment received when the TCP 3-way handshake is in
   311  // the SYN-RCVD state.
   312  func (h *handshake) synRcvdState(s *segment) tcpip.Error {
   313  	if s.flags.Contains(header.TCPFlagRst) {
   314  		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
   315  		// is acceptable if the sequence number is in the window.
   316  		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
   317  			return &tcpip.ErrConnectionRefused{}
   318  		}
   319  		return nil
   320  	}
   321  
   322  	if !h.checkAck(s) {
   323  		return nil
   324  	}
   325  
   326  	// RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a
   327  	// sequence number outside of the window causes an ACK with the proper seq
   328  	// number and "After sending the acknowledgment, drop the unacceptable
   329  	// segment and return."
   330  	if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
   331  		if h.ep.allowOutOfWindowAck() {
   332  			h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
   333  		}
   334  		return nil
   335  	}
   336  
   337  	if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
   338  		// We received two SYN segments with different sequence
   339  		// numbers, so we reset this and restart the whole
   340  		// process, except that we don't reset the timer.
   341  		ack := s.sequenceNumber.Add(s.logicalLen())
   342  		seq := seqnum.Value(0)
   343  		if s.flags.Contains(header.TCPFlagAck) {
   344  			seq = s.ackNumber
   345  		}
   346  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)
   347  
   348  		if !h.active {
   349  			return &tcpip.ErrInvalidEndpointState{}
   350  		}
   351  
   352  		h.resetState()
   353  		synOpts := header.TCPSynOptions{
   354  			WS:            h.rcvWndScale,
   355  			TS:            h.ep.SendTSOk,
   356  			TSVal:         h.ep.timestamp(),
   357  			TSEcr:         h.ep.recentTimestamp(),
   358  			SACKPermitted: h.ep.SACKPermitted,
   359  			MSS:           h.ep.amss,
   360  		}
   361  		h.ep.sendSynTCP(h.ep.route, tcpFields{
   362  			id:     h.ep.TransportEndpointInfo.ID,
   363  			ttl:    h.ep.ttl,
   364  			tos:    h.ep.sendTOS,
   365  			flags:  h.flags,
   366  			seq:    h.iss,
   367  			ack:    h.ackNum,
   368  			rcvWnd: h.rcvWnd,
   369  		}, synOpts)
   370  		return nil
   371  	}
   372  
   373  	// We have previously received (and acknowledged) the peer's SYN. If the
   374  	// peer acknowledges our SYN, the handshake is completed.
   375  	if s.flags.Contains(header.TCPFlagAck) {
   376  		// If deferAccept is not zero and this is a bare ACK and the
   377  		// timeout is not hit then drop the ACK.
   378  		if h.deferAccept != 0 && s.data.Size() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept {
   379  			h.acked = true
   380  			h.ep.stack.Stats().DroppedPackets.Increment()
   381  			return nil
   382  		}
   383  
   384  		// If the timestamp option is negotiated and the segment does
   385  		// not carry a timestamp option then the segment must be dropped
   386  		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
   387  		if h.ep.SendTSOk && !s.parsedOptions.TS {
   388  			h.ep.stack.Stats().DroppedPackets.Increment()
   389  			return nil
   390  		}
   391  
   392  		// Drop the ACK if the accept queue is full.
   393  		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523
   394  		// We could abort the connection as well with a tunable as in
   395  		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788
   396  		if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() {
   397  			listenEP.stack.Stats().DroppedPackets.Increment()
   398  			return nil
   399  		}
   400  
   401  		// Update timestamp if required. See RFC7323, section-4.3.
   402  		if h.ep.SendTSOk && s.parsedOptions.TS {
   403  			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
   404  		}
   405  		h.state = handshakeCompleted
   406  
   407  		h.ep.transitionToStateEstablishedLocked(h)
   408  
   409  		// Requeue the segment if the ACK completing the handshake has more info
   410  		// to be procesed by the newly established endpoint.
   411  		if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && h.ep.enqueueSegment(s) {
   412  			s.incRef()
   413  			h.ep.newSegmentWaker.Assert()
   414  		}
   415  		return nil
   416  	}
   417  
   418  	return nil
   419  }
   420  
   421  func (h *handshake) handleSegment(s *segment) tcpip.Error {
   422  	h.sndWnd = s.window
   423  	if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 {
   424  		h.sndWnd <<= uint8(h.sndWndScale)
   425  	}
   426  
   427  	switch h.state {
   428  	case handshakeSynRcvd:
   429  		return h.synRcvdState(s)
   430  	case handshakeSynSent:
   431  		return h.synSentState(s)
   432  	}
   433  	return nil
   434  }
   435  
   436  // processSegments goes through the segment queue and processes up to
   437  // maxSegmentsPerWake (if they're available).
   438  func (h *handshake) processSegments() tcpip.Error {
   439  	for i := 0; i < maxSegmentsPerWake; i++ {
   440  		s := h.ep.segmentQueue.dequeue()
   441  		if s == nil {
   442  			return nil
   443  		}
   444  
   445  		err := h.handleSegment(s)
   446  		s.decRef()
   447  		if err != nil {
   448  			return err
   449  		}
   450  
   451  		// We stop processing packets once the handshake is completed,
   452  		// otherwise we may process packets meant to be processed by
   453  		// the main protocol goroutine.
   454  		if h.state == handshakeCompleted {
   455  			break
   456  		}
   457  	}
   458  
   459  	// If the queue is not empty, make sure we'll wake up in the next
   460  	// iteration.
   461  	if !h.ep.segmentQueue.empty() {
   462  		h.ep.newSegmentWaker.Assert()
   463  	}
   464  
   465  	return nil
   466  }
   467  
   468  // start sends the first SYN/SYN-ACK. It does not block, even if link address
   469  // resolution is required.
   470  func (h *handshake) start() {
   471  	h.startTime = h.ep.stack.Clock().NowMonotonic()
   472  	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
   473  	var sackEnabled tcpip.TCPSACKEnabled
   474  	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
   475  		// If stack returned an error when checking for SACKEnabled
   476  		// status then just default to switching off SACK negotiation.
   477  		sackEnabled = false
   478  	}
   479  
   480  	synOpts := header.TCPSynOptions{
   481  		WS:            h.rcvWndScale,
   482  		TS:            true,
   483  		TSVal:         h.ep.timestamp(),
   484  		TSEcr:         h.ep.recentTimestamp(),
   485  		SACKPermitted: bool(sackEnabled),
   486  		MSS:           h.ep.amss,
   487  	}
   488  
   489  	// start() is also called in a listen context so we want to make sure we only
   490  	// send the TS/SACK option when we received the TS/SACK in the initial SYN.
   491  	if h.state == handshakeSynRcvd {
   492  		synOpts.TS = h.ep.SendTSOk
   493  		synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled)
   494  		if h.sndWndScale < 0 {
   495  			// Disable window scaling if the peer did not send us
   496  			// the window scaling option.
   497  			synOpts.WS = -1
   498  		}
   499  	}
   500  
   501  	h.sendSYNOpts = synOpts
   502  	h.ep.sendSynTCP(h.ep.route, tcpFields{
   503  		id:     h.ep.TransportEndpointInfo.ID,
   504  		ttl:    h.ep.ttl,
   505  		tos:    h.ep.sendTOS,
   506  		flags:  h.flags,
   507  		seq:    h.iss,
   508  		ack:    h.ackNum,
   509  		rcvWnd: h.rcvWnd,
   510  	}, synOpts)
   511  }
   512  
   513  // complete completes the TCP 3-way handshake initiated by h.start().
   514  // +checklocks:h.ep.mu
   515  func (h *handshake) complete() tcpip.Error {
   516  	// Set up the wakers.
   517  	var s sleep.Sleeper
   518  	resendWaker := sleep.Waker{}
   519  	s.AddWaker(&resendWaker, wakerForResend)
   520  	s.AddWaker(&h.ep.notificationWaker, wakerForNotification)
   521  	s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment)
   522  	defer s.Done()
   523  
   524  	// Initialize the resend timer.
   525  	timer, err := newBackoffTimer(h.ep.stack.Clock(), time.Second, MaxRTO, resendWaker.Assert)
   526  	if err != nil {
   527  		return err
   528  	}
   529  	defer timer.stop()
   530  	for h.state != handshakeCompleted {
   531  		// Unlock before blocking, and reacquire again afterwards (h.ep.mu is held
   532  		// throughout handshake processing).
   533  		h.ep.mu.Unlock()
   534  		index, _ := s.Fetch(true /* block */)
   535  		h.ep.mu.Lock()
   536  		switch index {
   537  
   538  		case wakerForResend:
   539  			if err := timer.reset(); err != nil {
   540  				return err
   541  			}
   542  			// Resend the SYN/SYN-ACK only if the following conditions hold.
   543  			//  - It's an active handshake (deferAccept does not apply)
   544  			//  - It's a passive handshake and we have not yet got the final-ACK.
   545  			//  - It's a passive handshake and we got an ACK but deferAccept is
   546  			//    enabled and we are now past the deferAccept duration.
   547  			// The last is required to provide a way for the peer to complete
   548  			// the connection with another ACK or data (as ACKs are never
   549  			// retransmitted on their own).
   550  			if h.active || !h.acked || h.deferAccept != 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept {
   551  				h.ep.sendSynTCP(h.ep.route, tcpFields{
   552  					id:     h.ep.TransportEndpointInfo.ID,
   553  					ttl:    h.ep.ttl,
   554  					tos:    h.ep.sendTOS,
   555  					flags:  h.flags,
   556  					seq:    h.iss,
   557  					ack:    h.ackNum,
   558  					rcvWnd: h.rcvWnd,
   559  				}, h.sendSYNOpts)
   560  			}
   561  
   562  		case wakerForNotification:
   563  			n := h.ep.fetchNotifications()
   564  			if (n&notifyClose)|(n&notifyAbort) != 0 {
   565  				return &tcpip.ErrAborted{}
   566  			}
   567  			if n&notifyDrain != 0 {
   568  				for !h.ep.segmentQueue.empty() {
   569  					s := h.ep.segmentQueue.dequeue()
   570  					err := h.handleSegment(s)
   571  					s.decRef()
   572  					if err != nil {
   573  						return err
   574  					}
   575  					if h.state == handshakeCompleted {
   576  						return nil
   577  					}
   578  				}
   579  				close(h.ep.drainDone)
   580  				h.ep.mu.Unlock()
   581  				<-h.ep.undrain
   582  				h.ep.mu.Lock()
   583  			}
   584  			// Check for any ICMP errors notified to us.
   585  			if n&notifyError != 0 {
   586  				if err := h.ep.lastErrorLocked(); err != nil {
   587  					return err
   588  				}
   589  				// Flag the handshake failure as aborted if the lastError is
   590  				// cleared because of a socket layer call.
   591  				return &tcpip.ErrConnectionAborted{}
   592  			}
   593  		case wakerForNewSegment:
   594  			if err := h.processSegments(); err != nil {
   595  				return err
   596  			}
   597  		}
   598  	}
   599  
   600  	return nil
   601  }
   602  
   603  type backoffTimer struct {
   604  	timeout    time.Duration
   605  	maxTimeout time.Duration
   606  	t          tcpip.Timer
   607  }
   608  
   609  func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) {
   610  	if timeout > maxTimeout {
   611  		return nil, &tcpip.ErrTimeout{}
   612  	}
   613  	bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout}
   614  	bt.t = clock.AfterFunc(timeout, f)
   615  	return bt, nil
   616  }
   617  
   618  func (bt *backoffTimer) reset() tcpip.Error {
   619  	bt.timeout *= 2
   620  	if bt.timeout > bt.maxTimeout {
   621  		return &tcpip.ErrTimeout{}
   622  	}
   623  	bt.t.Reset(bt.timeout)
   624  	return nil
   625  }
   626  
   627  func (bt *backoffTimer) stop() {
   628  	bt.t.Stop()
   629  }
   630  
   631  func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
   632  	synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck))
   633  	if synOpts.TS {
   634  		s.parsedOptions.TSVal = synOpts.TSVal
   635  		s.parsedOptions.TSEcr = synOpts.TSEcr
   636  	}
   637  	return synOpts
   638  }
   639  
   640  var optionPool = sync.Pool{
   641  	New: func() interface{} {
   642  		return &[maxOptionSize]byte{}
   643  	},
   644  }
   645  
   646  func getOptions() []byte {
   647  	return (*optionPool.Get().(*[maxOptionSize]byte))[:]
   648  }
   649  
   650  func putOptions(options []byte) {
   651  	// Reslice to full capacity.
   652  	optionPool.Put(optionsToArray(options))
   653  }
   654  
   655  func makeSynOptions(opts header.TCPSynOptions) []byte {
   656  	// Emulate linux option order. This is as follows:
   657  	//
   658  	// if md5: NOP NOP MD5SIG 18 md5sig(16)
   659  	// if mss: MSS 4 mss(2)
   660  	// if ts and sack_advertise:
   661  	//	SACK 2 TIMESTAMP 2 timestamp(8)
   662  	// elif ts: NOP NOP TIMESTAMP 10 timestamp(8)
   663  	// elif sack: NOP NOP SACK 2
   664  	// if wscale: NOP WINDOW 3 ws(1)
   665  	// if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8))
   666  	//	[for each block] start_seq(4) end_seq(4)
   667  	// if fastopen_cookie:
   668  	//	if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2)
   669  	// 	else: FASTOPEN (2 + len(cookie))
   670  	//	cookie(variable) [padding to four bytes]
   671  	//
   672  	options := getOptions()
   673  
   674  	// Always encode the mss.
   675  	offset := header.EncodeMSSOption(uint32(opts.MSS), options)
   676  
   677  	// Special ordering is required here. If both TS and SACK are enabled,
   678  	// then the SACK option precedes TS, with no padding. If they are
   679  	// enabled individually, then we see padding before the option.
   680  	if opts.TS && opts.SACKPermitted {
   681  		offset += header.EncodeSACKPermittedOption(options[offset:])
   682  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   683  	} else if opts.TS {
   684  		offset += header.EncodeNOP(options[offset:])
   685  		offset += header.EncodeNOP(options[offset:])
   686  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   687  	} else if opts.SACKPermitted {
   688  		offset += header.EncodeNOP(options[offset:])
   689  		offset += header.EncodeNOP(options[offset:])
   690  		offset += header.EncodeSACKPermittedOption(options[offset:])
   691  	}
   692  
   693  	// Initialize the WS option.
   694  	if opts.WS >= 0 {
   695  		offset += header.EncodeNOP(options[offset:])
   696  		offset += header.EncodeWSOption(opts.WS, options[offset:])
   697  	}
   698  
   699  	// Padding to the end; note that this never apply unless we add a
   700  	// fastopen option, we always expect the offset to remain the same.
   701  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   702  		panic("unexpected option encoding")
   703  	}
   704  
   705  	return options[:offset]
   706  }
   707  
// tcpFields is a struct to carry different parameters required by the
// send*TCP variant functions below.
type tcpFields struct {
	// id supplies the local and remote ports written into the TCP header.
	id stack.TransportEndpointID
	// ttl is the TTL handed to the network layer; zero means the route's
	// default TTL is used instead.
	ttl uint8
	// tos is the TOS value handed to the network layer.
	tos uint8
	// flags, seq and ack are the TCP flags, sequence number and
	// acknowledgment number written into the TCP header.
	flags header.TCPFlags
	seq   seqnum.Value
	ack   seqnum.Value
	// rcvWnd is the window to advertise; the send functions clamp it to
	// math.MaxUint16 before it is encoded.
	rcvWnd seqnum.Size
	// opts holds the already-encoded TCP options, copied in right after
	// the fixed-size part of the header.
	opts []byte
	// txHash is copied to the Hash field of every packet buffer built
	// from these fields.
	txHash uint32
}
   721  
   722  func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error {
   723  	tf.opts = makeSynOptions(opts)
   724  	// We ignore SYN send errors and let the callers re-attempt send.
   725  	if err := e.sendTCP(r, tf, buffer.VectorisedView{}, stack.GSO{}); err != nil {
   726  		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
   727  	}
   728  	putOptions(tf.opts)
   729  	return nil
   730  }
   731  
   732  func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO) tcpip.Error {
   733  	tf.txHash = e.txHash
   734  	if err := sendTCP(r, tf, data, gso, e.owner); err != nil {
   735  		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
   736  		return err
   737  	}
   738  	e.stats.SegmentsSent.Increment()
   739  	return nil
   740  }
   741  
   742  func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) {
   743  	optLen := len(tf.opts)
   744  	tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen))
   745  	pkt.TransportProtocolNumber = header.TCPProtocolNumber
   746  	tcp.Encode(&header.TCPFields{
   747  		SrcPort:    tf.id.LocalPort,
   748  		DstPort:    tf.id.RemotePort,
   749  		SeqNum:     uint32(tf.seq),
   750  		AckNum:     uint32(tf.ack),
   751  		DataOffset: uint8(header.TCPMinimumSize + optLen),
   752  		Flags:      tf.flags,
   753  		WindowSize: uint16(tf.rcvWnd),
   754  	})
   755  	copy(tcp[header.TCPMinimumSize:], tf.opts)
   756  
   757  	xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size()))
   758  	// Only calculate the checksum if offloading isn't supported.
   759  	if gso.Type != stack.GSONone && gso.NeedsCsum {
   760  		// This is called CHECKSUM_PARTIAL in the Linux kernel. We
   761  		// calculate a checksum of the pseudo-header and save it in the
   762  		// TCP header, then the kernel calculate a checksum of the
   763  		// header and data and get the right sum of the TCP packet.
   764  		tcp.SetChecksum(xsum)
   765  	} else if r.RequiresTXTransportChecksum() {
   766  		xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum())
   767  		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
   768  	}
   769  }
   770  
   771  func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error {
   772  	// We need to shallow clone the VectorisedView here as ReadToView will
   773  	// split the VectorisedView and Trim underlying views as it splits. Not
   774  	// doing the clone here will cause the underlying views of data itself
   775  	// to be altered.
   776  	data = data.Clone(nil)
   777  
   778  	optLen := len(tf.opts)
   779  	if tf.rcvWnd > math.MaxUint16 {
   780  		tf.rcvWnd = math.MaxUint16
   781  	}
   782  
   783  	mss := int(gso.MSS)
   784  	n := (data.Size() + mss - 1) / mss
   785  
   786  	size := data.Size()
   787  	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
   788  	var pkts stack.PacketBufferList
   789  	for i := 0; i < n; i++ {
   790  		packetSize := mss
   791  		if packetSize > size {
   792  			packetSize = size
   793  		}
   794  		size -= packetSize
   795  		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   796  			ReserveHeaderBytes: hdrSize,
   797  		})
   798  		pkt.Hash = tf.txHash
   799  		pkt.Owner = owner
   800  		pkt.Data().ReadFromVV(&data, packetSize)
   801  		buildTCPHdr(r, tf, pkt, gso)
   802  		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
   803  		pkt.GSOOptions = gso
   804  		pkts.PushBack(pkt)
   805  	}
   806  
   807  	if tf.ttl == 0 {
   808  		tf.ttl = r.DefaultTTL()
   809  	}
   810  	sent, err := r.WritePackets(pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos})
   811  	if err != nil {
   812  		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
   813  	}
   814  	r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent))
   815  	return err
   816  }
   817  
   818  // sendTCP sends a TCP segment with the provided options via the provided
   819  // network endpoint and under the provided identity.
   820  func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error {
   821  	optLen := len(tf.opts)
   822  	if tf.rcvWnd > math.MaxUint16 {
   823  		tf.rcvWnd = math.MaxUint16
   824  	}
   825  
   826  	if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
   827  		return sendTCPBatch(r, tf, data, gso, owner)
   828  	}
   829  
   830  	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   831  		ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen,
   832  		Data:               data,
   833  	})
   834  	pkt.GSOOptions = gso
   835  	pkt.Hash = tf.txHash
   836  	pkt.Owner = owner
   837  	buildTCPHdr(r, tf, pkt, gso)
   838  
   839  	if tf.ttl == 0 {
   840  		tf.ttl = r.DefaultTTL()
   841  	}
   842  	if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil {
   843  		r.Stats().TCP.SegmentSendErrors.Increment()
   844  		return err
   845  	}
   846  	r.Stats().TCP.SegmentsSent.Increment()
   847  	if (tf.flags & header.TCPFlagRst) != 0 {
   848  		r.Stats().TCP.ResetsSent.Increment()
   849  	}
   850  	return nil
   851  }
   852  
   853  // makeOptions makes an options slice.
   854  func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
   855  	options := getOptions()
   856  	offset := 0
   857  
   858  	// N.B. the ordering here matches the ordering used by Linux internally
   859  	// and described in the raw makeOptions function. We don't include
   860  	// unnecessary cases here (post connection.)
   861  	if e.SendTSOk {
   862  		// Embed the timestamp if timestamp has been enabled.
   863  		//
   864  		// We only use the lower 32 bits of the unix time in
   865  		// milliseconds. This is similar to what Linux does where it
   866  		// uses the lower 32 bits of the jiffies value in the tsVal
   867  		// field of the timestamp option.
   868  		//
   869  		// Further, RFC7323 section-5.4 recommends millisecond
   870  		// resolution as the lowest recommended resolution for the
   871  		// timestamp clock.
   872  		//
   873  		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
   874  		offset += header.EncodeNOP(options[offset:])
   875  		offset += header.EncodeNOP(options[offset:])
   876  		offset += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), options[offset:])
   877  	}
   878  	if e.SACKPermitted && len(sackBlocks) > 0 {
   879  		offset += header.EncodeNOP(options[offset:])
   880  		offset += header.EncodeNOP(options[offset:])
   881  		offset += header.EncodeSACKBlocks(sackBlocks, options[offset:])
   882  	}
   883  
   884  	// We expect the above to produce an aligned offset.
   885  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   886  		panic("unexpected option encoding")
   887  	}
   888  
   889  	return options[:offset]
   890  }
   891  
   892  // sendRaw sends a TCP segment to the endpoint's peer.
   893  func (e *endpoint) sendRaw(data buffer.VectorisedView, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error {
   894  	var sackBlocks []header.SACKBlock
   895  	if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) {
   896  		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
   897  	}
   898  	options := e.makeOptions(sackBlocks)
   899  	err := e.sendTCP(e.route, tcpFields{
   900  		id:     e.TransportEndpointInfo.ID,
   901  		ttl:    e.ttl,
   902  		tos:    e.sendTOS,
   903  		flags:  flags,
   904  		seq:    seq,
   905  		ack:    ack,
   906  		rcvWnd: rcvWnd,
   907  		opts:   options,
   908  	}, data, e.gso)
   909  	putOptions(options)
   910  	return err
   911  }
   912  
   913  // Precondition: e.mu must be locked.
   914  func (e *endpoint) sendData(next *segment) {
   915  	// Initialize the next segment to write if it's currently nil.
   916  	if e.snd.writeNext == nil {
   917  		if next == nil {
   918  			return
   919  		}
   920  		e.snd.writeNext = next
   921  	}
   922  
   923  	// Push out any new packets.
   924  	e.snd.sendData()
   925  }
   926  
   927  // resetConnectionLocked puts the endpoint in an error state with the given
   928  // error code and sends a RST if and only if the error is not ErrConnectionReset
   929  // indicating that the connection is being reset due to receiving a RST. This
   930  // method must only be called from the protocol goroutine.
   931  func (e *endpoint) resetConnectionLocked(err tcpip.Error) {
   932  	// Only send a reset if the connection is being aborted for a reason
   933  	// other than receiving a reset.
   934  	e.setEndpointState(StateError)
   935  	e.hardError = err
   936  	switch err.(type) {
   937  	case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout:
   938  	default:
   939  		// The exact sequence number to be used for the RST is the same as the
   940  		// one used by Linux. We need to handle the case of window being shrunk
   941  		// which can cause sndNxt to be outside the acceptable window on the
   942  		// receiver.
   943  		//
   944  		// See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more
   945  		// information.
   946  		sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd)
   947  		resetSeqNum := sndWndEnd
   948  		if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) {
   949  			resetSeqNum = e.snd.SndNxt
   950  		}
   951  		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0)
   952  	}
   953  }
   954  
   955  // completeWorkerLocked is called by the worker goroutine when it's about to
   956  // exit.
   957  func (e *endpoint) completeWorkerLocked() {
   958  	// Worker is terminating(either due to moving to
   959  	// CLOSED or ERROR state, ensure we release all
   960  	// registrations port reservations even if the socket
   961  	// itself is not yet closed by the application.
   962  	e.workerRunning = false
   963  	if e.workerCleanup {
   964  		e.cleanupLocked()
   965  	}
   966  }
   967  
// transitionToStateEstablishedLocked transitions a given endpoint
// to an established state using the handshake parameters provided.
// It also initializes sender/receiver: e.snd is created first, then e.rcv is
// created and the receive auto-tuning state is seeded while holding
// rcvQueueMu, and only afterwards is the endpoint state set to
// StateEstablished.
func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
	// Transfer handshake state to TCP connection. We disable
	// receive window scaling if the peer doesn't support it
	// (indicated by a negative send window scale).
	e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)

	// The receiver and the auto-tuning parameters are set under
	// rcvQueueMu.
	e.rcvQueueInfo.rcvQueueMu.Lock()
	e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale())
	// Bootstrap the auto tuning algorithm. Starting at zero will
	// result in a really large receive window after the first auto
	// tuning adjustment.
	e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd)
	e.rcvQueueInfo.rcvQueueMu.Unlock()

	e.setEndpointState(StateEstablished)
}
   987  
   988  // transitionToStateCloseLocked ensures that the endpoint is
   989  // cleaned up from the transport demuxer, "before" moving to
   990  // StateClose. This will ensure that no packet will be
   991  // delivered to this endpoint from the demuxer when the endpoint
   992  // is transitioned to StateClose.
   993  func (e *endpoint) transitionToStateCloseLocked() {
   994  	s := e.EndpointState()
   995  	if s == StateClose {
   996  		return
   997  	}
   998  
   999  	if s.connected() {
  1000  		e.stack.Stats().TCP.CurrentConnected.Decrement()
  1001  		e.stack.Stats().TCP.EstablishedClosed.Increment()
  1002  	}
  1003  
  1004  	// Mark the endpoint as fully closed for reads/writes.
  1005  	e.cleanupLocked()
  1006  	e.setEndpointState(StateClose)
  1007  }
  1008  
  1009  // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
  1010  // segment to any other endpoint other than the current one. This is called
  1011  // only when the endpoint is in StateClose and we want to deliver the segment
  1012  // to any other listening endpoint. We reply with RST if we cannot find one.
  1013  func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
  1014  	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.nicID)
  1015  	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
  1016  		// Dual-stack socket, try IPv4.
  1017  		ep = e.stack.FindTransportEndpoint(
  1018  			header.IPv4ProtocolNumber,
  1019  			e.TransProto,
  1020  			e.TransportEndpointInfo.ID,
  1021  			s.nicID,
  1022  		)
  1023  	}
  1024  	if ep == nil {
  1025  		replyWithReset(e.stack, s, stack.DefaultTOS, 0 /* ttl */)
  1026  		s.decRef()
  1027  		return
  1028  	}
  1029  
  1030  	if e == ep {
  1031  		panic("current endpoint not removed from demuxer, enqueing segments to itself")
  1032  	}
  1033  
  1034  	if ep := ep.(*endpoint); ep.enqueueSegment(s) {
  1035  		ep.newSegmentWaker.Assert()
  1036  	}
  1037  }
  1038  
  1039  // Drain segment queue from the endpoint and try to re-match the segment to a
  1040  // different endpoint. This is used when the current endpoint is transitioned to
  1041  // StateClose and has been unregistered from the transport demuxer.
  1042  func (e *endpoint) drainClosingSegmentQueue() {
  1043  	for {
  1044  		s := e.segmentQueue.dequeue()
  1045  		if s == nil {
  1046  			break
  1047  		}
  1048  
  1049  		e.tryDeliverSegmentFromClosedEndpoint(s)
  1050  	}
  1051  }
  1052  
  1053  func (e *endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) {
  1054  	if e.rcv.acceptable(s.sequenceNumber, 0) {
  1055  		// RFC 793, page 37 states that "in all states
  1056  		// except SYN-SENT, all reset (RST) segments are
  1057  		// validated by checking their SEQ-fields." So
  1058  		// we only process it if it's acceptable.
  1059  		switch e.EndpointState() {
  1060  		// In case of a RST in CLOSE-WAIT linux moves
  1061  		// the socket to closed state with an error set
  1062  		// to indicate EPIPE.
  1063  		//
  1064  		// Technically this seems to be at odds w/ RFC.
  1065  		// As per https://tools.ietf.org/html/rfc793#section-2.7
  1066  		// page 69 the behavior for a segment arriving
  1067  		// w/ RST bit set in CLOSE-WAIT is inlined below.
  1068  		//
  1069  		//  ESTABLISHED
  1070  		//  FIN-WAIT-1
  1071  		//  FIN-WAIT-2
  1072  		//  CLOSE-WAIT
  1073  
  1074  		//  If the RST bit is set then, any outstanding RECEIVEs and
  1075  		//  SEND should receive "reset" responses. All segment queues
  1076  		//  should be flushed.  Users should also receive an unsolicited
  1077  		//  general "connection reset" signal. Enter the CLOSED state,
  1078  		//  delete the TCB, and return.
  1079  		case StateCloseWait:
  1080  			e.transitionToStateCloseLocked()
  1081  			e.hardError = &tcpip.ErrAborted{}
  1082  			e.notifyProtocolGoroutine(notifyTickleWorker)
  1083  			return false, nil
  1084  		default:
  1085  			// RFC 793, page 37 states that "in all states
  1086  			// except SYN-SENT, all reset (RST) segments are
  1087  			// validated by checking their SEQ-fields." So
  1088  			// we only process it if it's acceptable.
  1089  
  1090  			// Notify protocol goroutine. This is required when
  1091  			// handleSegment is invoked from the processor goroutine
  1092  			// rather than the worker goroutine.
  1093  			e.notifyProtocolGoroutine(notifyResetByPeer)
  1094  			return false, &tcpip.ErrConnectionReset{}
  1095  		}
  1096  	}
  1097  	return true, nil
  1098  }
  1099  
  1100  // handleSegments processes all inbound segments.
  1101  //
  1102  // Precondition: e.mu must be held.
  1103  func (e *endpoint) handleSegmentsLocked(fastPath bool) tcpip.Error {
  1104  	checkRequeue := true
  1105  	for i := 0; i < maxSegmentsPerWake; i++ {
  1106  		if state := e.EndpointState(); state.closed() || state == StateTimeWait {
  1107  			return nil
  1108  		}
  1109  		s := e.segmentQueue.dequeue()
  1110  		if s == nil {
  1111  			checkRequeue = false
  1112  			break
  1113  		}
  1114  
  1115  		cont, err := e.handleSegmentLocked(s)
  1116  		s.decRef()
  1117  		if err != nil {
  1118  			return err
  1119  		}
  1120  		if !cont {
  1121  			return nil
  1122  		}
  1123  	}
  1124  
  1125  	// When fastPath is true we don't want to wake up the worker
  1126  	// goroutine. If the endpoint has more segments to process the
  1127  	// dispatcher will call handleSegments again anyway.
  1128  	if !fastPath && checkRequeue && !e.segmentQueue.empty() {
  1129  		e.newSegmentWaker.Assert()
  1130  	}
  1131  
  1132  	// Send an ACK for all processed packets if needed.
  1133  	if e.rcv.RcvNxt != e.snd.MaxSentAck {
  1134  		e.snd.sendAck()
  1135  	}
  1136  
  1137  	e.resetKeepaliveTimer(true /* receivedData */)
  1138  
  1139  	return nil
  1140  }
  1141  
  1142  // Precondition: e.mu must be held.
  1143  func (e *endpoint) probeSegmentLocked() {
  1144  	if fn := e.probe; fn != nil {
  1145  		fn(e.completeStateLocked())
  1146  	}
  1147  }
  1148  
  1149  // handleSegment handles a given segment and notifies the worker goroutine if
  1150  // if the connection should be terminated.
  1151  //
  1152  // Precondition: e.mu must be held.
  1153  func (e *endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) {
  1154  	// Invoke the tcp probe if installed. The tcp probe function will update
  1155  	// the TCPEndpointState after the segment is processed.
  1156  	defer e.probeSegmentLocked()
  1157  
  1158  	if s.flags.Contains(header.TCPFlagRst) {
  1159  		if ok, err := e.handleReset(s); !ok {
  1160  			return false, err
  1161  		}
  1162  	} else if s.flags.Contains(header.TCPFlagSyn) {
  1163  		// See: https://tools.ietf.org/html/rfc5961#section-4.1
  1164  		//   1) If the SYN bit is set, irrespective of the sequence number, TCP
  1165  		//    MUST send an ACK (also referred to as challenge ACK) to the remote
  1166  		//    peer:
  1167  		//
  1168  		//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
  1169  		//
  1170  		//    After sending the acknowledgment, TCP MUST drop the unacceptable
  1171  		//    segment and stop processing further.
  1172  		//
  1173  		// By sending an ACK, the remote peer is challenged to confirm the loss
  1174  		// of the previous connection and the request to start a new connection.
  1175  		// A legitimate peer, after restart, would not have a TCB in the
  1176  		// synchronized state.  Thus, when the ACK arrives, the peer should send
  1177  		// a RST segment back with the sequence number derived from the ACK
  1178  		// field that caused the RST.
  1179  
  1180  		// This RST will confirm that the remote peer has indeed closed the
  1181  		// previous connection.  Upon receipt of a valid RST, the local TCP
  1182  		// endpoint MUST terminate its connection.  The local TCP endpoint
  1183  		// should then rely on SYN retransmission from the remote end to
  1184  		// re-establish the connection.
  1185  		e.snd.maybeSendOutOfWindowAck(s)
  1186  	} else if s.flags.Contains(header.TCPFlagAck) {
  1187  		// Patch the window size in the segment according to the
  1188  		// send window scale.
  1189  		s.window <<= e.snd.SndWndScale
  1190  
  1191  		// RFC 793, page 41 states that "once in the ESTABLISHED
  1192  		// state all segments must carry current acknowledgment
  1193  		// information."
  1194  		drop, err := e.rcv.handleRcvdSegment(s)
  1195  		if err != nil {
  1196  			return false, err
  1197  		}
  1198  		if drop {
  1199  			return true, nil
  1200  		}
  1201  
  1202  		// Now check if the received segment has caused us to transition
  1203  		// to a CLOSED state, if yes then terminate processing and do
  1204  		// not invoke the sender.
  1205  		state := e.EndpointState()
  1206  		if state == StateClose {
  1207  			// When we get into StateClose while processing from the queue,
  1208  			// return immediately and let the protocolMainloop handle it.
  1209  			//
  1210  			// We can reach StateClose only while processing a previous segment
  1211  			// or a notification from the protocolMainLoop (caller goroutine).
  1212  			// This means that with this return, the segment dequeue below can
  1213  			// never occur on a closed endpoint.
  1214  			return false, nil
  1215  		}
  1216  
  1217  		e.snd.handleRcvdSegment(s)
  1218  	}
  1219  
  1220  	return true, nil
  1221  }
  1222  
  1223  // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
  1224  // keepalive packets periodically when the connection is idle. If we don't hear
  1225  // from the other side after a number of tries, we terminate the connection.
  1226  func (e *endpoint) keepaliveTimerExpired() tcpip.Error {
  1227  	userTimeout := e.userTimeout
  1228  
  1229  	e.keepalive.Lock()
  1230  	if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() {
  1231  		e.keepalive.Unlock()
  1232  		return nil
  1233  	}
  1234  
  1235  	// If a userTimeout is set then abort the connection if it is
  1236  	// exceeded.
  1237  	if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 {
  1238  		e.keepalive.Unlock()
  1239  		e.stack.Stats().TCP.EstablishedTimedout.Increment()
  1240  		return &tcpip.ErrTimeout{}
  1241  	}
  1242  
  1243  	if e.keepalive.unacked >= e.keepalive.count {
  1244  		e.keepalive.Unlock()
  1245  		e.stack.Stats().TCP.EstablishedTimedout.Increment()
  1246  		return &tcpip.ErrTimeout{}
  1247  	}
  1248  
  1249  	// RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with
  1250  	// seg.seq = snd.nxt-1.
  1251  	e.keepalive.unacked++
  1252  	e.keepalive.Unlock()
  1253  	e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.SndNxt-1)
  1254  	e.resetKeepaliveTimer(false)
  1255  	return nil
  1256  }
  1257  
  1258  // resetKeepaliveTimer restarts or stops the keepalive timer, depending on
  1259  // whether it is enabled for this endpoint.
  1260  func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
  1261  	e.keepalive.Lock()
  1262  	if receivedData {
  1263  		e.keepalive.unacked = 0
  1264  	}
  1265  	// Start the keepalive timer IFF it's enabled and there is no pending
  1266  	// data to send.
  1267  	if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt {
  1268  		e.keepalive.timer.disable()
  1269  		e.keepalive.Unlock()
  1270  		return
  1271  	}
  1272  	if e.keepalive.unacked > 0 {
  1273  		e.keepalive.timer.enable(e.keepalive.interval)
  1274  	} else {
  1275  		e.keepalive.timer.enable(e.keepalive.idle)
  1276  	}
  1277  	e.keepalive.Unlock()
  1278  }
  1279  
  1280  // disableKeepaliveTimer stops the keepalive timer.
  1281  func (e *endpoint) disableKeepaliveTimer() {
  1282  	e.keepalive.Lock()
  1283  	e.keepalive.timer.disable()
  1284  	e.keepalive.Unlock()
  1285  }
  1286  
// protocolMainLoopDone is called at the end of protocolMainLoop. It cleans up
// the sender's timers (when a sender exists), stops closeTimer if it is set,
// completes the worker, closes drainDone if it is set and then releases e.mu.
// Once e.mu is released it re-dispatches any segments still queued on the
// endpoint and notifies the endpoint's waiters.
//
// NOTE(review): closeWaker is not used in this function body.
// +checklocksrelease:e.mu
func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer, closeWaker *sleep.Waker) {
	// e.snd is nil-checked here; callers may reach this point without a
	// sender having been created.
	if e.snd != nil {
		e.snd.resendTimer.cleanup()
		e.snd.probeTimer.cleanup()
		e.snd.reorderTimer.cleanup()
	}

	if closeTimer != nil {
		closeTimer.Stop()
	}

	e.completeWorkerLocked()

	// Signal a pending drain, if any, by closing drainDone.
	if e.drainDone != nil {
		close(e.drainDone)
	}

	e.mu.Unlock()

	// The queue is drained after e.mu has been released.
	e.drainClosingSegmentQueue()

	// When the protocol loop exits we should wake up our waiters.
	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
}
  1313  
// protocolMainLoop is the main loop of the TCP protocol. It runs in its own
// goroutine and is responsible for sending segments and handling received
// segments.
//
// If handshake is true the 3-way handshake is completed first; a handshake
// failure is recorded on the endpoint and returned. wakerInitDone, when
// non-nil, is closed once the wakers have been registered with the sleeper.
//
// The loop blocks on a sleeper and dispatches to one handler per waker until
// the endpoint reaches TIME-WAIT, CLOSED or ERROR, or until a handler returns
// an error. Errors raised inside the loop trigger cleanup but are not
// propagated: in those cases the function still returns nil.
//
// e.mu is acquired on entry, dropped while blocked, and released by
// protocolMainLoopDone on every exit path.
func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error {
	var (
		// closeTimer is started lazily by the notification handler once
		// the closed endpoint reaches FIN_WAIT2; it asserts closeWaker.
		closeTimer tcpip.Timer
		closeWaker sleep.Waker
	)

	e.mu.Lock()
	if handshake {
		if err := e.h.complete(); err != nil { // +checklocksforce
			e.lastErrorMu.Lock()
			e.lastError = err
			e.lastErrorMu.Unlock()

			e.setEndpointState(StateError)
			e.hardError = err

			e.workerCleanup = true
			e.protocolMainLoopDone(closeTimer, &closeWaker)
			return err
		}
	}

	// Reaching this point means that we successfully completed the 3-way
	// handshake with our peer. The current endpoint state could be any state
	// post ESTABLISHED, including CLOSED or ERROR if the endpoint processes a
	// RST from the peer via the dispatcher fast path, before the loop is
	// started.
	if s := e.EndpointState(); !s.connected() {
		switch s {
		case StateClose, StateError:
			// If the endpoint is in CLOSED/ERROR state, sender state has to be
			// initialized if the endpoint was previously established.
			if e.snd != nil {
				break
			}
			fallthrough
		default:
			panic("endpoint was not established, current state " + s.String())
		}
	}

	// Completing the 3-way handshake is an indication that the route is valid
	// and the remote is reachable as the only way we can complete a handshake
	// is if our SYN reached the remote and their ACK reached us.
	e.route.ConfirmReachable()

	// If a drain is already pending, acknowledge it and wait (without
	// holding e.mu) until the endpoint is undrained.
	drained := e.drainDone != nil
	if drained {
		close(e.drainDone)
		e.mu.Unlock()
		<-e.undrain
		e.mu.Lock()
	}

	// Set up the functions that will be called when the main protocol loop
	// wakes up. The index of each entry is the ID under which its waker is
	// registered with the sleeper below.
	funcs := []struct {
		w *sleep.Waker
		f func() tcpip.Error
	}{
		{
			// New data is available to send.
			w: &e.sndQueueInfo.sndWaker,
			f: func() tcpip.Error {
				e.sendData(nil /* next */)
				return nil
			},
		},
		{
			w: &closeWaker,
			f: func() tcpip.Error {
				// This means the socket is being closed due
				// to the TCP-FIN-WAIT2 timeout was hit. Just
				// mark the socket as closed.
				e.transitionToStateCloseLocked()
				e.workerCleanup = true
				return nil
			},
		},
		{
			// Retransmit timer: a false result from
			// retransmitTimerExpired is reported as a timeout.
			w: &e.snd.resendWaker,
			f: func() tcpip.Error {
				if !e.snd.retransmitTimerExpired() {
					e.stack.Stats().TCP.EstablishedTimedout.Increment()
					return &tcpip.ErrTimeout{}
				}
				return nil
			},
		},
		{
			w: &e.snd.probeWaker,
			f: e.snd.probeTimerExpired,
		},
		{
			// Inbound segments were queued for this endpoint.
			w: &e.newSegmentWaker,
			f: func() tcpip.Error {
				return e.handleSegmentsLocked(false /* fastPath */)
			},
		},
		{
			w: &e.keepalive.waker,
			f: e.keepaliveTimerExpired,
		},
		{
			// Notifications: n is a bit mask, so several of the
			// conditions below may apply to a single wake-up.
			w: &e.notificationWaker,
			f: func() tcpip.Error {
				n := e.fetchNotifications()
				if n&notifyNonZeroReceiveWindow != 0 {
					e.rcv.nonZeroWindow()
				}

				if n&notifyMTUChanged != 0 {
					// Snapshot and reset the counters under the
					// send queue lock, then apply them outside it.
					e.sndQueueInfo.sndQueueMu.Lock()
					count := e.sndQueueInfo.PacketTooBigCount
					e.sndQueueInfo.PacketTooBigCount = 0
					mtu := e.sndQueueInfo.SndMTU
					e.sndQueueInfo.sndQueueMu.Unlock()

					e.snd.updateMaxPayloadSize(mtu, count)
				}

				if n&notifyReset != 0 || n&notifyAbort != 0 {
					return &tcpip.ErrConnectionAborted{}
				}

				if n&notifyResetByPeer != 0 {
					return &tcpip.ErrConnectionReset{}
				}

				if n&notifyClose != 0 && e.closed {
					switch e.EndpointState() {
					case StateEstablished:
						// Perform full shutdown if the endpoint is still
						// established. This can occur when notifyClose
						// was asserted just before becoming established.
						e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
					case StateFinWait2:
						// The socket has been closed and we are in FIN_WAIT2
						// so start the FIN_WAIT2 timer.
						if closeTimer == nil {
							closeTimer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, closeWaker.Assert)
						}
					}
				}

				if n&notifyKeepaliveChanged != 0 {
					// The timer could fire in background
					// when the endpoint is drained. That's
					// OK. See above.
					e.resetKeepaliveTimer(true)
				}

				if n&notifyDrain != 0 {
					// Process everything that is queued before
					// acknowledging the drain.
					for !e.segmentQueue.empty() {
						if err := e.handleSegmentsLocked(false /* fastPath */); err != nil {
							return err
						}
					}
					if !e.EndpointState().closed() {
						// Only block the worker if the endpoint
						// is not in closed state or error state.
						close(e.drainDone)
						e.mu.Unlock() // +checklocksforce
						<-e.undrain
						e.mu.Lock()
					}
				}

				if n&notifyTickleWorker != 0 {
					// Just a tickle notification. No need to do
					// anything.
					return nil
				}

				return nil
			},
		},
		{
			w: &e.snd.reorderWaker,
			f: e.snd.rc.reorderTimerExpired,
		},
	}

	// Initialize the sleeper based on the wakers in funcs.
	var s sleep.Sleeper
	for i := range funcs {
		s.AddWaker(funcs[i].w, i)
	}

	// Notify the caller that the waker initialization is complete and the
	// endpoint is ready.
	if wakerInitDone != nil {
		close(wakerInitDone)
	}

	// Tell waiters that the endpoint is connected and writable.
	e.waiterQueue.Notify(waiter.WritableEvents)

	// The following assertions and notifications are needed for restored
	// endpoints. Fresh newly created endpoints have empty states and should
	// not invoke any.
	if !e.segmentQueue.empty() {
		e.newSegmentWaker.Assert()
	}

	e.rcvQueueInfo.rcvQueueMu.Lock()
	if !e.rcvQueueInfo.rcvQueue.Empty() {
		e.waiterQueue.Notify(waiter.ReadableEvents)
	}
	e.rcvQueueInfo.rcvQueueMu.Unlock()

	if e.workerCleanup {
		e.notifyProtocolGoroutine(notifyClose)
	}

	// Main loop. Handle segments until both send and receive ends of the
	// connection have completed.
	//
	// cleanupOnError updates the connected counter, requests worker cleanup
	// and, for a non-nil err, resets the connection with that error.
	cleanupOnError := func(err tcpip.Error) {
		e.stack.Stats().TCP.CurrentConnected.Decrement()
		e.workerCleanup = true
		if err != nil {
			e.resetConnectionLocked(err)
		}
	}

loop:
	for {
		switch e.EndpointState() {
		case StateTimeWait, StateClose, StateError:
			break loop
		}

		// Block for the next waker without holding e.mu; v is the index
		// into funcs of the waker that fired.
		e.mu.Unlock()
		v, _ := s.Fetch(true /* block */)
		e.mu.Lock()

		// We need to double check here because the notification may be
		// stale by the time we got around to processing it.
		switch e.EndpointState() {
		case StateError:
			// If the endpoint has already transitioned to an ERROR
			// state just pass nil here as any reset that may need
			// to be sent etc should already have been done and we
			// just want to terminate the loop and cleanup the
			// endpoint.
			cleanupOnError(nil)
			e.protocolMainLoopDone(closeTimer, &closeWaker)
			return nil
		case StateTimeWait:
			fallthrough
		case StateClose:
			break loop
		default:
			// A handler error ends the loop; the error is consumed by
			// cleanupOnError and nil is returned to the caller.
			if err := funcs[v].f(); err != nil {
				cleanupOnError(err)
				e.protocolMainLoopDone(closeTimer, &closeWaker)
				return nil
			}
		}
	}

	var reuseTW func()
	if e.EndpointState() == StateTimeWait {
		// Disable close timer as we now entering real TIME_WAIT.
		if closeTimer != nil {
			closeTimer.Stop()
		}
		// Mark the current sleeper done so as to free all associated
		// wakers.
		s.Done()
		// Wake up any waiters before we enter TIME_WAIT.
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
		e.workerCleanup = true
		reuseTW = e.doTimeWait()
	}

	// Handle any StateError transition from StateTimeWait.
	if e.EndpointState() == StateError {
		cleanupOnError(nil)
		e.protocolMainLoopDone(closeTimer, &closeWaker)
		return nil
	}

	e.transitionToStateCloseLocked()

	e.protocolMainLoopDone(closeTimer, &closeWaker)

	// A new SYN was received during TIME_WAIT and we need to abort
	// the timewait and redirect the segment to the listener queue
	if reuseTW != nil {
		reuseTW()
	}

	return nil
}
  1611  
  1612  // handleTimeWaitSegments processes segments received during TIME_WAIT
  1613  // state.
  1614  func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
  1615  	checkRequeue := true
  1616  	for i := 0; i < maxSegmentsPerWake; i++ {
  1617  		s := e.segmentQueue.dequeue()
  1618  		if s == nil {
  1619  			checkRequeue = false
  1620  			break
  1621  		}
  1622  		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
  1623  		if newSyn {
  1624  			info := e.TransportEndpointInfo
  1625  			newID := info.ID
  1626  			newID.RemoteAddress = ""
  1627  			newID.RemotePort = 0
  1628  			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
  1629  			// If the local address is an IPv4 address then also
  1630  			// look for IPv6 dual stack endpoints that might be
  1631  			// listening on the local address.
  1632  			if newID.LocalAddress.To4() != "" {
  1633  				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
  1634  			}
  1635  			for _, netProto := range netProtos {
  1636  				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.nicID); listenEP != nil {
  1637  					tcpEP := listenEP.(*endpoint)
  1638  					if EndpointState(tcpEP.State()) == StateListen {
  1639  						reuseTW = func() {
  1640  							if !tcpEP.enqueueSegment(s) {
  1641  								s.decRef()
  1642  								return
  1643  							}
  1644  							tcpEP.newSegmentWaker.Assert()
  1645  						}
  1646  						// We explicitly do not decRef
  1647  						// the segment as it's still
  1648  						// valid and being reflected to
  1649  						// a listening endpoint.
  1650  						return false, reuseTW
  1651  					}
  1652  				}
  1653  			}
  1654  		}
  1655  		if extTW {
  1656  			extendTimeWait = true
  1657  		}
  1658  		s.decRef()
  1659  	}
  1660  	if checkRequeue && !e.segmentQueue.empty() {
  1661  		e.newSegmentWaker.Assert()
  1662  	}
  1663  	return extendTimeWait, nil
  1664  }
  1665  
  1666  // doTimeWait is responsible for handling the TCP behaviour once a socket
  1667  // enters the TIME_WAIT state. Optionally it can return a closure that
  1668  // should be executed after releasing the endpoint registrations. This is
  1669  // done in cases where a new SYN is received during TIME_WAIT that carries
  1670  // a sequence number larger than one see on the connection.
  1671  // +checklocks:e.mu
  1672  func (e *endpoint) doTimeWait() (twReuse func()) {
  1673  	// Trigger a 2 * MSL time wait state. During this period
  1674  	// we will drop all incoming segments.
  1675  	// NOTE: On Linux this is not configurable and is fixed at 60 seconds.
  1676  	timeWaitDuration := DefaultTCPTimeWaitTimeout
  1677  
  1678  	// Get the stack wide configuration.
  1679  	var tcpTW tcpip.TCPTimeWaitTimeoutOption
  1680  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
  1681  		timeWaitDuration = time.Duration(tcpTW)
  1682  	}
  1683  
  1684  	const newSegment = 1
  1685  	const notification = 2
  1686  	const timeWaitDone = 3
  1687  
  1688  	var s sleep.Sleeper
  1689  	defer s.Done()
  1690  	s.AddWaker(&e.newSegmentWaker, newSegment)
  1691  	s.AddWaker(&e.notificationWaker, notification)
  1692  
  1693  	var timeWaitWaker sleep.Waker
  1694  	s.AddWaker(&timeWaitWaker, timeWaitDone)
  1695  	timeWaitTimer := e.stack.Clock().AfterFunc(timeWaitDuration, timeWaitWaker.Assert)
  1696  	defer timeWaitTimer.Stop()
  1697  
  1698  	for {
  1699  		e.mu.Unlock()
  1700  		v, _ := s.Fetch(true /* block */)
  1701  		e.mu.Lock()
  1702  		switch v {
  1703  		case newSegment:
  1704  			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
  1705  			if reuseTW != nil {
  1706  				return reuseTW
  1707  			}
  1708  			if extendTimeWait {
  1709  				timeWaitTimer.Reset(timeWaitDuration)
  1710  			}
  1711  		case notification:
  1712  			n := e.fetchNotifications()
  1713  			if n&notifyAbort != 0 {
  1714  				return nil
  1715  			}
  1716  			if n&notifyDrain != 0 {
  1717  				for !e.segmentQueue.empty() {
  1718  					// Ignore extending TIME_WAIT during a
  1719  					// save. For sockets in TIME_WAIT we just
  1720  					// terminate the TIME_WAIT early.
  1721  					e.handleTimeWaitSegments()
  1722  				}
  1723  				close(e.drainDone)
  1724  				e.mu.Unlock()
  1725  				<-e.undrain
  1726  				e.mu.Lock()
  1727  				return nil
  1728  			}
  1729  		case timeWaitDone:
  1730  			return nil
  1731  		}
  1732  	}
  1733  }