github.com/vpnishe/netstack@v1.10.6/tcpip/transport/tcp/connect.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"encoding/binary"
    19  	"sync"
    20  	"time"
    21  
    22  	"github.com/vpnishe/netstack/rand"
    23  	"github.com/vpnishe/netstack/sleep"
    24  	"github.com/vpnishe/netstack/tcpip"
    25  	"github.com/vpnishe/netstack/tcpip/buffer"
    26  	"github.com/vpnishe/netstack/tcpip/hash/jenkins"
    27  	"github.com/vpnishe/netstack/tcpip/header"
    28  	"github.com/vpnishe/netstack/tcpip/seqnum"
    29  	"github.com/vpnishe/netstack/tcpip/stack"
    30  	"github.com/vpnishe/netstack/waiter"
    31  )
    32  
// maxSegmentsPerWake is the maximum number of segments to process in the main
// protocol goroutine per wake-up. Yielding after this number of segments are
// processed allows other events to be processed as well (e.g., timeouts,
// resets, etc.). It also bounds the loop in handshake.processSegments.
const maxSegmentsPerWake = 100
    38  
// handshakeState identifies the stage a handshake object has reached.
type handshakeState int

// The following are the possible states of the TCP connection during a 3-way
// handshake. A depiction of the states and transitions can be found in RFC 793,
// page 23.
const (
	// handshakeSynSent is the state installed by resetState: our SYN is
	// outstanding and the peer's SYN has not been seen yet.
	handshakeSynSent handshakeState = iota

	// handshakeSynRcvd means the peer's SYN has been recorded and we are
	// waiting for our own SYN to be acknowledged.
	handshakeSynRcvd

	// handshakeCompleted means the handshake finished; execute stops looping
	// once this state is reached.
	handshakeCompleted
)
    49  
// The following are used to set up sleepers. Each value is the id passed to
// sleep.Sleeper.AddWaker and later returned by Fetch to tell which waker fired.
const (
	// wakerForNotification identifies the endpoint's notification waker.
	wakerForNotification = iota

	// wakerForNewSegment identifies the endpoint's new-segment waker.
	wakerForNewSegment

	// wakerForResend identifies the SYN retransmission timer waker.
	wakerForResend

	// wakerForResolution identifies the link-address resolution waker.
	wakerForResolution
)
    57  
const (
	// Maximum space available for options. Buffers handed out by optionPool
	// are allocated with exactly this many bytes.
	maxOptionSize = 40
)
    62  
// handshake holds the state used during a TCP 3-way handshake.
type handshake struct {
	// ep is the endpoint on whose behalf the handshake is performed.
	ep *endpoint

	// state is the current stage of the handshake.
	state handshakeState

	// active is set by newHandshake and cleared by resetToSynRcvd. When it
	// is false, synRcvdState fails with ErrInvalidEndpointState instead of
	// restarting the handshake on a conflicting SYN.
	active bool

	// flags holds the TCP flags placed on the SYN segments we send: SYN
	// initially, with ACK added once the peer's SYN has been recorded.
	flags uint8

	// ackNum is the acknowledgement number carried by the segments we send;
	// it is the peer's initial sequence number plus one once that is known.
	ackNum seqnum.Value

	// iss is the initial send sequence number, as defined in RFC 793.
	iss seqnum.Value

	// rcvWnd is the receive window, as defined in RFC 793.
	rcvWnd seqnum.Size

	// sndWnd is the send window, as defined in RFC 793. It is refreshed by
	// handleSegment from the window field of every incoming segment.
	sndWnd seqnum.Size

	// mss is the maximum segment size received from the peer.
	mss uint16

	// sndWndScale is the send window scale, as defined in RFC 1323. A
	// negative value means no scaling is supported by the peer.
	sndWndScale int

	// rcvWndScale is the receive window scale, as defined in RFC 1323.
	rcvWndScale int
}
    90  
    91  func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
    92  	rcvWndScale := ep.rcvWndScaleForHandshake()
    93  
    94  	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
    95  	// window offered in SYN won't be reduced due to the loss of precision if
    96  	// window scaling is enabled after the handshake.
    97  	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
    98  
    99  	// Ensure we can always accept at least 1 byte if the scale specified
   100  	// was too high for the provided rcvWnd.
   101  	if rcvWnd == 0 {
   102  		rcvWnd = 1
   103  	}
   104  
   105  	h := handshake{
   106  		ep:          ep,
   107  		active:      true,
   108  		rcvWnd:      rcvWnd,
   109  		rcvWndScale: int(rcvWndScale),
   110  	}
   111  	h.resetState()
   112  	return h
   113  }
   114  
   115  // FindWndScale determines the window scale to use for the given maximum window
   116  // size.
   117  func FindWndScale(wnd seqnum.Size) int {
   118  	if wnd < 0x10000 {
   119  		return 0
   120  	}
   121  
   122  	max := seqnum.Size(0xffff)
   123  	s := 0
   124  	for wnd > max && s < header.MaxWndScale {
   125  		s++
   126  		max <<= 1
   127  	}
   128  
   129  	return s
   130  }
   131  
   132  // resetState resets the state of the handshake object such that it becomes
   133  // ready for a new 3-way handshake.
   134  func (h *handshake) resetState() {
   135  	b := make([]byte, 4)
   136  	if _, err := rand.Read(b); err != nil {
   137  		panic(err)
   138  	}
   139  
   140  	h.state = handshakeSynSent
   141  	h.flags = header.TCPFlagSyn
   142  	h.ackNum = 0
   143  	h.mss = 0
   144  	h.iss = generateSecureISN(h.ep.ID, h.ep.stack.Seed())
   145  }
   146  
   147  // generateSecureISN generates a secure Initial Sequence number based on the
   148  // recommendation here https://tools.ietf.org/html/rfc6528#page-3.
   149  func generateSecureISN(id stack.TransportEndpointID, seed uint32) seqnum.Value {
   150  	isnHasher := jenkins.Sum32(seed)
   151  	isnHasher.Write([]byte(id.LocalAddress))
   152  	isnHasher.Write([]byte(id.RemoteAddress))
   153  	portBuf := make([]byte, 2)
   154  	binary.LittleEndian.PutUint16(portBuf, id.LocalPort)
   155  	isnHasher.Write(portBuf)
   156  	binary.LittleEndian.PutUint16(portBuf, id.RemotePort)
   157  	isnHasher.Write(portBuf)
   158  	// The time period here is 64ns. This is similar to what linux uses
   159  	// generate a sequence number that overlaps less than one
   160  	// time per MSL (2 minutes).
   161  	//
   162  	// A 64ns clock ticks 10^9/64 = 15625000) times in a second.
   163  	// To wrap the whole 32 bit space would require
   164  	// 2^32/1562500 ~ 274 seconds.
   165  	//
   166  	// Which sort of guarantees that we won't reuse the ISN for a new
   167  	// connection for the same tuple for at least 274s.
   168  	isn := isnHasher.Sum32() + uint32(time.Now().UnixNano()>>6)
   169  	return seqnum.Value(isn)
   170  }
   171  
   172  // effectiveRcvWndScale returns the effective receive window scale to be used.
   173  // If the peer doesn't support window scaling, the effective rcv wnd scale is
   174  // zero; otherwise it's the value calculated based on the initial rcv wnd.
   175  func (h *handshake) effectiveRcvWndScale() uint8 {
   176  	if h.sndWndScale < 0 {
   177  		return 0
   178  	}
   179  	return uint8(h.rcvWndScale)
   180  }
   181  
   182  // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
   183  // state.
   184  func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
   185  	h.active = false
   186  	h.state = handshakeSynRcvd
   187  	h.flags = header.TCPFlagSyn | header.TCPFlagAck
   188  	h.iss = iss
   189  	h.ackNum = irs + 1
   190  	h.mss = opts.MSS
   191  	h.sndWndScale = opts.WS
   192  	h.ep.mu.Lock()
   193  	h.ep.state = StateSynRecv
   194  	h.ep.mu.Unlock()
   195  }
   196  
   197  // checkAck checks if the ACK number, if present, of a segment received during
   198  // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in
   199  // response.
   200  func (h *handshake) checkAck(s *segment) bool {
   201  	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber != h.iss+1 {
   202  		// RFC 793, page 36, states that a reset must be generated when
   203  		// the connection is in any non-synchronized state and an
   204  		// incoming segment acknowledges something not yet sent. The
   205  		// connection remains in the same state.
   206  		ack := s.sequenceNumber.Add(s.logicalLen())
   207  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0)
   208  		return false
   209  	}
   210  
   211  	return true
   212  }
   213  
// synSentState handles a segment received when the TCP 3-way handshake is in
// the SYN-SENT state.
//
// It returns tcpip.ErrConnectionRefused when the segment is a RST whose ACK
// acknowledges our SYN, and nil in every other case, including segments that
// are ignored. On a SYN-ACK the handshake moves to handshakeCompleted; on a
// bare SYN (simultaneous open) it moves to handshakeSynRcvd and a SYN-ACK is
// sent.
func (h *handshake) synSentState(s *segment) *tcpip.Error {
	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
	// acceptable if the ack field acknowledges the SYN.
	if s.flagIsSet(header.TCPFlagRst) {
		if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
			return tcpip.ErrConnectionRefused
		}
		// Any other reset is ignored.
		return nil
	}

	// checkAck has already sent a RST if the ACK number was unacceptable.
	if !h.checkAck(s) {
		return nil
	}

	// We are in the SYN-SENT state. We only care about segments that have
	// the SYN flag.
	if !s.flagIsSet(header.TCPFlagSyn) {
		return nil
	}

	// Parse the SYN options.
	rcvSynOpts := parseSynSegmentOptions(s)

	// Remember if the Timestamp option was negotiated.
	h.ep.maybeEnableTimestamp(&rcvSynOpts)

	// Remember if the SACKPermitted option was negotiated.
	h.ep.maybeEnableSACKPermitted(&rcvSynOpts)

	// Remember the sequence we'll ack from now on.
	h.ackNum = s.sequenceNumber + 1
	h.flags |= header.TCPFlagAck
	h.mss = rcvSynOpts.MSS
	h.sndWndScale = rcvSynOpts.WS

	// If this is a SYN ACK response, we only need to acknowledge the SYN
	// and the handshake is completed.
	if s.flagIsSet(header.TCPFlagAck) {
		h.state = handshakeCompleted
		// This ACK is not a SYN, so the advertised window is shifted
		// down by the effective receive window scale.
		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
		return nil
	}

	// A SYN segment was received, but no ACK in it. We acknowledge the SYN
	// but resend our own SYN and wait for it to be acknowledged in the
	// SYN-RCVD state.
	h.state = handshakeSynRcvd
	// The endpoint state is changed, and its ttl read, under the endpoint
	// mutex.
	h.ep.mu.Lock()
	h.ep.state = StateSynRecv
	ttl := h.ep.ttl
	h.ep.mu.Unlock()
	synOpts := header.TCPSynOptions{
		// NOTE(review): effectiveRcvWndScale returns 0 when the peer sent
		// no window-scale option, so a WS option with value 0 is still
		// emitted here (makeSynOptions encodes any WS >= 0), whereas
		// execute uses -1 in that situation. Confirm this is intended.
		WS:    int(h.effectiveRcvWndScale()),
		TS:    rcvSynOpts.TS,
		TSVal: h.ep.timestamp(),
		TSEcr: h.ep.recentTS,

		// We only send SACKPermitted if the other side indicated it
		// permits SACK. This is not explicitly defined in the RFC but
		// this is the behaviour implemented by Linux.
		SACKPermitted: rcvSynOpts.SACKPermitted,
		MSS:           h.ep.amss,
	}
	// A zero ttl means "not configured"; fall back to the route default.
	if ttl == 0 {
		ttl = s.route.DefaultTTL()
	}
	// The window in a SYN is sent unscaled; sendTCP clamps it to 16 bits.
	h.ep.sendSynTCP(&s.route, h.ep.ID, ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
	return nil
}
   285  
// synRcvdState handles a segment received when the TCP 3-way handshake is in
// the SYN-RCVD state.
//
// It returns tcpip.ErrConnectionRefused for an in-window RST,
// tcpip.ErrInvalidEndpointState when a conflicting SYN arrives on a passive
// (non-active) handshake, and nil otherwise. When an acceptable ACK of our SYN
// is received the handshake moves to handshakeCompleted.
func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
	if s.flagIsSet(header.TCPFlagRst) {
		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
		// is acceptable if the sequence number is in the window.
		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
			return tcpip.ErrConnectionRefused
		}
		// Out-of-window resets are ignored.
		return nil
	}

	// checkAck has already sent a RST if the ACK number was unacceptable.
	if !h.checkAck(s) {
		return nil
	}

	// h.ackNum-1 is the sequence number of the SYN recorded earlier.
	if s.flagIsSet(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
		// We received two SYN segments with different sequence
		// numbers, so we reset this and restart the whole
		// process, except that we don't reset the timer.
		ack := s.sequenceNumber.Add(s.logicalLen())
		seq := seqnum.Value(0)
		if s.flagIsSet(header.TCPFlagAck) {
			seq = s.ackNumber
		}
		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)

		// Only a handshake we initiated is restarted; a passive one
		// fails here.
		if !h.active {
			return tcpip.ErrInvalidEndpointState
		}

		// Back to SYN-SENT with a new ISS, then send a fresh SYN.
		h.resetState()
		synOpts := header.TCPSynOptions{
			WS:            h.rcvWndScale,
			TS:            h.ep.sendTSOk,
			TSVal:         h.ep.timestamp(),
			TSEcr:         h.ep.recentTS,
			SACKPermitted: h.ep.sackPermitted,
			MSS:           h.ep.amss,
		}
		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
		return nil
	}

	// We have previously received (and acknowledged) the peer's SYN. If the
	// peer acknowledges our SYN, the handshake is completed.
	if s.flagIsSet(header.TCPFlagAck) {
		// If the timestamp option is negotiated and the segment does
		// not carry a timestamp option then the segment must be dropped
		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
		if h.ep.sendTSOk && !s.parsedOptions.TS {
			h.ep.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Update timestamp if required. See RFC7323, section-4.3.
		if h.ep.sendTSOk && s.parsedOptions.TS {
			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
		}
		h.state = handshakeCompleted
		return nil
	}

	// Neither a conflicting SYN nor an ACK: nothing to do.
	return nil
}
   351  
   352  func (h *handshake) handleSegment(s *segment) *tcpip.Error {
   353  	h.sndWnd = s.window
   354  	if !s.flagIsSet(header.TCPFlagSyn) && h.sndWndScale > 0 {
   355  		h.sndWnd <<= uint8(h.sndWndScale)
   356  	}
   357  
   358  	switch h.state {
   359  	case handshakeSynRcvd:
   360  		return h.synRcvdState(s)
   361  	case handshakeSynSent:
   362  		return h.synSentState(s)
   363  	}
   364  	return nil
   365  }
   366  
   367  // processSegments goes through the segment queue and processes up to
   368  // maxSegmentsPerWake (if they're available).
   369  func (h *handshake) processSegments() *tcpip.Error {
   370  	for i := 0; i < maxSegmentsPerWake; i++ {
   371  		s := h.ep.segmentQueue.dequeue()
   372  		if s == nil {
   373  			return nil
   374  		}
   375  
   376  		err := h.handleSegment(s)
   377  		s.decRef()
   378  		if err != nil {
   379  			return err
   380  		}
   381  
   382  		// We stop processing packets once the handshake is completed,
   383  		// otherwise we may process packets meant to be processed by
   384  		// the main protocol goroutine.
   385  		if h.state == handshakeCompleted {
   386  			break
   387  		}
   388  	}
   389  
   390  	// If the queue is not empty, make sure we'll wake up in the next
   391  	// iteration.
   392  	if !h.ep.segmentQueue.empty() {
   393  		h.ep.newSegmentWaker.Assert()
   394  	}
   395  
   396  	return nil
   397  }
   398  
// resolveRoute blocks until link-address resolution for the endpoint's route
// finishes, while still servicing endpoint notifications.
//
// It returns nil when resolution succeeds, the resolution error when it fails
// (incrementing the matching send-error counter), or tcpip.ErrAborted when a
// close notification arrives first.
func (h *handshake) resolveRoute() *tcpip.Error {
	// Set up the wakers.
	s := sleep.Sleeper{}
	resolutionWaker := &sleep.Waker{}
	s.AddWaker(resolutionWaker, wakerForResolution)
	s.AddWaker(&h.ep.notificationWaker, wakerForNotification)
	defer s.Done()

	// Initial action is to resolve route.
	index := wakerForResolution
	for {
		switch index {
		case wakerForResolution:
			// ErrWouldBlock means resolution is still in progress and
			// resolutionWaker will fire when it is worth retrying.
			if _, err := h.ep.route.Resolve(resolutionWaker); err != tcpip.ErrWouldBlock {
				if err == tcpip.ErrNoLinkAddress {
					h.ep.stats.SendErrors.NoLinkAddr.Increment()
				} else if err != nil {
					h.ep.stats.SendErrors.NoRoute.Increment()
				}
				// Either success (err == nil) or failure.
				return err
			}
			// Resolution not completed. Keep trying...

		case wakerForNotification:
			n := h.ep.fetchNotifications()
			if n&notifyClose != 0 {
				// Stop the pending resolution from waking us after
				// we have returned.
				h.ep.route.RemoveWaker(resolutionWaker)
				return tcpip.ErrAborted
			}
			if n&notifyDrain != 0 {
				// Signal that draining is done, then block until
				// the endpoint is told to resume.
				close(h.ep.drainDone)
				<-h.ep.undrain
			}
		}

		// Wait for notification.
		index, _ = s.Fetch(true)
	}
}
   439  
// execute executes the TCP 3-way handshake.
//
// It resolves the route if required, sends the initial SYN (or SYN-ACK when
// the handshake is already in the SYN-RCVD state) and then loops, handling
// retransmission timeouts, endpoint notifications and incoming segments until
// the handshake completes.
//
// It returns nil on completion, tcpip.ErrTimeout when the doubled
// retransmission timeout would exceed 60 seconds, tcpip.ErrAborted on a close
// notification, or any error produced by route resolution or segment
// handling.
func (h *handshake) execute() *tcpip.Error {
	if h.ep.route.IsResolutionRequired() {
		if err := h.resolveRoute(); err != nil {
			return err
		}
	}

	// Initialize the resend timer.
	resendWaker := sleep.Waker{}
	timeOut := time.Duration(time.Second)
	rt := time.AfterFunc(timeOut, func() {
		resendWaker.Assert()
	})
	defer rt.Stop()

	// Set up the wakers.
	s := sleep.Sleeper{}
	s.AddWaker(&resendWaker, wakerForResend)
	s.AddWaker(&h.ep.notificationWaker, wakerForNotification)
	s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment)
	defer s.Done()

	var sackEnabled SACKEnabled
	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
		// If stack returned an error when checking for SACKEnabled
		// status then just default to switching off SACK negotiation.
		sackEnabled = false
	}

	// Send the initial SYN segment and loop until the handshake is
	// completed.
	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)

	synOpts := header.TCPSynOptions{
		WS:            h.rcvWndScale,
		TS:            true,
		TSVal:         h.ep.timestamp(),
		TSEcr:         h.ep.recentTS,
		SACKPermitted: bool(sackEnabled),
		MSS:           h.ep.amss,
	}

	// Execute is also called in a listen context so we want to make sure we
	// only send the TS/SACK option when we received the TS/SACK in the
	// initial SYN.
	if h.state == handshakeSynRcvd {
		synOpts.TS = h.ep.sendTSOk
		synOpts.SACKPermitted = h.ep.sackPermitted && bool(sackEnabled)
		if h.sndWndScale < 0 {
			// Disable window scaling if the peer did not send us
			// the window scaling option.
			synOpts.WS = -1
		}
	}
	h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)

	for h.state != handshakeCompleted {
		switch index, _ := s.Fetch(true); index {
		case wakerForResend:
			// Exponential backoff: double the timeout on every
			// retransmission and give up once it exceeds a minute.
			timeOut *= 2
			if timeOut > 60*time.Second {
				return tcpip.ErrTimeout
			}
			rt.Reset(timeOut)
			// The same synOpts are reused, with whatever flags and
			// ack number the handshake currently holds.
			h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)

		case wakerForNotification:
			n := h.ep.fetchNotifications()
			if n&notifyClose != 0 {
				return tcpip.ErrAborted
			}
			if n&notifyDrain != 0 {
				// Process everything already queued before
				// reporting that the drain is done. Note that the
				// inner s shadows the sleeper declared above.
				for !h.ep.segmentQueue.empty() {
					s := h.ep.segmentQueue.dequeue()
					err := h.handleSegment(s)
					s.decRef()
					if err != nil {
						return err
					}
					// NOTE(review): this return happens before
					// drainDone is closed; confirm the caller
					// handles the drain in that case.
					if h.state == handshakeCompleted {
						return nil
					}
				}
				close(h.ep.drainDone)
				<-h.ep.undrain
			}

		case wakerForNewSegment:
			if err := h.processSegments(); err != nil {
				return err
			}
		}
	}

	return nil
}
   537  
   538  func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
   539  	synOpts := header.ParseSynOptions(s.options, s.flagIsSet(header.TCPFlagAck))
   540  	if synOpts.TS {
   541  		s.parsedOptions.TSVal = synOpts.TSVal
   542  		s.parsedOptions.TSEcr = synOpts.TSEcr
   543  	}
   544  	return synOpts
   545  }
   546  
// optionPool recycles the scratch buffers used to encode TCP options. Every
// buffer it creates is maxOptionSize bytes long. Buffers are obtained with
// getOptions and returned with putOptions.
var optionPool = sync.Pool{
	New: func() interface{} {
		return make([]byte, maxOptionSize)
	},
}
   552  
   553  func getOptions() []byte {
   554  	return optionPool.Get().([]byte)
   555  }
   556  
   557  func putOptions(options []byte) {
   558  	// Reslice to full capacity.
   559  	optionPool.Put(options[0:cap(options)])
   560  }
   561  
   562  func makeSynOptions(opts header.TCPSynOptions) []byte {
   563  	// Emulate linux option order. This is as follows:
   564  	//
   565  	// if md5: NOP NOP MD5SIG 18 md5sig(16)
   566  	// if mss: MSS 4 mss(2)
   567  	// if ts and sack_advertise:
   568  	//	SACK 2 TIMESTAMP 2 timestamp(8)
   569  	// elif ts: NOP NOP TIMESTAMP 10 timestamp(8)
   570  	// elif sack: NOP NOP SACK 2
   571  	// if wscale: NOP WINDOW 3 ws(1)
   572  	// if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8))
   573  	//	[for each block] start_seq(4) end_seq(4)
   574  	// if fastopen_cookie:
   575  	//	if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2)
   576  	// 	else: FASTOPEN (2 + len(cookie))
   577  	//	cookie(variable) [padding to four bytes]
   578  	//
   579  	options := getOptions()
   580  
   581  	// Always encode the mss.
   582  	offset := header.EncodeMSSOption(uint32(opts.MSS), options)
   583  
   584  	// Special ordering is required here. If both TS and SACK are enabled,
   585  	// then the SACK option precedes TS, with no padding. If they are
   586  	// enabled individually, then we see padding before the option.
   587  	if opts.TS && opts.SACKPermitted {
   588  		offset += header.EncodeSACKPermittedOption(options[offset:])
   589  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   590  	} else if opts.TS {
   591  		offset += header.EncodeNOP(options[offset:])
   592  		offset += header.EncodeNOP(options[offset:])
   593  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   594  	} else if opts.SACKPermitted {
   595  		offset += header.EncodeNOP(options[offset:])
   596  		offset += header.EncodeNOP(options[offset:])
   597  		offset += header.EncodeSACKPermittedOption(options[offset:])
   598  	}
   599  
   600  	// Initialize the WS option.
   601  	if opts.WS >= 0 {
   602  		offset += header.EncodeNOP(options[offset:])
   603  		offset += header.EncodeWSOption(opts.WS, options[offset:])
   604  	}
   605  
   606  	// Padding to the end; note that this never apply unless we add a
   607  	// fastopen option, we always expect the offset to remain the same.
   608  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   609  		panic("unexpected option encoding")
   610  	}
   611  
   612  	return options[:offset]
   613  }
   614  
   615  func (e *endpoint) sendSynTCP(r *stack.Route, id stack.TransportEndpointID, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts header.TCPSynOptions) *tcpip.Error {
   616  	options := makeSynOptions(opts)
   617  	// We ignore SYN send errors and let the callers re-attempt send.
   618  	if err := e.sendTCP(r, id, buffer.VectorisedView{}, ttl, tos, flags, seq, ack, rcvWnd, options, nil); err != nil {
   619  		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
   620  	}
   621  	putOptions(options)
   622  	return nil
   623  }
   624  
   625  func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
   626  	if err := sendTCP(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso); err != nil {
   627  		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
   628  		return err
   629  	}
   630  	e.stats.SegmentsSent.Increment()
   631  	return nil
   632  }
   633  
   634  func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, d *stack.PacketDescriptor, data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
   635  	optLen := len(opts)
   636  	hdr := &d.Hdr
   637  	packetSize := d.Size
   638  	off := d.Off
   639  	// Initialize the header.
   640  	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
   641  	tcp.Encode(&header.TCPFields{
   642  		SrcPort:    id.LocalPort,
   643  		DstPort:    id.RemotePort,
   644  		SeqNum:     uint32(seq),
   645  		AckNum:     uint32(ack),
   646  		DataOffset: uint8(header.TCPMinimumSize + optLen),
   647  		Flags:      flags,
   648  		WindowSize: uint16(rcvWnd),
   649  	})
   650  	copy(tcp[header.TCPMinimumSize:], opts)
   651  
   652  	length := uint16(hdr.UsedLength() + packetSize)
   653  	xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
   654  	// Only calculate the checksum if offloading isn't supported.
   655  	if gso != nil && gso.NeedsCsum {
   656  		// This is called CHECKSUM_PARTIAL in the Linux kernel. We
   657  		// calculate a checksum of the pseudo-header and save it in the
   658  		// TCP header, then the kernel calculate a checksum of the
   659  		// header and data and get the right sum of the TCP packet.
   660  		tcp.SetChecksum(xsum)
   661  	} else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 {
   662  		xsum = header.ChecksumVVWithOffset(data, xsum, off, packetSize)
   663  		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
   664  	}
   665  
   666  }
   667  
   668  func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
   669  	optLen := len(opts)
   670  	if rcvWnd > 0xffff {
   671  		rcvWnd = 0xffff
   672  	}
   673  
   674  	mss := int(gso.MSS)
   675  	n := (data.Size() + mss - 1) / mss
   676  
   677  	hdrs := stack.NewPacketDescriptors(n, header.TCPMinimumSize+int(r.MaxHeaderLength())+optLen)
   678  
   679  	size := data.Size()
   680  	off := 0
   681  	for i := 0; i < n; i++ {
   682  		packetSize := mss
   683  		if packetSize > size {
   684  			packetSize = size
   685  		}
   686  		size -= packetSize
   687  		hdrs[i].Off = off
   688  		hdrs[i].Size = packetSize
   689  		buildTCPHdr(r, id, &hdrs[i], data, flags, seq, ack, rcvWnd, opts, gso)
   690  		off += packetSize
   691  		seq = seq.Add(seqnum.Size(packetSize))
   692  	}
   693  	if ttl == 0 {
   694  		ttl = r.DefaultTTL()
   695  	}
   696  	sent, err := r.WritePackets(gso, hdrs, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos})
   697  	if err != nil {
   698  		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
   699  	}
   700  	r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent))
   701  	return err
   702  }
   703  
   704  // sendTCP sends a TCP segment with the provided options via the provided
   705  // network endpoint and under the provided identity.
   706  func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
   707  	optLen := len(opts)
   708  	if rcvWnd > 0xffff {
   709  		rcvWnd = 0xffff
   710  	}
   711  
   712  	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
   713  		return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso)
   714  	}
   715  
   716  	d := &stack.PacketDescriptor{
   717  		Hdr:  buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
   718  		Off:  0,
   719  		Size: data.Size(),
   720  	}
   721  	buildTCPHdr(r, id, d, data, flags, seq, ack, rcvWnd, opts, gso)
   722  
   723  	if ttl == 0 {
   724  		ttl = r.DefaultTTL()
   725  	}
   726  	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{
   727  		Header: d.Hdr,
   728  		Data:   data,
   729  	}); err != nil {
   730  		r.Stats().TCP.SegmentSendErrors.Increment()
   731  		return err
   732  	}
   733  	r.Stats().TCP.SegmentsSent.Increment()
   734  	if (flags & header.TCPFlagRst) != 0 {
   735  		r.Stats().TCP.ResetsSent.Increment()
   736  	}
   737  	return nil
   738  }
   739  
   740  // makeOptions makes an options slice.
   741  func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
   742  	options := getOptions()
   743  	offset := 0
   744  
   745  	// N.B. the ordering here matches the ordering used by Linux internally
   746  	// and described in the raw makeOptions function. We don't include
   747  	// unnecessary cases here (post connection.)
   748  	if e.sendTSOk {
   749  		// Embed the timestamp if timestamp has been enabled.
   750  		//
   751  		// We only use the lower 32 bits of the unix time in
   752  		// milliseconds. This is similar to what Linux does where it
   753  		// uses the lower 32 bits of the jiffies value in the tsVal
   754  		// field of the timestamp option.
   755  		//
   756  		// Further, RFC7323 section-5.4 recommends millisecond
   757  		// resolution as the lowest recommended resolution for the
   758  		// timestamp clock.
   759  		//
   760  		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
   761  		offset += header.EncodeNOP(options[offset:])
   762  		offset += header.EncodeNOP(options[offset:])
   763  		offset += header.EncodeTSOption(e.timestamp(), uint32(e.recentTS), options[offset:])
   764  	}
   765  	if e.sackPermitted && len(sackBlocks) > 0 {
   766  		offset += header.EncodeNOP(options[offset:])
   767  		offset += header.EncodeNOP(options[offset:])
   768  		offset += header.EncodeSACKBlocks(sackBlocks, options[offset:])
   769  	}
   770  
   771  	// We expect the above to produce an aligned offset.
   772  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   773  		panic("unexpected option encoding")
   774  	}
   775  
   776  	return options[:offset]
   777  }
   778  
   779  // sendRaw sends a TCP segment to the endpoint's peer.
   780  func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
   781  	var sackBlocks []header.SACKBlock
   782  	if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
   783  		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
   784  	}
   785  	options := e.makeOptions(sackBlocks)
   786  	err := e.sendTCP(&e.route, e.ID, data, e.ttl, e.sendTOS, flags, seq, ack, rcvWnd, options, e.gso)
   787  	putOptions(options)
   788  	return err
   789  }
   790  
// handleWrite transfers everything queued in the endpoint's send queue to the
// sender's write list and asks the sender to transmit. It always returns nil.
func (e *endpoint) handleWrite() *tcpip.Error {
	// Move packets from send queue to send list. The queue is accessible
	// from other goroutines and protected by the send mutex, while the send
	// list is only accessible from the handler goroutine, so it needs no
	// mutexes.
	e.sndBufMu.Lock()

	// Remember the head of the queue before it is spliced away; it is the
	// first newly added segment.
	first := e.sndQueue.Front()
	if first != nil {
		e.snd.writeList.PushBackList(&e.sndQueue)
		// Account for the queued bytes on the sender side and reset the
		// queue's byte counter.
		e.snd.sndNxtList.UpdateForward(e.sndBufInQueue)
		e.sndBufInQueue = 0
	}

	e.sndBufMu.Unlock()

	// Initialize the next segment to write if it's currently nil.
	if e.snd.writeNext == nil {
		e.snd.writeNext = first
	}

	// Push out any new packets.
	e.snd.sendData()

	return nil
}
   817  
   818  func (e *endpoint) handleClose() *tcpip.Error {
   819  	// Drain the send queue.
   820  	e.handleWrite()
   821  
   822  	// Mark send side as closed.
   823  	e.snd.closed = true
   824  
   825  	return nil
   826  }
   827  
   828  // resetConnectionLocked puts the endpoint in an error state with the given
   829  // error code and sends a RST if and only if the error is not ErrConnectionReset
   830  // indicating that the connection is being reset due to receiving a RST. This
   831  // method must only be called from the protocol goroutine.
   832  func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
   833  	// Only send a reset if the connection is being aborted for a reason
   834  	// other than receiving a reset.
   835  	if e.state == StateEstablished || e.state == StateCloseWait {
   836  		e.stack.Stats().TCP.EstablishedResets.Increment()
   837  		e.stack.Stats().TCP.CurrentEstablished.Decrement()
   838  	}
   839  	e.state = StateError
   840  	e.HardError = err
   841  	if err != tcpip.ErrConnectionReset {
   842  		// The exact sequence number to be used for the RST is the same as the
   843  		// one used by Linux. We need to handle the case of window being shrunk
   844  		// which can cause sndNxt to be outside the acceptable window on the
   845  		// receiver.
   846  		//
   847  		// See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more
   848  		// information.
   849  		sndWndEnd := e.snd.sndUna.Add(e.snd.sndWnd)
   850  		resetSeqNum := sndWndEnd
   851  		if !sndWndEnd.LessThan(e.snd.sndNxt) || e.snd.sndNxt.Size(sndWndEnd) < (1<<e.snd.sndWndScale) {
   852  			resetSeqNum = e.snd.sndNxt
   853  		}
   854  		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.rcvNxt, 0)
   855  	}
   856  }
   857  
   858  // completeWorkerLocked is called by the worker goroutine when it's about to
   859  // exit. It marks the worker as completed and performs cleanup work if requested
   860  // by Close().
   861  func (e *endpoint) completeWorkerLocked() {
   862  	e.workerRunning = false
   863  	if e.workerCleanup {
   864  		e.cleanupLocked()
   865  	}
   866  }
   867  
   868  // transitionToStateCloseLocked ensures that the endpoint is
   869  // cleaned up from the transport demuxer, "before" moving to
   870  // StateClose. This will ensure that no packet will be
   871  // delivered to this endpoint from the demuxer when the endpoint
   872  // is transitioned to StateClose.
   873  func (e *endpoint) transitionToStateCloseLocked() {
   874  	if e.state == StateClose {
   875  		return
   876  	}
   877  	e.cleanupLocked()
   878  	e.state = StateClose
   879  }
   880  
   881  // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
   882  // segment to any other endpoint other than the current one. This is called
   883  // only when the endpoint is in StateClose and we want to deliver the segment
   884  // to any other listening endpoint. We reply with RST if we cannot find one.
   885  func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
   886  	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route)
   887  	if ep == nil {
   888  		replyWithReset(s)
   889  		s.decRef()
   890  		return
   891  	}
   892  	ep.(*endpoint).enqueueSegment(s)
   893  }
   894  
   895  func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
   896  	if e.rcv.acceptable(s.sequenceNumber, 0) {
   897  		// RFC 793, page 37 states that "in all states
   898  		// except SYN-SENT, all reset (RST) segments are
   899  		// validated by checking their SEQ-fields." So
   900  		// we only process it if it's acceptable.
   901  		s.decRef()
   902  		e.mu.Lock()
   903  		switch e.state {
   904  		// In case of a RST in CLOSE-WAIT linux moves
   905  		// the socket to closed state with an error set
   906  		// to indicate EPIPE.
   907  		//
   908  		// Technically this seems to be at odds w/ RFC.
   909  		// As per https://tools.ietf.org/html/rfc793#section-2.7
   910  		// page 69 the behavior for a segment arriving
   911  		// w/ RST bit set in CLOSE-WAIT is inlined below.
   912  		//
   913  		//  ESTABLISHED
   914  		//  FIN-WAIT-1
   915  		//  FIN-WAIT-2
   916  		//  CLOSE-WAIT
   917  
   918  		//  If the RST bit is set then, any outstanding RECEIVEs and
   919  		//  SEND should receive "reset" responses. All segment queues
   920  		//  should be flushed.  Users should also receive an unsolicited
   921  		//  general "connection reset" signal. Enter the CLOSED state,
   922  		//  delete the TCB, and return.
   923  		case StateCloseWait:
   924  			e.transitionToStateCloseLocked()
   925  			e.HardError = tcpip.ErrAborted
   926  			e.mu.Unlock()
   927  			return false, nil
   928  		default:
   929  			e.mu.Unlock()
   930  			return false, tcpip.ErrConnectionReset
   931  		}
   932  	}
   933  	return true, nil
   934  }
   935  
   936  // handleSegments pulls segments from the queue and processes them. It returns
   937  // no error if the protocol loop should continue, an error otherwise.
   938  func (e *endpoint) handleSegments() *tcpip.Error {
   939  	checkRequeue := true
   940  	for i := 0; i < maxSegmentsPerWake; i++ {
   941  		e.mu.RLock()
   942  		state := e.state
   943  		e.mu.RUnlock()
   944  		if state == StateClose {
   945  			// When we get into StateClose while processing from the queue,
   946  			// return immediately and let the protocolMainloop handle it.
   947  			//
   948  			// We can reach StateClose only while processing a previous segment
   949  			// or a notification from the protocolMainLoop (caller goroutine).
   950  			// This means that with this return, the segment dequeue below can
   951  			// never occur on a closed endpoint.
   952  			return nil
   953  		}
   954  
   955  		s := e.segmentQueue.dequeue()
   956  		if s == nil {
   957  			checkRequeue = false
   958  			break
   959  		}
   960  
   961  		// Invoke the tcp probe if installed.
   962  		if e.probe != nil {
   963  			e.probe(e.completeState())
   964  		}
   965  
   966  		if s.flagIsSet(header.TCPFlagRst) {
   967  			if ok, err := e.handleReset(s); !ok {
   968  				return err
   969  			}
   970  		} else if s.flagIsSet(header.TCPFlagSyn) {
   971  			// See: https://tools.ietf.org/html/rfc5961#section-4.1
   972  			//   1) If the SYN bit is set, irrespective of the sequence number, TCP
   973  			//    MUST send an ACK (also referred to as challenge ACK) to the remote
   974  			//    peer:
   975  			//
   976  			//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
   977  			//
   978  			//    After sending the acknowledgment, TCP MUST drop the unacceptable
   979  			//    segment and stop processing further.
   980  			//
   981  			// By sending an ACK, the remote peer is challenged to confirm the loss
   982  			// of the previous connection and the request to start a new connection.
   983  			// A legitimate peer, after restart, would not have a TCB in the
   984  			// synchronized state.  Thus, when the ACK arrives, the peer should send
   985  			// a RST segment back with the sequence number derived from the ACK
   986  			// field that caused the RST.
   987  
   988  			// This RST will confirm that the remote peer has indeed closed the
   989  			// previous connection.  Upon receipt of a valid RST, the local TCP
   990  			// endpoint MUST terminate its connection.  The local TCP endpoint
   991  			// should then rely on SYN retransmission from the remote end to
   992  			// re-establish the connection.
   993  
   994  			e.snd.sendAck()
   995  		} else if s.flagIsSet(header.TCPFlagAck) {
   996  			// Patch the window size in the segment according to the
   997  			// send window scale.
   998  			s.window <<= e.snd.sndWndScale
   999  
  1000  			// RFC 793, page 41 states that "once in the ESTABLISHED
  1001  			// state all segments must carry current acknowledgment
  1002  			// information."
  1003  			drop, err := e.rcv.handleRcvdSegment(s)
  1004  			if err != nil {
  1005  				s.decRef()
  1006  				return err
  1007  			}
  1008  			if drop {
  1009  				s.decRef()
  1010  				continue
  1011  			}
  1012  			e.snd.handleRcvdSegment(s)
  1013  		}
  1014  		s.decRef()
  1015  	}
  1016  
  1017  	// If the queue is not empty, make sure we'll wake up in the next
  1018  	// iteration.
  1019  	if checkRequeue && !e.segmentQueue.empty() {
  1020  		e.newSegmentWaker.Assert()
  1021  	}
  1022  
  1023  	// Send an ACK for all processed packets if needed.
  1024  	if e.rcv.rcvNxt != e.snd.maxSentAck {
  1025  		e.snd.sendAck()
  1026  	}
  1027  
  1028  	e.resetKeepaliveTimer(true)
  1029  
  1030  	return nil
  1031  }
  1032  
  1033  // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
  1034  // keepalive packets periodically when the connection is idle. If we don't hear
  1035  // from the other side after a number of tries, we terminate the connection.
  1036  func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
  1037  	e.keepalive.Lock()
  1038  	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
  1039  		e.keepalive.Unlock()
  1040  		return nil
  1041  	}
  1042  
  1043  	if e.keepalive.unacked >= e.keepalive.count {
  1044  		e.keepalive.Unlock()
  1045  		return tcpip.ErrTimeout
  1046  	}
  1047  
  1048  	// RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with
  1049  	// seg.seq = snd.nxt-1.
  1050  	e.keepalive.unacked++
  1051  	e.keepalive.Unlock()
  1052  	e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.sndNxt-1)
  1053  	e.resetKeepaliveTimer(false)
  1054  	return nil
  1055  }
  1056  
  1057  // resetKeepaliveTimer restarts or stops the keepalive timer, depending on
  1058  // whether it is enabled for this endpoint.
  1059  func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
  1060  	e.keepalive.Lock()
  1061  	defer e.keepalive.Unlock()
  1062  	if receivedData {
  1063  		e.keepalive.unacked = 0
  1064  	}
  1065  	// Start the keepalive timer IFF it's enabled and there is no pending
  1066  	// data to send.
  1067  	if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
  1068  		e.keepalive.timer.disable()
  1069  		return
  1070  	}
  1071  	if e.keepalive.unacked > 0 {
  1072  		e.keepalive.timer.enable(e.keepalive.interval)
  1073  	} else {
  1074  		e.keepalive.timer.enable(e.keepalive.idle)
  1075  	}
  1076  }
  1077  
  1078  // disableKeepaliveTimer stops the keepalive timer.
  1079  func (e *endpoint) disableKeepaliveTimer() {
  1080  	e.keepalive.Lock()
  1081  	e.keepalive.timer.disable()
  1082  	e.keepalive.Unlock()
  1083  }
  1084  
  1085  // protocolMainLoop is the main loop of the TCP protocol. It runs in its own
  1086  // goroutine and is responsible for sending segments and handling received
  1087  // segments.
  1088  func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
  1089  	var closeTimer *time.Timer
  1090  	var closeWaker sleep.Waker
  1091  
  1092  	epilogue := func() {
  1093  		// e.mu is expected to be hold upon entering this section.
  1094  
  1095  		if e.snd != nil {
  1096  			e.snd.resendTimer.cleanup()
  1097  		}
  1098  
  1099  		if closeTimer != nil {
  1100  			closeTimer.Stop()
  1101  		}
  1102  
  1103  		e.completeWorkerLocked()
  1104  
  1105  		if e.drainDone != nil {
  1106  			close(e.drainDone)
  1107  		}
  1108  
  1109  		e.mu.Unlock()
  1110  		// When the protocol loop exits we should wake up our waiters.
  1111  		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
  1112  	}
  1113  
  1114  	if handshake {
  1115  		// This is an active connection, so we must initiate the 3-way
  1116  		// handshake, and then inform potential waiters about its
  1117  		// completion.
  1118  		initialRcvWnd := e.initialReceiveWindow()
  1119  		h := newHandshake(e, seqnum.Size(initialRcvWnd))
  1120  		e.mu.Lock()
  1121  		h.ep.state = StateSynSent
  1122  		e.mu.Unlock()
  1123  
  1124  		if err := h.execute(); err != nil {
  1125  			e.lastErrorMu.Lock()
  1126  			e.lastError = err
  1127  			e.lastErrorMu.Unlock()
  1128  
  1129  			e.mu.Lock()
  1130  			e.stack.Stats().TCP.EstablishedResets.Increment()
  1131  			e.stack.Stats().TCP.CurrentEstablished.Decrement()
  1132  			e.state = StateError
  1133  			e.HardError = err
  1134  
  1135  			// Lock released below.
  1136  			epilogue()
  1137  
  1138  			return err
  1139  		}
  1140  
  1141  		// Transfer handshake state to TCP connection. We disable
  1142  		// receive window scaling if the peer doesn't support it
  1143  		// (indicated by a negative send window scale).
  1144  		e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
  1145  
  1146  		rcvBufSize := seqnum.Size(e.receiveBufferSize())
  1147  		e.rcvListMu.Lock()
  1148  		e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
  1149  		// boot strap the auto tuning algorithm. Starting at zero will
  1150  		// result in a large step function on the first proper causing
  1151  		// the window to just go to a really large value after the first
  1152  		// RTT itself.
  1153  		e.rcvAutoParams.prevCopied = initialRcvWnd
  1154  		e.rcvListMu.Unlock()
  1155  		e.stack.Stats().TCP.CurrentEstablished.Increment()
  1156  		e.mu.Lock()
  1157  		e.state = StateEstablished
  1158  		e.mu.Unlock()
  1159  	}
  1160  
  1161  	e.keepalive.timer.init(&e.keepalive.waker)
  1162  	defer e.keepalive.timer.cleanup()
  1163  
  1164  	// Tell waiters that the endpoint is connected and writable.
  1165  	e.mu.Lock()
  1166  	drained := e.drainDone != nil
  1167  	e.mu.Unlock()
  1168  	if drained {
  1169  		close(e.drainDone)
  1170  		<-e.undrain
  1171  	}
  1172  
  1173  	e.waiterQueue.Notify(waiter.EventOut)
  1174  
  1175  	// Set up the functions that will be called when the main protocol loop
  1176  	// wakes up.
  1177  	funcs := []struct {
  1178  		w *sleep.Waker
  1179  		f func() *tcpip.Error
  1180  	}{
  1181  		{
  1182  			w: &e.sndWaker,
  1183  			f: e.handleWrite,
  1184  		},
  1185  		{
  1186  			w: &e.sndCloseWaker,
  1187  			f: e.handleClose,
  1188  		},
  1189  		{
  1190  			w: &e.newSegmentWaker,
  1191  			f: e.handleSegments,
  1192  		},
  1193  		{
  1194  			w: &closeWaker,
  1195  			f: func() *tcpip.Error {
  1196  				// This means the socket is being closed due
  1197  				// to the TCP_FIN_WAIT2 timeout was hit. Just
  1198  				// mark the socket as closed.
  1199  				e.mu.Lock()
  1200  				e.transitionToStateCloseLocked()
  1201  				e.mu.Unlock()
  1202  				return nil
  1203  			},
  1204  		},
  1205  		{
  1206  			w: &e.snd.resendWaker,
  1207  			f: func() *tcpip.Error {
  1208  				if !e.snd.retransmitTimerExpired() {
  1209  					return tcpip.ErrTimeout
  1210  				}
  1211  				return nil
  1212  			},
  1213  		},
  1214  		{
  1215  			w: &e.keepalive.waker,
  1216  			f: e.keepaliveTimerExpired,
  1217  		},
  1218  		{
  1219  			w: &e.notificationWaker,
  1220  			f: func() *tcpip.Error {
  1221  				n := e.fetchNotifications()
  1222  				if n&notifyNonZeroReceiveWindow != 0 {
  1223  					e.rcv.nonZeroWindow()
  1224  				}
  1225  
  1226  				if n&notifyReceiveWindowChanged != 0 {
  1227  					e.rcv.pendingBufSize = seqnum.Size(e.receiveBufferSize())
  1228  				}
  1229  
  1230  				if n&notifyMTUChanged != 0 {
  1231  					e.sndBufMu.Lock()
  1232  					count := e.packetTooBigCount
  1233  					e.packetTooBigCount = 0
  1234  					mtu := e.sndMTU
  1235  					e.sndBufMu.Unlock()
  1236  
  1237  					e.snd.updateMaxPayloadSize(mtu, count)
  1238  				}
  1239  
  1240  				if n&notifyReset != 0 {
  1241  					e.mu.Lock()
  1242  					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
  1243  					e.mu.Unlock()
  1244  				}
  1245  
  1246  				if n&notifyClose != 0 && closeTimer == nil {
  1247  					e.mu.Lock()
  1248  					if e.state == StateFinWait2 && e.closed {
  1249  						// The socket has been closed and we are in FIN_WAIT2
  1250  						// so start the FIN_WAIT2 timer.
  1251  						closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() {
  1252  							closeWaker.Assert()
  1253  						})
  1254  						e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
  1255  					}
  1256  					e.mu.Unlock()
  1257  				}
  1258  
  1259  				if n&notifyKeepaliveChanged != 0 {
  1260  					// The timer could fire in background
  1261  					// when the endpoint is drained. That's
  1262  					// OK. See above.
  1263  					e.resetKeepaliveTimer(true)
  1264  				}
  1265  
  1266  				if n&notifyDrain != 0 {
  1267  					for !e.segmentQueue.empty() {
  1268  						if err := e.handleSegments(); err != nil {
  1269  							return err
  1270  						}
  1271  					}
  1272  					if e.state != StateClose && e.state != StateError {
  1273  						// Only block the worker if the endpoint
  1274  						// is not in closed state or error state.
  1275  						close(e.drainDone)
  1276  						<-e.undrain
  1277  					}
  1278  				}
  1279  
  1280  				if n&notifyTickleWorker != 0 {
  1281  					// Just a tickle notification. No need to do
  1282  					// anything.
  1283  					return nil
  1284  				}
  1285  
  1286  				return nil
  1287  			},
  1288  		},
  1289  	}
  1290  
  1291  	// Initialize the sleeper based on the wakers in funcs.
  1292  	s := sleep.Sleeper{}
  1293  	for i := range funcs {
  1294  		s.AddWaker(funcs[i].w, i)
  1295  	}
  1296  
  1297  	// The following assertions and notifications are needed for restored
  1298  	// endpoints. Fresh newly created endpoints have empty states and should
  1299  	// not invoke any.
  1300  	e.segmentQueue.mu.Lock()
  1301  	if !e.segmentQueue.list.Empty() {
  1302  		e.newSegmentWaker.Assert()
  1303  	}
  1304  	e.segmentQueue.mu.Unlock()
  1305  
  1306  	e.rcvListMu.Lock()
  1307  	if !e.rcvList.Empty() {
  1308  		e.waiterQueue.Notify(waiter.EventIn)
  1309  	}
  1310  	e.rcvListMu.Unlock()
  1311  
  1312  	e.mu.Lock()
  1313  	if e.workerCleanup {
  1314  		e.notifyProtocolGoroutine(notifyClose)
  1315  	}
  1316  
  1317  	// Main loop. Handle segments until both send and receive ends of the
  1318  	// connection have completed.
  1319  
  1320  	for e.state != StateTimeWait && e.state != StateClose && e.state != StateError {
  1321  		e.mu.Unlock()
  1322  		e.workMu.Unlock()
  1323  		v, _ := s.Fetch(true)
  1324  		e.workMu.Lock()
  1325  		if err := funcs[v].f(); err != nil {
  1326  			e.mu.Lock()
  1327  			// Ensure we release all endpoint registration and route
  1328  			// references as the connection is now in an error
  1329  			// state.
  1330  			e.workerCleanup = true
  1331  			e.resetConnectionLocked(err)
  1332  			// Lock released below.
  1333  			epilogue()
  1334  
  1335  			return nil
  1336  		}
  1337  		e.mu.Lock()
  1338  	}
  1339  
  1340  	state := e.state
  1341  	e.mu.Unlock()
  1342  	var reuseTW func()
  1343  	if state == StateTimeWait {
  1344  		// Disable close timer as we now entering real TIME_WAIT.
  1345  		if closeTimer != nil {
  1346  			closeTimer.Stop()
  1347  		}
  1348  		// Mark the current sleeper done so as to free all associated
  1349  		// wakers.
  1350  		s.Done()
  1351  		// Wake up any waiters before we enter TIME_WAIT.
  1352  		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
  1353  		reuseTW = e.doTimeWait()
  1354  	}
  1355  
  1356  	// Mark endpoint as closed.
  1357  	e.mu.Lock()
  1358  	if e.state != StateError {
  1359  		e.stack.Stats().TCP.EstablishedResets.Increment()
  1360  		e.stack.Stats().TCP.CurrentEstablished.Decrement()
  1361  		e.transitionToStateCloseLocked()
  1362  	}
  1363  
  1364  	// Lock released below.
  1365  	epilogue()
  1366  
  1367  	// epilogue removes the endpoint from the transport-demuxer and
  1368  	// unlocks e.mu. Now that no new segments can get enqueued to this
  1369  	// endpoint, try to re-match the segment to a different endpoint
  1370  	// as the current endpoint is closed.
  1371  	for !e.segmentQueue.empty() {
  1372  		s := e.segmentQueue.dequeue()
  1373  		e.tryDeliverSegmentFromClosedEndpoint(s)
  1374  	}
  1375  
  1376  	// A new SYN was received during TIME_WAIT and we need to abort
  1377  	// the timewait and redirect the segment to the listener queue
  1378  	if reuseTW != nil {
  1379  		reuseTW()
  1380  	}
  1381  
  1382  	return nil
  1383  }
  1384  
  1385  // handleTimeWaitSegments processes segments received during TIME_WAIT
  1386  // state.
  1387  func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
  1388  	checkRequeue := true
  1389  	for i := 0; i < maxSegmentsPerWake; i++ {
  1390  		s := e.segmentQueue.dequeue()
  1391  		if s == nil {
  1392  			checkRequeue = false
  1393  			break
  1394  		}
  1395  		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
  1396  		if newSyn {
  1397  			info := e.EndpointInfo.TransportEndpointInfo
  1398  			newID := info.ID
  1399  			newID.RemoteAddress = ""
  1400  			newID.RemotePort = 0
  1401  			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
  1402  			// If the local address is an IPv4 address then also
  1403  			// look for IPv6 dual stack endpoints that might be
  1404  			// listening on the local address.
  1405  			if newID.LocalAddress.To4() != "" {
  1406  				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
  1407  			}
  1408  			for _, netProto := range netProtos {
  1409  				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, &s.route); listenEP != nil {
  1410  					tcpEP := listenEP.(*endpoint)
  1411  					if EndpointState(tcpEP.State()) == StateListen {
  1412  						reuseTW = func() {
  1413  							tcpEP.enqueueSegment(s)
  1414  						}
  1415  						// We explicitly do not decRef
  1416  						// the segment as it's still
  1417  						// valid and being reflected to
  1418  						// a listening endpoint.
  1419  						return false, reuseTW
  1420  					}
  1421  				}
  1422  			}
  1423  		}
  1424  		if extTW {
  1425  			extendTimeWait = true
  1426  		}
  1427  		s.decRef()
  1428  	}
  1429  	if checkRequeue && !e.segmentQueue.empty() {
  1430  		e.newSegmentWaker.Assert()
  1431  	}
  1432  	return extendTimeWait, nil
  1433  }
  1434  
  1435  // doTimeWait is responsible for handling the TCP behaviour once a socket
  1436  // enters the TIME_WAIT state. Optionally it can return a closure that
  1437  // should be executed after releasing the endpoint registrations. This is
  1438  // done in cases where a new SYN is received during TIME_WAIT that carries
  1439  // a sequence number larger than one see on the connection.
  1440  func (e *endpoint) doTimeWait() (twReuse func()) {
  1441  	// Trigger a 2 * MSL time wait state. During this period
  1442  	// we will drop all incoming segments.
  1443  	// NOTE: On Linux this is not configurable and is fixed at 60 seconds.
  1444  	timeWaitDuration := DefaultTCPTimeWaitTimeout
  1445  
  1446  	// Get the stack wide configuration.
  1447  	var tcpTW tcpip.TCPTimeWaitTimeoutOption
  1448  	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
  1449  		timeWaitDuration = time.Duration(tcpTW)
  1450  	}
  1451  
  1452  	const newSegment = 1
  1453  	const notification = 2
  1454  	const timeWaitDone = 3
  1455  
  1456  	s := sleep.Sleeper{}
  1457  	s.AddWaker(&e.newSegmentWaker, newSegment)
  1458  	s.AddWaker(&e.notificationWaker, notification)
  1459  
  1460  	var timeWaitWaker sleep.Waker
  1461  	s.AddWaker(&timeWaitWaker, timeWaitDone)
  1462  	timeWaitTimer := time.AfterFunc(timeWaitDuration, timeWaitWaker.Assert)
  1463  	defer timeWaitTimer.Stop()
  1464  
  1465  	for {
  1466  		e.workMu.Unlock()
  1467  		v, _ := s.Fetch(true)
  1468  		e.workMu.Lock()
  1469  		switch v {
  1470  		case newSegment:
  1471  			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
  1472  			if reuseTW != nil {
  1473  				return reuseTW
  1474  			}
  1475  			if extendTimeWait {
  1476  				timeWaitTimer.Reset(timeWaitDuration)
  1477  			}
  1478  		case notification:
  1479  			n := e.fetchNotifications()
  1480  			if n&notifyClose != 0 {
  1481  				return nil
  1482  			}
  1483  			if n&notifyDrain != 0 {
  1484  				for !e.segmentQueue.empty() {
  1485  					// Ignore extending TIME_WAIT during a
  1486  					// save. For sockets in TIME_WAIT we just
  1487  					// terminate the TIME_WAIT early.
  1488  					e.handleTimeWaitSegments()
  1489  				}
  1490  				close(e.drainDone)
  1491  				<-e.undrain
  1492  				return nil
  1493  			}
  1494  		case timeWaitDone:
  1495  			return nil
  1496  		}
  1497  	}
  1498  }