github.com/FlowerWrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/transport/tcp/connect.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"sync"
    19  	"time"
    20  
    21  	"github.com/FlowerWrong/netstack/rand"
    22  	"github.com/FlowerWrong/netstack/sleep"
    23  	"github.com/FlowerWrong/netstack/tcpip"
    24  	"github.com/FlowerWrong/netstack/tcpip/buffer"
    25  	"github.com/FlowerWrong/netstack/tcpip/header"
    26  	"github.com/FlowerWrong/netstack/tcpip/seqnum"
    27  	"github.com/FlowerWrong/netstack/tcpip/stack"
    28  	"github.com/FlowerWrong/netstack/waiter"
    29  )
    30  
// maxSegmentsPerWake is the maximum number of segments to process in the main
// protocol goroutine per wake-up. Yielding after this many segments have been
// processed allows other events (e.g., timeouts, resets, etc.) to be handled
// as well.
const maxSegmentsPerWake = 100
    36  
// handshakeState identifies the position of a connection within the TCP 3-way
// handshake.
type handshakeState int

// The following are the possible states of the TCP connection during a 3-way
// handshake. A depiction of the states and transitions can be found in RFC 793,
// page 23.
const (
	// handshakeSynSent is the state selected by resetState; it is also the
	// zero value of handshakeState.
	handshakeSynSent handshakeState = iota
	// handshakeSynRcvd is entered via resetToSynRcvd, or from SYN-SENT when
	// a SYN without an ACK is received.
	handshakeSynRcvd
	// handshakeCompleted is entered once the peer has acknowledged our SYN.
	handshakeCompleted
)
    47  
// The following are used to set up sleepers. Each value is the ID under which
// a waker is registered with a sleep.Sleeper, and therefore the value returned
// by Sleeper.Fetch when that waker fires: endpoint notifications, newly queued
// segments, the SYN resend timer and link address resolution, respectively.
const (
	wakerForNotification = iota
	wakerForNewSegment
	wakerForResend
	wakerForResolution
)
    55  
const (
	// Maximum space available for options. This is also the size, in
	// bytes, of the buffers handed out by optionPool.
	maxOptionSize = 40
)
    60  
// handshake holds the state used during a TCP 3-way handshake.
type handshake struct {
	// ep is the endpoint the handshake is performed for. state is the
	// current position in the handshake state machine. active is true for
	// handshakes created by newHandshake and cleared by resetToSynRcvd.
	// flags holds the TCP flags placed on the SYN segments we send. ackNum
	// is the acknowledgement number we send, i.e. the peer's initial
	// sequence number plus one once its SYN has been seen.
	ep     *endpoint
	state  handshakeState
	active bool
	flags  uint8
	ackNum seqnum.Value

	// iss is the initial send sequence number, as defined in RFC 793.
	iss seqnum.Value

	// rcvWnd is the receive window, as defined in RFC 793.
	rcvWnd seqnum.Size

	// sndWnd is the send window, as defined in RFC 793.
	sndWnd seqnum.Size

	// mss is the maximum segment size received from the peer.
	mss uint16

	// amss is the maximum segment size advertised by us to the peer.
	amss uint16

	// sndWndScale is the send window scale, as defined in RFC 1323. A
	// negative value means no scaling is supported by the peer.
	sndWndScale int

	// rcvWndScale is the receive window scale, as defined in RFC 1323.
	rcvWndScale int
}
    91  
    92  func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
    93  	rcvWndScale := ep.rcvWndScaleForHandshake()
    94  
    95  	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
    96  	// window offered in SYN won't be reduced due to the loss of precision if
    97  	// window scaling is enabled after the handshake.
    98  	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
    99  
   100  	// Ensure we can always accept at least 1 byte if the scale specified
   101  	// was too high for the provided rcvWnd.
   102  	if rcvWnd == 0 {
   103  		rcvWnd = 1
   104  	}
   105  
   106  	h := handshake{
   107  		ep:          ep,
   108  		active:      true,
   109  		rcvWnd:      rcvWnd,
   110  		rcvWndScale: int(rcvWndScale),
   111  	}
   112  	h.resetState()
   113  	return h
   114  }
   115  
   116  // FindWndScale determines the window scale to use for the given maximum window
   117  // size.
   118  func FindWndScale(wnd seqnum.Size) int {
   119  	if wnd < 0x10000 {
   120  		return 0
   121  	}
   122  
   123  	max := seqnum.Size(0xffff)
   124  	s := 0
   125  	for wnd > max && s < header.MaxWndScale {
   126  		s++
   127  		max <<= 1
   128  	}
   129  
   130  	return s
   131  }
   132  
   133  // resetState resets the state of the handshake object such that it becomes
   134  // ready for a new 3-way handshake.
   135  func (h *handshake) resetState() {
   136  	b := make([]byte, 4)
   137  	if _, err := rand.Read(b); err != nil {
   138  		panic(err)
   139  	}
   140  
   141  	h.state = handshakeSynSent
   142  	h.flags = header.TCPFlagSyn
   143  	h.ackNum = 0
   144  	h.mss = 0
   145  	h.iss = seqnum.Value(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24)
   146  }
   147  
   148  // effectiveRcvWndScale returns the effective receive window scale to be used.
   149  // If the peer doesn't support window scaling, the effective rcv wnd scale is
   150  // zero; otherwise it's the value calculated based on the initial rcv wnd.
   151  func (h *handshake) effectiveRcvWndScale() uint8 {
   152  	if h.sndWndScale < 0 {
   153  		return 0
   154  	}
   155  	return uint8(h.rcvWndScale)
   156  }
   157  
   158  // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
   159  // state.
   160  func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
   161  	h.active = false
   162  	h.state = handshakeSynRcvd
   163  	h.flags = header.TCPFlagSyn | header.TCPFlagAck
   164  	h.iss = iss
   165  	h.ackNum = irs + 1
   166  	h.mss = opts.MSS
   167  	h.sndWndScale = opts.WS
   168  	h.ep.mu.Lock()
   169  	h.ep.state = StateSynRecv
   170  	h.ep.mu.Unlock()
   171  }
   172  
   173  // checkAck checks if the ACK number, if present, of a segment received during
   174  // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in
   175  // response.
   176  func (h *handshake) checkAck(s *segment) bool {
   177  	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber != h.iss+1 {
   178  		// RFC 793, page 36, states that a reset must be generated when
   179  		// the connection is in any non-synchronized state and an
   180  		// incoming segment acknowledges something not yet sent. The
   181  		// connection remains in the same state.
   182  		ack := s.sequenceNumber.Add(s.logicalLen())
   183  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0)
   184  		return false
   185  	}
   186  
   187  	return true
   188  }
   189  
   190  // synSentState handles a segment received when the TCP 3-way handshake is in
   191  // the SYN-SENT state.
   192  func (h *handshake) synSentState(s *segment) *tcpip.Error {
   193  	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
   194  	// acceptable if the ack field acknowledges the SYN.
   195  	if s.flagIsSet(header.TCPFlagRst) {
   196  		if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
   197  			return tcpip.ErrConnectionRefused
   198  		}
   199  		return nil
   200  	}
   201  
   202  	if !h.checkAck(s) {
   203  		return nil
   204  	}
   205  
   206  	// We are in the SYN-SENT state. We only care about segments that have
   207  	// the SYN flag.
   208  	if !s.flagIsSet(header.TCPFlagSyn) {
   209  		return nil
   210  	}
   211  
   212  	// Parse the SYN options.
   213  	rcvSynOpts := parseSynSegmentOptions(s)
   214  
   215  	// Remember if the Timestamp option was negotiated.
   216  	h.ep.maybeEnableTimestamp(&rcvSynOpts)
   217  
   218  	// Remember if the SACKPermitted option was negotiated.
   219  	h.ep.maybeEnableSACKPermitted(&rcvSynOpts)
   220  
   221  	// Remember the sequence we'll ack from now on.
   222  	h.ackNum = s.sequenceNumber + 1
   223  	h.flags |= header.TCPFlagAck
   224  	h.mss = rcvSynOpts.MSS
   225  	h.sndWndScale = rcvSynOpts.WS
   226  
   227  	// If this is a SYN ACK response, we only need to acknowledge the SYN
   228  	// and the handshake is completed.
   229  	if s.flagIsSet(header.TCPFlagAck) {
   230  		h.state = handshakeCompleted
   231  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
   232  		return nil
   233  	}
   234  
   235  	// A SYN segment was received, but no ACK in it. We acknowledge the SYN
   236  	// but resend our own SYN and wait for it to be acknowledged in the
   237  	// SYN-RCVD state.
   238  	h.state = handshakeSynRcvd
   239  	h.ep.mu.Lock()
   240  	h.ep.state = StateSynRecv
   241  	ttl := h.ep.ttl
   242  	h.ep.mu.Unlock()
   243  	synOpts := header.TCPSynOptions{
   244  		WS:    int(h.effectiveRcvWndScale()),
   245  		TS:    rcvSynOpts.TS,
   246  		TSVal: h.ep.timestamp(),
   247  		TSEcr: h.ep.recentTS,
   248  
   249  		// We only send SACKPermitted if the other side indicated it
   250  		// permits SACK. This is not explicitly defined in the RFC but
   251  		// this is the behaviour implemented by Linux.
   252  		SACKPermitted: rcvSynOpts.SACKPermitted,
   253  		MSS:           h.ep.amss,
   254  	}
   255  	if ttl == 0 {
   256  		ttl = s.route.DefaultTTL()
   257  	}
   258  	sendSynTCP(&s.route, h.ep.id, ttl, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
   259  
   260  	return nil
   261  }
   262  
   263  // synRcvdState handles a segment received when the TCP 3-way handshake is in
   264  // the SYN-RCVD state.
   265  func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
   266  	if s.flagIsSet(header.TCPFlagRst) {
   267  		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
   268  		// is acceptable if the sequence number is in the window.
   269  		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
   270  			return tcpip.ErrConnectionRefused
   271  		}
   272  		return nil
   273  	}
   274  
   275  	if !h.checkAck(s) {
   276  		return nil
   277  	}
   278  
   279  	if s.flagIsSet(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
   280  		// We received two SYN segments with different sequence
   281  		// numbers, so we reset this and restart the whole
   282  		// process, except that we don't reset the timer.
   283  		ack := s.sequenceNumber.Add(s.logicalLen())
   284  		seq := seqnum.Value(0)
   285  		if s.flagIsSet(header.TCPFlagAck) {
   286  			seq = s.ackNumber
   287  		}
   288  		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)
   289  
   290  		if !h.active {
   291  			return tcpip.ErrInvalidEndpointState
   292  		}
   293  
   294  		h.resetState()
   295  		synOpts := header.TCPSynOptions{
   296  			WS:            h.rcvWndScale,
   297  			TS:            h.ep.sendTSOk,
   298  			TSVal:         h.ep.timestamp(),
   299  			TSEcr:         h.ep.recentTS,
   300  			SACKPermitted: h.ep.sackPermitted,
   301  			MSS:           h.ep.amss,
   302  		}
   303  		sendSynTCP(&s.route, h.ep.id, h.ep.ttl, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
   304  		return nil
   305  	}
   306  
   307  	// We have previously received (and acknowledged) the peer's SYN. If the
   308  	// peer acknowledges our SYN, the handshake is completed.
   309  	if s.flagIsSet(header.TCPFlagAck) {
   310  		// If the timestamp option is negotiated and the segment does
   311  		// not carry a timestamp option then the segment must be dropped
   312  		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
   313  		if h.ep.sendTSOk && !s.parsedOptions.TS {
   314  			h.ep.stack.Stats().DroppedPackets.Increment()
   315  			return nil
   316  		}
   317  
   318  		// Update timestamp if required. See RFC7323, section-4.3.
   319  		if h.ep.sendTSOk && s.parsedOptions.TS {
   320  			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
   321  		}
   322  		h.state = handshakeCompleted
   323  		return nil
   324  	}
   325  
   326  	return nil
   327  }
   328  
   329  func (h *handshake) handleSegment(s *segment) *tcpip.Error {
   330  	h.sndWnd = s.window
   331  	if !s.flagIsSet(header.TCPFlagSyn) && h.sndWndScale > 0 {
   332  		h.sndWnd <<= uint8(h.sndWndScale)
   333  	}
   334  
   335  	switch h.state {
   336  	case handshakeSynRcvd:
   337  		return h.synRcvdState(s)
   338  	case handshakeSynSent:
   339  		return h.synSentState(s)
   340  	}
   341  	return nil
   342  }
   343  
   344  // processSegments goes through the segment queue and processes up to
   345  // maxSegmentsPerWake (if they're available).
   346  func (h *handshake) processSegments() *tcpip.Error {
   347  	for i := 0; i < maxSegmentsPerWake; i++ {
   348  		s := h.ep.segmentQueue.dequeue()
   349  		if s == nil {
   350  			return nil
   351  		}
   352  
   353  		err := h.handleSegment(s)
   354  		s.decRef()
   355  		if err != nil {
   356  			return err
   357  		}
   358  
   359  		// We stop processing packets once the handshake is completed,
   360  		// otherwise we may process packets meant to be processed by
   361  		// the main protocol goroutine.
   362  		if h.state == handshakeCompleted {
   363  			break
   364  		}
   365  	}
   366  
   367  	// If the queue is not empty, make sure we'll wake up in the next
   368  	// iteration.
   369  	if !h.ep.segmentQueue.empty() {
   370  		h.ep.newSegmentWaker.Assert()
   371  	}
   372  
   373  	return nil
   374  }
   375  
   376  func (h *handshake) resolveRoute() *tcpip.Error {
   377  	// Set up the wakers.
   378  	s := sleep.Sleeper{}
   379  	resolutionWaker := &sleep.Waker{}
   380  	s.AddWaker(resolutionWaker, wakerForResolution)
   381  	s.AddWaker(&h.ep.notificationWaker, wakerForNotification)
   382  	defer s.Done()
   383  
   384  	// Initial action is to resolve route.
   385  	index := wakerForResolution
   386  	for {
   387  		switch index {
   388  		case wakerForResolution:
   389  			if _, err := h.ep.route.Resolve(resolutionWaker); err != tcpip.ErrWouldBlock {
   390  				// Either success (err == nil) or failure.
   391  				return err
   392  			}
   393  			// Resolution not completed. Keep trying...
   394  
   395  		case wakerForNotification:
   396  			n := h.ep.fetchNotifications()
   397  			if n&notifyClose != 0 {
   398  				h.ep.route.RemoveWaker(resolutionWaker)
   399  				return tcpip.ErrAborted
   400  			}
   401  			if n&notifyDrain != 0 {
   402  				close(h.ep.drainDone)
   403  				<-h.ep.undrain
   404  			}
   405  		}
   406  
   407  		// Wait for notification.
   408  		index, _ = s.Fetch(true)
   409  	}
   410  }
   411  
// execute executes the TCP 3-way handshake.
//
// It resolves the route first if that is required, sends the initial SYN
// segment (carrying h.flags, so a SYN-ACK when the handshake was reset to
// SYN-RCVD) and then loops on resend timeouts, endpoint notifications and
// newly queued segments until the handshake is completed.
//
// It returns nil once h.state is handshakeCompleted, the error from route
// resolution or segment handling, tcpip.ErrTimeout when the doubled resend
// timeout exceeds 60 seconds, or tcpip.ErrAborted on a close notification.
func (h *handshake) execute() *tcpip.Error {
	if h.ep.route.IsResolutionRequired() {
		if err := h.resolveRoute(); err != nil {
			return err
		}
	}

	// Initialize the resend timer. It starts at one second and asserts
	// resendWaker when it fires.
	resendWaker := sleep.Waker{}
	timeOut := time.Duration(time.Second)
	rt := time.AfterFunc(timeOut, func() {
		resendWaker.Assert()
	})
	defer rt.Stop()

	// Set up the wakers.
	s := sleep.Sleeper{}
	s.AddWaker(&resendWaker, wakerForResend)
	s.AddWaker(&h.ep.notificationWaker, wakerForNotification)
	s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment)
	defer s.Done()

	var sackEnabled SACKEnabled
	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
		// If stack returned an error when checking for SACKEnabled
		// status then just default to switching off SACK negotiation.
		sackEnabled = false
	}

	// Send the initial SYN segment and loop until the handshake is
	// completed.
	h.ep.amss = mssForRoute(&h.ep.route)

	synOpts := header.TCPSynOptions{
		WS:            h.rcvWndScale,
		TS:            true,
		TSVal:         h.ep.timestamp(),
		TSEcr:         h.ep.recentTS,
		SACKPermitted: bool(sackEnabled),
		MSS:           h.ep.amss,
	}

	// Execute is also called in a listen context so we want to make sure we
	// only send the TS/SACK option when we received the TS/SACK in the
	// initial SYN.
	if h.state == handshakeSynRcvd {
		synOpts.TS = h.ep.sendTSOk
		synOpts.SACKPermitted = h.ep.sackPermitted && bool(sackEnabled)
		if h.sndWndScale < 0 {
			// Disable window scaling if the peer did not send us
			// the window scaling option.
			synOpts.WS = -1
		}
	}
	sendSynTCP(&h.ep.route, h.ep.id, h.ep.ttl, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
	for h.state != handshakeCompleted {
		switch index, _ := s.Fetch(true); index {
		case wakerForResend:
			// Exponential backoff: double the timeout on every
			// resend and give up once it exceeds 60 seconds.
			timeOut *= 2
			if timeOut > 60*time.Second {
				return tcpip.ErrTimeout
			}
			rt.Reset(timeOut)
			sendSynTCP(&h.ep.route, h.ep.id, h.ep.ttl, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)

		case wakerForNotification:
			n := h.ep.fetchNotifications()
			if n&notifyClose != 0 {
				return tcpip.ErrAborted
			}
			if n&notifyDrain != 0 {
				// Process everything that is already queued
				// before signalling that the drain is done.
				// Note that s below shadows the sleeper and
				// refers to the dequeued segment.
				for !h.ep.segmentQueue.empty() {
					s := h.ep.segmentQueue.dequeue()
					err := h.handleSegment(s)
					s.decRef()
					if err != nil {
						return err
					}
					if h.state == handshakeCompleted {
						return nil
					}
				}
				// Signal the drain and block until undrain
				// is signalled.
				close(h.ep.drainDone)
				<-h.ep.undrain
			}

		case wakerForNewSegment:
			if err := h.processSegments(); err != nil {
				return err
			}
		}
	}

	return nil
}
   508  
   509  func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
   510  	synOpts := header.ParseSynOptions(s.options, s.flagIsSet(header.TCPFlagAck))
   511  	if synOpts.TS {
   512  		s.parsedOptions.TSVal = synOpts.TSVal
   513  		s.parsedOptions.TSEcr = synOpts.TSEcr
   514  	}
   515  	return synOpts
   516  }
   517  
// optionPool is a pool of byte slices of length maxOptionSize used as scratch
// space for encoding TCP options. Buffers are obtained with getOptions and
// handed back with putOptions.
var optionPool = sync.Pool{
	New: func() interface{} {
		return make([]byte, maxOptionSize)
	},
}
   523  
   524  func getOptions() []byte {
   525  	return optionPool.Get().([]byte)
   526  }
   527  
   528  func putOptions(options []byte) {
   529  	// Reslice to full capacity.
   530  	optionPool.Put(options[0:cap(options)])
   531  }
   532  
   533  func makeSynOptions(opts header.TCPSynOptions) []byte {
   534  	// Emulate linux option order. This is as follows:
   535  	//
   536  	// if md5: NOP NOP MD5SIG 18 md5sig(16)
   537  	// if mss: MSS 4 mss(2)
   538  	// if ts and sack_advertise:
   539  	//	SACK 2 TIMESTAMP 2 timestamp(8)
   540  	// elif ts: NOP NOP TIMESTAMP 10 timestamp(8)
   541  	// elif sack: NOP NOP SACK 2
   542  	// if wscale: NOP WINDOW 3 ws(1)
   543  	// if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8))
   544  	//	[for each block] start_seq(4) end_seq(4)
   545  	// if fastopen_cookie:
   546  	//	if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2)
   547  	// 	else: FASTOPEN (2 + len(cookie))
   548  	//	cookie(variable) [padding to four bytes]
   549  	//
   550  	options := getOptions()
   551  
   552  	// Always encode the mss.
   553  	offset := header.EncodeMSSOption(uint32(opts.MSS), options)
   554  
   555  	// Special ordering is required here. If both TS and SACK are enabled,
   556  	// then the SACK option precedes TS, with no padding. If they are
   557  	// enabled individually, then we see padding before the option.
   558  	if opts.TS && opts.SACKPermitted {
   559  		offset += header.EncodeSACKPermittedOption(options[offset:])
   560  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   561  	} else if opts.TS {
   562  		offset += header.EncodeNOP(options[offset:])
   563  		offset += header.EncodeNOP(options[offset:])
   564  		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
   565  	} else if opts.SACKPermitted {
   566  		offset += header.EncodeNOP(options[offset:])
   567  		offset += header.EncodeNOP(options[offset:])
   568  		offset += header.EncodeSACKPermittedOption(options[offset:])
   569  	}
   570  
   571  	// Initialize the WS option.
   572  	if opts.WS >= 0 {
   573  		offset += header.EncodeNOP(options[offset:])
   574  		offset += header.EncodeWSOption(opts.WS, options[offset:])
   575  	}
   576  
   577  	// Padding to the end; note that this never apply unless we add a
   578  	// fastopen option, we always expect the offset to remain the same.
   579  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   580  		panic("unexpected option encoding")
   581  	}
   582  
   583  	return options[:offset]
   584  }
   585  
   586  func sendSynTCP(r *stack.Route, id stack.TransportEndpointID, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts header.TCPSynOptions) *tcpip.Error {
   587  	options := makeSynOptions(opts)
   588  	err := sendTCP(r, id, buffer.VectorisedView{}, ttl, flags, seq, ack, rcvWnd, options, nil)
   589  	putOptions(options)
   590  	return err
   591  }
   592  
   593  // sendTCP sends a TCP segment with the provided options via the provided
   594  // network endpoint and under the provided identity.
   595  func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
   596  	optLen := len(opts)
   597  	// Allocate a buffer for the TCP header.
   598  	hdr := buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen)
   599  
   600  	if rcvWnd > 0xffff {
   601  		rcvWnd = 0xffff
   602  	}
   603  
   604  	// Initialize the header.
   605  	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
   606  	tcp.Encode(&header.TCPFields{
   607  		SrcPort:    id.LocalPort,
   608  		DstPort:    id.RemotePort,
   609  		SeqNum:     uint32(seq),
   610  		AckNum:     uint32(ack),
   611  		DataOffset: uint8(header.TCPMinimumSize + optLen),
   612  		Flags:      flags,
   613  		WindowSize: uint16(rcvWnd),
   614  	})
   615  	copy(tcp[header.TCPMinimumSize:], opts)
   616  
   617  	length := uint16(hdr.UsedLength() + data.Size())
   618  	xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
   619  	// Only calculate the checksum if offloading isn't supported.
   620  	if gso != nil && gso.NeedsCsum {
   621  		// This is called CHECKSUM_PARTIAL in the Linux kernel. We
   622  		// calculate a checksum of the pseudo-header and save it in the
   623  		// TCP header, then the kernel calculate a checksum of the
   624  		// header and data and get the right sum of the TCP packet.
   625  		tcp.SetChecksum(xsum)
   626  	} else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 {
   627  		xsum = header.ChecksumVV(data, xsum)
   628  		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
   629  	}
   630  
   631  	r.Stats().TCP.SegmentsSent.Increment()
   632  	if (flags & header.TCPFlagRst) != 0 {
   633  		r.Stats().TCP.ResetsSent.Increment()
   634  	}
   635  
   636  	return r.WritePacket(gso, hdr, data, ProtocolNumber, ttl, ttl == 0 /* useDefaultTTL */)
   637  }
   638  
   639  // makeOptions makes an options slice.
   640  func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
   641  	options := getOptions()
   642  	offset := 0
   643  
   644  	// N.B. the ordering here matches the ordering used by Linux internally
   645  	// and described in the raw makeOptions function. We don't include
   646  	// unnecessary cases here (post connection.)
   647  	if e.sendTSOk {
   648  		// Embed the timestamp if timestamp has been enabled.
   649  		//
   650  		// We only use the lower 32 bits of the unix time in
   651  		// milliseconds. This is similar to what Linux does where it
   652  		// uses the lower 32 bits of the jiffies value in the tsVal
   653  		// field of the timestamp option.
   654  		//
   655  		// Further, RFC7323 section-5.4 recommends millisecond
   656  		// resolution as the lowest recommended resolution for the
   657  		// timestamp clock.
   658  		//
   659  		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
   660  		offset += header.EncodeNOP(options[offset:])
   661  		offset += header.EncodeNOP(options[offset:])
   662  		offset += header.EncodeTSOption(e.timestamp(), uint32(e.recentTS), options[offset:])
   663  	}
   664  	if e.sackPermitted && len(sackBlocks) > 0 {
   665  		offset += header.EncodeNOP(options[offset:])
   666  		offset += header.EncodeNOP(options[offset:])
   667  		offset += header.EncodeSACKBlocks(sackBlocks, options[offset:])
   668  	}
   669  
   670  	// We expect the above to produce an aligned offset.
   671  	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
   672  		panic("unexpected option encoding")
   673  	}
   674  
   675  	return options[:offset]
   676  }
   677  
   678  // sendRaw sends a TCP segment to the endpoint's peer.
   679  func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
   680  	var sackBlocks []header.SACKBlock
   681  	if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
   682  		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
   683  	}
   684  	options := e.makeOptions(sackBlocks)
   685  	err := sendTCP(&e.route, e.id, data, e.ttl, flags, seq, ack, rcvWnd, options, e.gso)
   686  	putOptions(options)
   687  	return err
   688  }
   689  
   690  func (e *endpoint) handleWrite() *tcpip.Error {
   691  	// Move packets from send queue to send list. The queue is accessible
   692  	// from other goroutines and protected by the send mutex, while the send
   693  	// list is only accessible from the handler goroutine, so it needs no
   694  	// mutexes.
   695  	e.sndBufMu.Lock()
   696  
   697  	first := e.sndQueue.Front()
   698  	if first != nil {
   699  		e.snd.writeList.PushBackList(&e.sndQueue)
   700  		e.snd.sndNxtList.UpdateForward(e.sndBufInQueue)
   701  		e.sndBufInQueue = 0
   702  	}
   703  
   704  	e.sndBufMu.Unlock()
   705  
   706  	// Initialize the next segment to write if it's currently nil.
   707  	if e.snd.writeNext == nil {
   708  		e.snd.writeNext = first
   709  	}
   710  
   711  	// Push out any new packets.
   712  	e.snd.sendData()
   713  
   714  	return nil
   715  }
   716  
   717  func (e *endpoint) handleClose() *tcpip.Error {
   718  	// Drain the send queue.
   719  	e.handleWrite()
   720  
   721  	// Mark send side as closed.
   722  	e.snd.closed = true
   723  
   724  	return nil
   725  }
   726  
   727  // resetConnectionLocked puts the endpoint in an error state with the given
   728  // error code and sends a RST if and only if the error is not ErrConnectionReset
   729  // indicating that the connection is being reset due to receiving a RST. This
   730  // method must only be called from the protocol goroutine.
   731  func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
   732  	// Only send a reset if the connection is being aborted for a reason
   733  	// other than receiving a reset.
   734  	e.state = StateError
   735  	e.hardError = err
   736  	if err != tcpip.ErrConnectionReset {
   737  		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, e.snd.sndUna, e.rcv.rcvNxt, 0)
   738  	}
   739  }
   740  
   741  // completeWorkerLocked is called by the worker goroutine when it's about to
   742  // exit. It marks the worker as completed and performs cleanup work if requested
   743  // by Close().
   744  func (e *endpoint) completeWorkerLocked() {
   745  	e.workerRunning = false
   746  	if e.workerCleanup {
   747  		e.cleanupLocked()
   748  	}
   749  }
   750  
   751  // handleSegments pulls segments from the queue and processes them. It returns
   752  // no error if the protocol loop should continue, an error otherwise.
   753  func (e *endpoint) handleSegments() *tcpip.Error {
   754  	checkRequeue := true
   755  	for i := 0; i < maxSegmentsPerWake; i++ {
   756  		s := e.segmentQueue.dequeue()
   757  		if s == nil {
   758  			checkRequeue = false
   759  			break
   760  		}
   761  
   762  		// Invoke the tcp probe if installed.
   763  		if e.probe != nil {
   764  			e.probe(e.completeState())
   765  		}
   766  
   767  		if s.flagIsSet(header.TCPFlagRst) {
   768  			if e.rcv.acceptable(s.sequenceNumber, 0) {
   769  				// RFC 793, page 37 states that "in all states
   770  				// except SYN-SENT, all reset (RST) segments are
   771  				// validated by checking their SEQ-fields." So
   772  				// we only process it if it's acceptable.
   773  				s.decRef()
   774  				return tcpip.ErrConnectionReset
   775  			}
   776  		} else if s.flagIsSet(header.TCPFlagAck) {
   777  			// Patch the window size in the segment according to the
   778  			// send window scale.
   779  			s.window <<= e.snd.sndWndScale
   780  
   781  			// RFC 793, page 41 states that "once in the ESTABLISHED
   782  			// state all segments must carry current acknowledgment
   783  			// information."
   784  			e.rcv.handleRcvdSegment(s)
   785  			e.snd.handleRcvdSegment(s)
   786  		}
   787  		s.decRef()
   788  	}
   789  
   790  	// If the queue is not empty, make sure we'll wake up in the next
   791  	// iteration.
   792  	if checkRequeue && !e.segmentQueue.empty() {
   793  		e.newSegmentWaker.Assert()
   794  	}
   795  
   796  	// Send an ACK for all processed packets if needed.
   797  	if e.rcv.rcvNxt != e.snd.maxSentAck {
   798  		e.snd.sendAck()
   799  	}
   800  
   801  	e.resetKeepaliveTimer(true)
   802  
   803  	return nil
   804  }
   805  
   806  // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
   807  // keepalive packets periodically when the connection is idle. If we don't hear
   808  // from the other side after a number of tries, we terminate the connection.
   809  func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
   810  	e.keepalive.Lock()
   811  	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
   812  		e.keepalive.Unlock()
   813  		return nil
   814  	}
   815  
   816  	if e.keepalive.unacked >= e.keepalive.count {
   817  		e.keepalive.Unlock()
   818  		return tcpip.ErrTimeout
   819  	}
   820  
   821  	// RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with
   822  	// seg.seq = snd.nxt-1.
   823  	e.keepalive.unacked++
   824  	e.keepalive.Unlock()
   825  	e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.sndNxt-1)
   826  	e.resetKeepaliveTimer(false)
   827  	return nil
   828  }
   829  
   830  // resetKeepaliveTimer restarts or stops the keepalive timer, depending on
   831  // whether it is enabled for this endpoint.
   832  func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
   833  	e.keepalive.Lock()
   834  	defer e.keepalive.Unlock()
   835  	if receivedData {
   836  		e.keepalive.unacked = 0
   837  	}
   838  	// Start the keepalive timer IFF it's enabled and there is no pending
   839  	// data to send.
   840  	if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
   841  		e.keepalive.timer.disable()
   842  		return
   843  	}
   844  	if e.keepalive.unacked > 0 {
   845  		e.keepalive.timer.enable(e.keepalive.interval)
   846  	} else {
   847  		e.keepalive.timer.enable(e.keepalive.idle)
   848  	}
   849  }
   850  
   851  // disableKeepaliveTimer stops the keepalive timer.
   852  func (e *endpoint) disableKeepaliveTimer() {
   853  	e.keepalive.Lock()
   854  	e.keepalive.timer.disable()
   855  	e.keepalive.Unlock()
   856  }
   857  
// protocolMainLoop is the main loop of the TCP protocol. It runs in its own
// goroutine and is responsible for sending segments and handling received
// segments.
//
// When handshake is true the endpoint first performs the active 3-way
// handshake and builds its sender and receiver from the result; otherwise the
// loop starts directly in the established state using the existing e.snd and
// e.rcv.
//
// The returned error is non-nil only when the handshake fails. If one of the
// event handlers fails while the loop is running, the connection is reset with
// that error and nil is returned.
//
// NOTE(review): the loop releases e.workMu before blocking and re-acquires it
// afterwards, which implies e.workMu is held when the loop is entered —
// confirm against the callers.
func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
	// closeTimer is armed once a close notification arrives; when it fires
	// it asserts closeWaker, which makes the loop abort the connection.
	var closeTimer *time.Timer
	var closeWaker sleep.Waker

	// epilogue performs the common teardown on every exit path: it cleans
	// up timers, completes the worker, signals a pending drain, releases
	// e.mu and finally notifies all waiters.
	epilogue := func() {
		// e.mu is expected to be held upon entering this section; it is
		// released below.

		if e.snd != nil {
			e.snd.resendTimer.cleanup()
		}

		if closeTimer != nil {
			closeTimer.Stop()
		}

		e.completeWorkerLocked()

		// Unblock anyone waiting for a drain to complete.
		if e.drainDone != nil {
			close(e.drainDone)
		}

		e.mu.Unlock()

		// When the protocol loop exits we should wake up our waiters.
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
	}

	if handshake {
		// This is an active connection, so we must initiate the 3-way
		// handshake, and then inform potential waiters about its
		// completion.
		initialRcvWnd := e.initialReceiveWindow()
		h := newHandshake(e, seqnum.Size(initialRcvWnd))
		e.mu.Lock()
		h.ep.state = StateSynSent
		e.mu.Unlock()

		if err := h.execute(); err != nil {
			// Record the failure both as the last error and as the
			// endpoint's hard error, and move to the error state.
			e.lastErrorMu.Lock()
			e.lastError = err
			e.lastErrorMu.Unlock()

			e.mu.Lock()
			e.state = StateError
			e.hardError = err

			// Lock released below.
			epilogue()

			return err
		}

		// Transfer handshake state to TCP connection. We disable
		// receive window scaling if the peer doesn't support it
		// (indicated by a negative send window scale).
		e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)

		rcvBufSize := seqnum.Size(e.receiveBufferSize())
		e.rcvListMu.Lock()
		e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
		// Bootstrap the auto-tuning algorithm. Starting at zero would
		// result in a large step on the first adjustment, causing the
		// window to jump to a really large value right after the first
		// RTT.
		e.rcvAutoParams.prevCopied = initialRcvWnd
		e.rcvListMu.Unlock()
	}

	// The keepalive timer signals expiry through e.keepalive.waker, which
	// is one of the wakers serviced by the loop below.
	e.keepalive.timer.init(&e.keepalive.waker)
	defer e.keepalive.timer.cleanup()

	// Tell waiters that the endpoint is connected and writable.
	e.mu.Lock()
	e.state = StateEstablished
	drained := e.drainDone != nil
	e.mu.Unlock()
	if drained {
		// A drain was requested before the loop started: acknowledge it
		// and block until the undrain signal arrives.
		close(e.drainDone)
		<-e.undrain
	}

	e.waiterQueue.Notify(waiter.EventOut)

	// Set up the functions that will be called when the main protocol loop
	// wakes up. Each entry pairs a waker with the handler to run when that
	// waker is the one fetched; a non-nil error from a handler terminates
	// the loop.
	funcs := []struct {
		w *sleep.Waker
		f func() *tcpip.Error
	}{
		{
			// Data is available to be written.
			w: &e.sndWaker,
			f: e.handleWrite,
		},
		{
			// The send side is being closed.
			w: &e.sndCloseWaker,
			f: e.handleClose,
		},
		{
			// New segments were queued for processing.
			w: &e.newSegmentWaker,
			f: e.handleSegments,
		},
		{
			// Asserted by closeTimer; aborts the connection.
			w: &closeWaker,
			f: func() *tcpip.Error {
				return tcpip.ErrConnectionAborted
			},
		},
		{
			// Retransmit timer. A false result from
			// retransmitTimerExpired is treated as a connection
			// timeout.
			w: &e.snd.resendWaker,
			f: func() *tcpip.Error {
				if !e.snd.retransmitTimerExpired() {
					return tcpip.ErrTimeout
				}
				return nil
			},
		},
		{
			// Keepalive timer.
			w: &e.keepalive.waker,
			f: e.keepaliveTimerExpired,
		},
		{
			// Out-of-band notifications posted to the protocol
			// goroutine; n is a bit mask of notify* flags and each
			// set flag is handled in turn.
			w: &e.notificationWaker,
			f: func() *tcpip.Error {
				n := e.fetchNotifications()
				if n&notifyNonZeroReceiveWindow != 0 {
					e.rcv.nonZeroWindow()
				}

				if n&notifyReceiveWindowChanged != 0 {
					e.rcv.pendingBufSize = seqnum.Size(e.receiveBufferSize())
				}

				if n&notifyMTUChanged != 0 {
					// Snapshot and reset the counter under
					// sndBufMu, then update the sender
					// outside the lock.
					e.sndBufMu.Lock()
					count := e.packetTooBigCount
					e.packetTooBigCount = 0
					mtu := e.sndMTU
					e.sndBufMu.Unlock()

					e.snd.updateMaxPayloadSize(mtu, count)
				}

				if n&notifyReset != 0 {
					e.mu.Lock()
					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
					e.mu.Unlock()
				}
				if n&notifyClose != 0 && closeTimer == nil {
					// Reset the connection 3 seconds after
					// the endpoint has been closed.
					//
					// The timer could fire in background
					// when the endpoint is drained. That's
					// OK as the loop here will not honor
					// the firing until the undrain arrives.
					closeTimer = time.AfterFunc(3*time.Second, func() {
						closeWaker.Assert()
					})
				}

				if n&notifyKeepaliveChanged != 0 {
					// The timer could fire in background
					// when the endpoint is drained. That's
					// OK. See above.
					e.resetKeepaliveTimer(true)
				}

				if n&notifyDrain != 0 {
					// Process everything that is already
					// queued before acknowledging the drain.
					for !e.segmentQueue.empty() {
						if err := e.handleSegments(); err != nil {
							return err
						}
					}
					// Unless the endpoint is in the error
					// state, signal that the drain is done
					// and block until the undrain signal
					// arrives.
					if e.state != StateError {
						close(e.drainDone)
						<-e.undrain
					}
				}

				return nil
			},
		},
	}

	// Initialize the sleeper based on the wakers in funcs. The index of
	// each entry is used as the waker's id so that the value returned by
	// Fetch can be used to look up the handler.
	s := sleep.Sleeper{}
	for i := range funcs {
		s.AddWaker(funcs[i].w, i)
	}

	// The following assertions and notifications are needed for restored
	// endpoints. Fresh newly created endpoints have empty states and should
	// not invoke any.
	e.segmentQueue.mu.Lock()
	if !e.segmentQueue.list.Empty() {
		e.newSegmentWaker.Assert()
	}
	e.segmentQueue.mu.Unlock()

	e.rcvListMu.Lock()
	if !e.rcvList.Empty() {
		e.waiterQueue.Notify(waiter.EventIn)
	}
	e.rcvListMu.Unlock()

	e.mu.RLock()
	if e.workerCleanup {
		e.notifyProtocolGoroutine(notifyClose)
	}
	e.mu.RUnlock()

	// Main loop. Handle segments until both send and receive ends of the
	// connection have completed and sndUna has caught up with sndNxtList.
	for !e.rcv.closed || !e.snd.closed || e.snd.sndUna != e.snd.sndNxtList {
		// Release workMu while waiting for the next waker and take it
		// again before running the corresponding handler.
		e.workMu.Unlock()
		v, _ := s.Fetch(true)
		e.workMu.Lock()
		if err := funcs[v].f(); err != nil {
			e.mu.Lock()
			// Ensure we release all endpoint registration and route
			// references as the connection is now in an error
			// state.
			e.workerCleanup = true
			e.resetConnectionLocked(err)
			// Lock released below.
			epilogue()

			return nil
		}
	}

	// Mark endpoint as closed.
	e.mu.Lock()
	if e.state != StateError {
		e.state = StateClose
	}
	// Lock released below.
	epilogue()

	return nil
}