github.com/polevpn/netstack@v1.10.9/tcpip/transport/tcp/rcv.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"container/heap"
    19  	"time"
    20  
    21  	"github.com/polevpn/netstack/tcpip"
    22  	"github.com/polevpn/netstack/tcpip/header"
    23  	"github.com/polevpn/netstack/tcpip/seqnum"
    24  )
    25  
// receiver holds the state necessary to receive TCP segments and turn them
// into a stream of bytes.
//
// +stateify savable
type receiver struct {
	// ep is the endpoint this receiver delivers data to and reads
	// connection state from.
	ep *endpoint

	// rcvNxt is the sequence number the receiver expects to consume next;
	// it is advanced every time a segment is consumed.
	rcvNxt seqnum.Value

	// rcvAcc is one beyond the last acceptable sequence number. That is,
	// the "largest" sequence value that the receiver has announced to
	// its peer that it's willing to accept. This may be different than
	// rcvNxt + rcvWnd if the receive window is reduced; in that case we
	// have to reduce the window as we receive more data instead of
	// shrinking it.
	rcvAcc seqnum.Value

	// rcvWnd is the non-scaled receive window last advertised to the peer.
	rcvWnd seqnum.Size

	// rcvWndScale is the right shift applied to rcvWnd before the window
	// is handed to the sender (see getSendParams).
	rcvWndScale uint8

	// closed is set once a FIN from the peer has been consumed; no more
	// data is delivered to readers after that.
	closed bool

	// pendingRcvdSegments holds segments that arrived but could not be
	// consumed yet because an earlier segment is still missing.
	// pendingBufUsed tracks the sum of logicalLen() of the segments queued
	// there, and pendingBufSize is the limit checked before another
	// segment is queued.
	pendingRcvdSegments segmentHeap
	pendingBufUsed      seqnum.Size
	pendingBufSize      seqnum.Size
}
    54  
    55  func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
    56  	return &receiver{
    57  		ep:             ep,
    58  		rcvNxt:         irs + 1,
    59  		rcvAcc:         irs.Add(rcvWnd + 1),
    60  		rcvWnd:         rcvWnd,
    61  		rcvWndScale:    rcvWndScale,
    62  		pendingBufSize: pendingBufSize,
    63  	}
    64  }
    65  
    66  // acceptable checks if the segment sequence number range is acceptable
    67  // according to the table on page 26 of RFC 793.
    68  func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
    69  	rcvWnd := r.rcvNxt.Size(r.rcvAcc)
    70  	if rcvWnd == 0 {
    71  		return segLen == 0 && segSeq == r.rcvNxt
    72  	}
    73  
    74  	return segSeq.InWindow(r.rcvNxt, rcvWnd) ||
    75  		seqnum.Overlap(r.rcvNxt, rcvWnd, segSeq, segLen)
    76  }
    77  
    78  // getSendParams returns the parameters needed by the sender when building
    79  // segments to send.
    80  func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
    81  	// Calculate the window size based on the available buffer space.
    82  	receiveBufferAvailable := r.ep.receiveBufferAvailable()
    83  	acc := r.rcvNxt.Add(seqnum.Size(receiveBufferAvailable))
    84  	if r.rcvAcc.LessThan(acc) {
    85  		r.rcvAcc = acc
    86  	}
    87  	// Stash away the non-scaled receive window as we use it for measuring
    88  	// receiver's estimated RTT.
    89  	r.rcvWnd = r.rcvNxt.Size(r.rcvAcc)
    90  	return r.rcvNxt, r.rcvWnd >> r.rcvWndScale
    91  }
    92  
    93  // nonZeroWindow is called when the receive window grows from zero to nonzero;
    94  // in such cases we may need to send an ack to indicate to our peer that it can
    95  // resume sending data.
    96  func (r *receiver) nonZeroWindow() {
    97  	if (r.rcvAcc-r.rcvNxt)>>r.rcvWndScale != 0 {
    98  		// We never got around to announcing a zero window size, so we
    99  		// don't need to immediately announce a nonzero one.
   100  		return
   101  	}
   102  
   103  	// Immediately send an ack.
   104  	r.ep.snd.sendAck()
   105  }
   106  
   107  // consumeSegment attempts to consume a segment that was received by r. The
   108  // segment may have just been received or may have been received earlier but
   109  // wasn't ready to be consumed then.
   110  //
   111  // Returns true if the segment was consumed, false if it cannot be consumed
   112  // yet because of a missing segment.
   113  func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum.Size) bool {
   114  	if segLen > 0 {
   115  		// If the segment doesn't include the seqnum we're expecting to
   116  		// consume now, we're missing a segment. We cannot proceed until
   117  		// we receive that segment though.
   118  		if !r.rcvNxt.InWindow(segSeq, segLen) {
   119  			return false
   120  		}
   121  
   122  		// Trim segment to eliminate already acknowledged data.
   123  		if segSeq.LessThan(r.rcvNxt) {
   124  			diff := segSeq.Size(r.rcvNxt)
   125  			segLen -= diff
   126  			segSeq.UpdateForward(diff)
   127  			s.sequenceNumber.UpdateForward(diff)
   128  			s.data.TrimFront(int(diff))
   129  		}
   130  
   131  		// Move segment to ready-to-deliver list. Wakeup any waiters.
   132  		r.ep.readyToRead(s)
   133  
   134  	} else if segSeq != r.rcvNxt {
   135  		return false
   136  	}
   137  
   138  	// Update the segment that we're expecting to consume.
   139  	r.rcvNxt = segSeq.Add(segLen)
   140  
   141  	// In cases of a misbehaving sender which could send more than the
   142  	// advertised window, we could end up in a situation where we get a
   143  	// segment that exceeds the window advertised. Instead of partially
   144  	// accepting the segment and discarding bytes beyond the advertised
   145  	// window, we accept the whole segment and make sure r.rcvAcc is moved
   146  	// forward to match r.rcvNxt to indicate that the window is now closed.
   147  	//
   148  	// In absence of this check the r.acceptable() check fails and accepts
   149  	// segments that should be dropped because rcvWnd is calculated as
   150  	// the size of the interval (rcvNxt, rcvAcc] which becomes extremely
   151  	// large if rcvAcc is ever less than rcvNxt.
   152  	if r.rcvAcc.LessThan(r.rcvNxt) {
   153  		r.rcvAcc = r.rcvNxt
   154  	}
   155  
   156  	// Trim SACK Blocks to remove any SACK information that covers
   157  	// sequence numbers that have been consumed.
   158  	TrimSACKBlockList(&r.ep.sack, r.rcvNxt)
   159  
   160  	// Handle FIN or FIN-ACK.
   161  	if s.flagIsSet(header.TCPFlagFin) {
   162  		r.rcvNxt++
   163  
   164  		// Send ACK immediately.
   165  		r.ep.snd.sendAck()
   166  
   167  		// Tell any readers that no more data will come.
   168  		r.closed = true
   169  		r.ep.readyToRead(nil)
   170  
   171  		// We just received a FIN, our next state depends on whether we sent a
   172  		// FIN already or not.
   173  		r.ep.mu.Lock()
   174  		switch r.ep.state {
   175  		case StateEstablished:
   176  			r.ep.state = StateCloseWait
   177  		case StateFinWait1:
   178  			if s.flagIsSet(header.TCPFlagAck) {
   179  				// FIN-ACK, transition to TIME-WAIT.
   180  				r.ep.state = StateTimeWait
   181  			} else {
   182  				// Simultaneous close, expecting a final ACK.
   183  				r.ep.state = StateClosing
   184  			}
   185  		case StateFinWait2:
   186  			r.ep.state = StateTimeWait
   187  		}
   188  		r.ep.mu.Unlock()
   189  
   190  		// Flush out any pending segments, except the very first one if
   191  		// it happens to be the one we're handling now because the
   192  		// caller is using it.
   193  		first := 0
   194  		if len(r.pendingRcvdSegments) != 0 && r.pendingRcvdSegments[0] == s {
   195  			first = 1
   196  		}
   197  
   198  		for i := first; i < len(r.pendingRcvdSegments); i++ {
   199  			r.pendingRcvdSegments[i].decRef()
   200  		}
   201  		r.pendingRcvdSegments = r.pendingRcvdSegments[:first]
   202  
   203  		return true
   204  	}
   205  
   206  	// Handle ACK (not FIN-ACK, which we handled above) during one of the
   207  	// shutdown states.
   208  	if s.flagIsSet(header.TCPFlagAck) {
   209  		r.ep.mu.Lock()
   210  		switch r.ep.state {
   211  		case StateFinWait1:
   212  			r.ep.state = StateFinWait2
   213  			// Notify protocol goroutine that we have received an
   214  			// ACK to our FIN so that it can start the FIN_WAIT2
   215  			// timer to abort connection if the other side does
   216  			// not close within 2MSL.
   217  			r.ep.notifyProtocolGoroutine(notifyClose)
   218  		case StateClosing:
   219  			r.ep.state = StateTimeWait
   220  		case StateLastAck:
   221  			r.ep.transitionToStateCloseLocked()
   222  		}
   223  		r.ep.mu.Unlock()
   224  	}
   225  
   226  	return true
   227  }
   228  
   229  // updateRTT updates the receiver RTT measurement based on the sequence number
   230  // of the received segment.
   231  func (r *receiver) updateRTT() {
   232  	// From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf
   233  	//
   234  	// A system that is only transmitting acknowledgements can still
   235  	// estimate the round-trip time by observing the time between when a byte
   236  	// is first acknowledged and the receipt of data that is at least one
   237  	// window beyond the sequence number that was acknowledged.
   238  	r.ep.rcvListMu.Lock()
   239  	if r.ep.rcvAutoParams.rttMeasureTime.IsZero() {
   240  		// New measurement.
   241  		r.ep.rcvAutoParams.rttMeasureTime = time.Now()
   242  		r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
   243  		r.ep.rcvListMu.Unlock()
   244  		return
   245  	}
   246  	if r.rcvNxt.LessThan(r.ep.rcvAutoParams.rttMeasureSeqNumber) {
   247  		r.ep.rcvListMu.Unlock()
   248  		return
   249  	}
   250  	rtt := time.Since(r.ep.rcvAutoParams.rttMeasureTime)
   251  	// We only store the minimum observed RTT here as this is only used in
   252  	// absence of a SRTT available from either timestamps or a sender
   253  	// measurement of RTT.
   254  	if r.ep.rcvAutoParams.rtt == 0 || rtt < r.ep.rcvAutoParams.rtt {
   255  		r.ep.rcvAutoParams.rtt = rtt
   256  	}
   257  	r.ep.rcvAutoParams.rttMeasureTime = time.Now()
   258  	r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
   259  	r.ep.rcvListMu.Unlock()
   260  }
   261  
// handleRcvdSegmentClosing performs the additional checks that apply to a
// received segment when the endpoint is not in StateEstablished. It is called
// by handleRcvdSegment with the endpoint state and closed flag it sampled.
//
// It returns drop == true when the segment must not be processed any further.
// err is tcpip.ErrConnectionAborted when the segment carries data that is not
// permitted in the current state; the in-code comments describe this as the
// case that should trigger a RST.
//
// NOTE(review): the drop paths inside the switch call s.decRef() before
// returning, whereas the final rcvClosed drop at the end of this function
// (and the !acceptable drop in handleRcvdSegment) return drop == true without
// releasing a reference. The caller is not visible from this file; confirm
// which side owns the reference to rule out a double release or a leak.
func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, closed bool) (drop bool, err *tcpip.Error) {
	// The receive side counts as closed if either the endpoint recorded it
	// (rcvClosed) or this receiver has already consumed a FIN.
	r.ep.rcvListMu.Lock()
	rcvClosed := r.ep.rcvClosed || r.closed
	r.ep.rcvListMu.Unlock()

	// If we are in one of the shutdown states then we need to do
	// additional checks before we try and process the segment.
	switch state {
	case StateCloseWait, StateClosing, StateLastAck:
		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
			s.decRef()
			// Just drop the segment as we have
			// already received a FIN and this
			// segment is after the sequence number
			// for the FIN.
			return true, nil
		}
		fallthrough
	case StateFinWait1:
		fallthrough
	case StateFinWait2:
		// If we are closed for reads (either due to an
		// incoming FIN or the user calling shutdown(..,
		// SHUT_RD)) then any data past the rcvNxt should
		// trigger a RST.
		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
		if rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
			s.decRef()
			return true, tcpip.ErrConnectionAborted
		}
		// The remaining checks do not apply in FIN-WAIT1.
		if state == StateFinWait1 {
			break
		}

		// If it's a retransmission of an old data segment
		// or a pure ACK then allow it.
		if s.sequenceNumber.Add(s.logicalLen()).LessThanEq(r.rcvNxt) ||
			s.logicalLen() == 0 {
			break
		}

		// In FIN-WAIT2 if the socket is fully
		// closed (not owned by the application on our
		// end) then the only acceptable segment is a
		// FIN. Since FIN can technically also carry
		// data we verify that the segment carrying a
		// FIN ends at exactly r.rcvNxt+1.
		//
		// From RFC793 page 25.
		//
		// For sequence number purposes, the SYN is
		// considered to occur before the first actual
		// data octet of the segment in which it occurs,
		// while the FIN is considered to occur after
		// the last actual data octet in a segment in
		// which it occurs.
		if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) {
			s.decRef()
			return true, tcpip.ErrConnectionAborted
		}
	}

	// We don't care about receive processing anymore if the receive side
	// is closed.
	//
	// NOTE: We still want to permit a FIN as it's possible only our
	// end has closed and the peer is yet to send a FIN. Hence we
	// compare only the payload.
	segEnd := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
	if rcvClosed && !segEnd.LessThanEq(r.rcvNxt) {
		return true, nil
	}
	return false, nil
}
   336  
   337  // handleRcvdSegment handles TCP segments directed at the connection managed by
   338  // r as they arrive. It is called by the protocol main loop.
   339  func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
   340  	r.ep.mu.RLock()
   341  	state := r.ep.state
   342  	closed := r.ep.closed
   343  	r.ep.mu.RUnlock()
   344  
   345  	if state != StateEstablished {
   346  		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
   347  		if drop || err != nil {
   348  			return drop, err
   349  		}
   350  	}
   351  
   352  	segLen := seqnum.Size(s.data.Size())
   353  	segSeq := s.sequenceNumber
   354  
   355  	// If the sequence number range is outside the acceptable range, just
   356  	// send an ACK and stop further processing of the segment.
   357  	// This is according to RFC 793, page 68.
   358  	if !r.acceptable(segSeq, segLen) {
   359  		r.ep.snd.sendAck()
   360  		return true, nil
   361  	}
   362  
   363  	// Defer segment processing if it can't be consumed now.
   364  	if !r.consumeSegment(s, segSeq, segLen) {
   365  		if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
   366  			// We only store the segment if it's within our buffer
   367  			// size limit.
   368  			if r.pendingBufUsed < r.pendingBufSize {
   369  				r.pendingBufUsed += s.logicalLen()
   370  				s.incRef()
   371  				heap.Push(&r.pendingRcvdSegments, s)
   372  				UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.rcvNxt)
   373  			}
   374  
   375  			// Immediately send an ack so that the peer knows it may
   376  			// have to retransmit.
   377  			r.ep.snd.sendAck()
   378  		}
   379  		return false, nil
   380  	}
   381  
   382  	// Since we consumed a segment update the receiver's RTT estimate
   383  	// if required.
   384  	if segLen > 0 {
   385  		r.updateRTT()
   386  	}
   387  
   388  	// By consuming the current segment, we may have filled a gap in the
   389  	// sequence number domain that allows pending segments to be consumed
   390  	// now. So try to do it.
   391  	for !r.closed && r.pendingRcvdSegments.Len() > 0 {
   392  		s := r.pendingRcvdSegments[0]
   393  		segLen := seqnum.Size(s.data.Size())
   394  		segSeq := s.sequenceNumber
   395  
   396  		// Skip segment altogether if it has already been acknowledged.
   397  		if !segSeq.Add(segLen-1).LessThan(r.rcvNxt) &&
   398  			!r.consumeSegment(s, segSeq, segLen) {
   399  			break
   400  		}
   401  
   402  		heap.Pop(&r.pendingRcvdSegments)
   403  		r.pendingBufUsed -= s.logicalLen()
   404  		s.decRef()
   405  	}
   406  	return false, nil
   407  }
   408  
   409  // handleTimeWaitSegment handles inbound segments received when the endpoint
   410  // has entered the TIME_WAIT state.
   411  func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn bool) {
   412  	segSeq := s.sequenceNumber
   413  	segLen := seqnum.Size(s.data.Size())
   414  
   415  	// Just silently drop any RST packets in TIME_WAIT. We do not support
   416  	// TIME_WAIT assasination as a result we confirm w/ fix 1 as described
   417  	// in https://tools.ietf.org/html/rfc1337#section-3.
   418  	if s.flagIsSet(header.TCPFlagRst) {
   419  		return false, false
   420  	}
   421  
   422  	// If it's a SYN and the sequence number is higher than any seen before
   423  	// for this connection then try and redirect it to a listening endpoint
   424  	// if available.
   425  	//
   426  	// RFC 1122:
   427  	//   "When a connection is [...] on TIME-WAIT state [...]
   428  	//   [a TCP] MAY accept a new SYN from the remote TCP to
   429  	//   reopen the connection directly, if it:
   430  
   431  	//    (1) assigns its initial sequence number for the new
   432  	//     connection to be larger than the largest sequence
   433  	//     number it used on the previous connection incarnation,
   434  	//     and
   435  
   436  	//    (2) returns to TIME-WAIT state if the SYN turns out
   437  	//      to be an old duplicate".
   438  	if s.flagIsSet(header.TCPFlagSyn) && r.rcvNxt.LessThan(segSeq) {
   439  
   440  		return false, true
   441  	}
   442  
   443  	// Drop the segment if it does not contain an ACK.
   444  	if !s.flagIsSet(header.TCPFlagAck) {
   445  		return false, false
   446  	}
   447  
   448  	// Update Timestamp if required. See RFC7323, section-4.3.
   449  	if r.ep.sendTSOk && s.parsedOptions.TS {
   450  		r.ep.updateRecentTimestamp(s.parsedOptions.TSVal, r.ep.snd.maxSentAck, segSeq)
   451  	}
   452  
   453  	if segSeq.Add(1) == r.rcvNxt && s.flagIsSet(header.TCPFlagFin) {
   454  		// If it's a FIN-ACK then resetTimeWait and send an ACK, as it
   455  		// indicates our final ACK could have been lost.
   456  		r.ep.snd.sendAck()
   457  		return true, false
   458  	}
   459  
   460  	// If the sequence number range is outside the acceptable range or
   461  	// carries data then just send an ACK. This is according to RFC 793,
   462  	// page 37.
   463  	//
   464  	// NOTE: In TIME_WAIT the only acceptable sequence number is rcvNxt.
   465  	if segSeq != r.rcvNxt || segLen != 0 {
   466  		r.ep.snd.sendAck()
   467  	}
   468  	return false, false
   469  }