github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/transport/tcp/rcv.go

github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/transport/tcp/rcv.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"container/heap"
    19  	"math"
    20  
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/seqnum"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack"
    25  )
    26  
// receiver holds the state necessary to receive TCP segments and turn them
// into a stream of bytes.
//
// +stateify savable
type receiver struct {
	// TCPReceiverState is embedded; the methods in this file read and update
	// its RcvNxt, RcvAcc, RcvWndScale and PendingBufUsed fields.
	stack.TCPReceiverState

	// ep is the endpoint this receiver belongs to.
	ep *endpoint

	// rcvWnd is the non-scaled receive window last advertised to the peer.
	rcvWnd seqnum.Size

	// rcvWUP is the RcvNxt value at the last window update sent.
	rcvWUP seqnum.Value

	// prevBufUsed is the snapshot of the endpoint's receive buffer usage (as
	// reported by receiveBufferUsed) taken when we advertise a receive
	// window.
	prevBufUsed int

	// closed is set once a segment carrying a FIN has been consumed, i.e. no
	// more data will arrive from the peer.
	closed bool

	// pendingRcvdSegments holds segments that were received but could not be
	// consumed yet. It is bounded by the receive buffer size of the
	// endpoint.
	pendingRcvdSegments segmentHeap

	// lastRcvdAckTime is initialised when the receiver is created and
	// refreshed by handleRcvdSegment for every segment that passes its
	// acceptability checks.
	lastRcvdAckTime tcpip.MonotonicTime
}
    54  
    55  func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8) *receiver {
    56  	return &receiver{
    57  		ep: ep,
    58  		TCPReceiverState: stack.TCPReceiverState{
    59  			RcvNxt:      irs + 1,
    60  			RcvAcc:      irs.Add(rcvWnd + 1),
    61  			RcvWndScale: rcvWndScale,
    62  		},
    63  		rcvWnd:          rcvWnd,
    64  		rcvWUP:          irs + 1,
    65  		lastRcvdAckTime: ep.stack.Clock().NowMonotonic(),
    66  	}
    67  }
    68  
    69  // acceptable checks if the segment sequence number range is acceptable
    70  // according to the table on page 26 of RFC 793.
    71  func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
    72  	// r.rcvWnd could be much larger than the window size we advertised in our
    73  	// outgoing packets, we should use what we have advertised for acceptability
    74  	// test.
    75  	scaledWindowSize := r.rcvWnd >> r.RcvWndScale
    76  	if scaledWindowSize > math.MaxUint16 {
    77  		// This is what we actually put in the Window field.
    78  		scaledWindowSize = math.MaxUint16
    79  	}
    80  	advertisedWindowSize := scaledWindowSize << r.RcvWndScale
    81  	return header.Acceptable(segSeq, segLen, r.RcvNxt, r.RcvNxt.Add(advertisedWindowSize))
    82  }
    83  
    84  // currentWindow returns the available space in the window that was advertised
    85  // last to our peer.
    86  func (r *receiver) currentWindow() (curWnd seqnum.Size) {
    87  	endOfWnd := r.rcvWUP.Add(r.rcvWnd)
    88  	if endOfWnd.LessThan(r.RcvNxt) {
    89  		// return 0 if r.RcvNxt is past the end of the previously advertised window.
    90  		// This can happen because we accept a large segment completely even if
    91  		// accepting it causes it to partially exceed the advertised window.
    92  		return 0
    93  	}
    94  	return r.RcvNxt.Size(endOfWnd)
    95  }
    96  
// getSendParams returns the parameters needed by the sender when building
// segments to send.
//
// The first result is the next sequence number expected from the peer. The
// second result is the receive window to advertise, already shifted right by
// RcvWndScale and capped at math.MaxUint16.
//
// As a side effect it updates r.rcvWnd, r.rcvWUP and r.prevBufUsed, and moves
// r.RcvAcc forward when the right edge of the window grows.
// +checklocks:r.ep.mu
func (r *receiver) getSendParams() (RcvNxt seqnum.Value, rcvWnd seqnum.Size) {
	newWnd := r.ep.selectWindow()
	curWnd := r.currentWindow()
	unackLen := int(r.ep.snd.MaxSentAck.Size(r.RcvNxt))
	bufUsed := r.ep.receiveBufferUsed()

	// Grow the right edge of the window only for payloads larger than the
	// segment overhead OR if the application is actively consuming data.
	//
	// Avoiding growing the right edge otherwise addresses the situation below:
	// An application has been slow in reading data and we have a burst of
	// incoming segments with lengths < segment overhead. Here, our available
	// free memory would reduce drastically when compared to the advertised
	// receive window.
	//
	// For example: With incoming 512 bytes segments, segment overhead of
	// 552 bytes (at the time of writing this comment), with receive window
	// starting from 1MB and with rcvAdvWndScale being 1, buffer would reach 0
	// when the curWnd is still 19436 bytes, because for every incoming segment
	// newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1),
	// while curWnd would reduce by 512 bytes.
	// Such a situation causes us to keep tail dropping the incoming segments
	// and never advertise zero receive window to the peer.
	//
	// Linux does a similar check for minimal sk_buff size (128):
	// https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783
	//
	// Also, if the application is reading the data, we keep growing the right
	// edge, as we are still advertising a window that we think can be serviced.
	toGrow := unackLen >= SegOverheadSize || bufUsed <= r.prevBufUsed

	// Update RcvAcc only if new window is > previously advertised window. We
	// should never shrink the acceptable sequence space once it has been
	// advertised to the peer. If we shrink the acceptable sequence space then
	// we would end up dropping bytes that might already be in flight.
	// ====================================================  sequence space.
	// ^             ^               ^                   ^
	// rcvWUP       RcvNxt         RcvAcc          new RcvAcc
	//               <=====curWnd ===>
	//               <========= newWnd > curWnd ========= >
	if r.RcvNxt.Add(curWnd).LessThan(r.RcvNxt.Add(newWnd)) && toGrow {
		// If the new window moves the right edge, then update RcvAcc.
		r.RcvAcc = r.RcvNxt.Add(newWnd)
	} else {
		if newWnd == 0 {
			// newWnd is zero but we can't advertise a zero as it would cause window
			// to shrink so just increment a metric to record this event.
			r.ep.stats.ReceiveErrors.WantZeroRcvWindow.Increment()
		}
		// Keep advertising what is left of the previously advertised window.
		newWnd = curWnd
	}

	// Apply silly-window avoidance when recovering from zero-window situation.
	// Keep advertising zero receive window up until the new window reaches a
	// threshold.
	if r.rcvWnd == 0 && newWnd != 0 {
		r.ep.rcvQueueMu.Lock()
		if crossed, above := r.ep.windowCrossedACKThresholdLocked(int(newWnd), int(r.ep.ops.GetReceiveBufferSize())); !crossed && !above {
			newWnd = 0
		}
		r.ep.rcvQueueMu.Unlock()
	}

	// Stash away the non-scaled receive window as we use it for measuring
	// receiver's estimated RTT.
	r.rcvWnd = newWnd
	r.rcvWUP = r.RcvNxt
	r.prevBufUsed = bufUsed
	scaledWnd := r.rcvWnd >> r.RcvWndScale
	if scaledWnd == 0 {
		// Increment a metric if we are advertising an actual zero window.
		r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
	}

	// If we started off with a window larger than what can be held in
	// the 16bit window field, we cap the value at the max value.
	if scaledWnd > math.MaxUint16 {
		scaledWnd = seqnum.Size(math.MaxUint16)

		// Ensure that the stashed receive window always reflects what
		// is being advertised.
		r.rcvWnd = scaledWnd << r.RcvWndScale
	}
	return r.RcvNxt, scaledWnd
}
   185  
// nonZeroWindow is called when the receive window grows from zero to nonzero;
// in such cases we may need to send an ack to indicate to our peer that it can
// resume sending data.
//
// The current implementation unconditionally asks the sender to emit an ACK.
// +checklocks:r.ep.mu
// +checklocksalias:r.ep.snd.ep.mu=r.ep.mu
func (r *receiver) nonZeroWindow() {
	// Immediately send an ack.
	r.ep.snd.sendAck()
}
   195  
// consumeSegment attempts to consume a segment that was received by r. The
// segment may have just been received or may have been received earlier but
// wasn't ready to be consumed then.
//
// segSeq and segLen describe the payload range of s as seen by the caller.
// When part of that range precedes RcvNxt, the already-received prefix is
// trimmed from s (including s.sequenceNumber) before it is delivered.
//
// On success RcvNxt is advanced past the segment, RcvAcc and the SACK block
// list are adjusted, and FIN/ACK driven endpoint state transitions are
// applied.
//
// Returns true if the segment was consumed, false if it cannot be consumed
// yet because of a missing segment.
// +checklocks:r.ep.mu
// +checklocksalias:r.ep.snd.ep.mu=r.ep.mu
func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum.Size) bool {
	if segLen > 0 {
		// If the segment doesn't include the seqnum we're expecting to
		// consume now, we're missing a segment. We cannot proceed until
		// we receive that segment though.
		if !r.RcvNxt.InWindow(segSeq, segLen) {
			return false
		}

		// Trim segment to eliminate already acknowledged data.
		if segSeq.LessThan(r.RcvNxt) {
			diff := segSeq.Size(r.RcvNxt)
			segLen -= diff
			segSeq.UpdateForward(diff)
			s.sequenceNumber.UpdateForward(diff)
			s.TrimFront(diff)
		}

		// Move segment to ready-to-deliver list. Wakeup any waiters.
		r.ep.readyToRead(s)

	} else if segSeq != r.RcvNxt {
		// A zero-length segment is only consumable when it sits exactly at
		// RcvNxt.
		return false
	}

	// Update the segment that we're expecting to consume.
	r.RcvNxt = segSeq.Add(segLen)

	// In cases of a misbehaving sender which could send more than the
	// advertised window, we could end up in a situation where we get a
	// segment that exceeds the window advertised. Instead of partially
	// accepting the segment and discarding bytes beyond the advertised
	// window, we accept the whole segment and make sure r.RcvAcc is moved
	// forward to match r.RcvNxt to indicate that the window is now closed.
	//
	// In absence of this check the r.acceptable() check fails and accepts
	// segments that should be dropped because rcvWnd is calculated as
	// the size of the interval (RcvNxt, RcvAcc] which becomes extremely
	// large if RcvAcc is ever less than RcvNxt.
	if r.RcvAcc.LessThan(r.RcvNxt) {
		r.RcvAcc = r.RcvNxt
	}

	// Trim SACK Blocks to remove any SACK information that covers
	// sequence numbers that have been consumed.
	TrimSACKBlockList(&r.ep.sack, r.RcvNxt)

	// Handle FIN or FIN-ACK.
	if s.flags.Contains(header.TCPFlagFin) {
		// The FIN itself occupies one sequence number.
		r.RcvNxt++

		// Send ACK immediately.
		r.ep.snd.sendAck()

		// Tell any readers that no more data will come.
		r.closed = true
		r.ep.readyToRead(nil)

		// We just received a FIN, our next state depends on whether we sent a
		// FIN already or not.
		switch r.ep.EndpointState() {
		case StateEstablished:
			r.ep.setEndpointState(StateCloseWait)
		case StateFinWait1:
			if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == r.ep.snd.SndNxt {
				// FIN-ACK, transition to TIME-WAIT.
				r.ep.setEndpointState(StateTimeWait)
			} else {
				// Simultaneous close, expecting a final ACK.
				r.ep.setEndpointState(StateClosing)
			}
		case StateFinWait2:
			r.ep.setEndpointState(StateTimeWait)
		}

		// Flush out any pending segments, except the very first one if
		// it happens to be the one we're handling now because the
		// caller is using it.
		first := 0
		if len(r.pendingRcvdSegments) != 0 && r.pendingRcvdSegments[0] == s {
			first = 1
		}

		for i := first; i < len(r.pendingRcvdSegments); i++ {
			r.PendingBufUsed -= r.pendingRcvdSegments[i].segMemSize()
			r.pendingRcvdSegments[i].DecRef()
			// Note that slice truncation does not allow garbage
			// collection of truncated items, thus truncated items
			// must be set to nil to avoid memory leaks.
			r.pendingRcvdSegments[i] = nil
		}
		r.pendingRcvdSegments = r.pendingRcvdSegments[:first]
		r.ep.updateConnDirectionState(connDirectionStateRcvClosed)

		return true
	}

	// Handle ACK (not FIN-ACK, which we handled above) during one of the
	// shutdown states.
	if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == r.ep.snd.SndNxt {
		switch r.ep.EndpointState() {
		case StateFinWait1:
			r.ep.setEndpointState(StateFinWait2)
			if e := r.ep; e.closed {
				// The socket has been closed and we are in
				// FIN-WAIT-2 so start the FIN-WAIT-2 timer.
				e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired)
			}

		case StateClosing:
			r.ep.setEndpointState(StateTimeWait)
		case StateLastAck:
			r.ep.transitionToStateCloseLocked()
		}
	}

	return true
}
   322  
   323  // updateRTT updates the receiver RTT measurement based on the sequence number
   324  // of the received segment.
   325  func (r *receiver) updateRTT() {
   326  	// From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf
   327  	//
   328  	// A system that is only transmitting acknowledgements can still
   329  	// estimate the round-trip time by observing the time between when a byte
   330  	// is first acknowledged and the receipt of data that is at least one
   331  	// window beyond the sequence number that was acknowledged.
   332  	r.ep.rcvQueueMu.Lock()
   333  	if r.ep.RcvAutoParams.RTTMeasureTime == (tcpip.MonotonicTime{}) {
   334  		// New measurement.
   335  		r.ep.RcvAutoParams.RTTMeasureTime = r.ep.stack.Clock().NowMonotonic()
   336  		r.ep.RcvAutoParams.RTTMeasureSeqNumber = r.RcvNxt.Add(r.rcvWnd)
   337  		r.ep.rcvQueueMu.Unlock()
   338  		return
   339  	}
   340  	if r.RcvNxt.LessThan(r.ep.RcvAutoParams.RTTMeasureSeqNumber) {
   341  		r.ep.rcvQueueMu.Unlock()
   342  		return
   343  	}
   344  	rtt := r.ep.stack.Clock().NowMonotonic().Sub(r.ep.RcvAutoParams.RTTMeasureTime)
   345  	// We only store the minimum observed RTT here as this is only used in
   346  	// absence of a SRTT available from either timestamps or a sender
   347  	// measurement of RTT.
   348  	if r.ep.RcvAutoParams.RTT == 0 || rtt < r.ep.RcvAutoParams.RTT {
   349  		r.ep.RcvAutoParams.RTT = rtt
   350  	}
   351  	r.ep.RcvAutoParams.RTTMeasureTime = r.ep.stack.Clock().NowMonotonic()
   352  	r.ep.RcvAutoParams.RTTMeasureSeqNumber = r.RcvNxt.Add(r.rcvWnd)
   353  	r.ep.rcvQueueMu.Unlock()
   354  }
   355  
// handleRcvdSegmentClosing performs the additional checks that apply to an
// incoming segment s when the endpoint is not in the ESTABLISHED state.
//
// state is the endpoint state and closed the endpoint's closed flag, both as
// sampled by the caller.
//
// It returns drop == true when s must not be processed any further. err is
// non-nil (ErrConnectionAborted) when the segment carries data that can no
// longer be accepted; in that case drop is also true.
// +checklocks:r.ep.mu
// +checklocksalias:r.ep.snd.ep.mu=r.ep.mu
func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, closed bool) (drop bool, err tcpip.Error) {
	r.ep.rcvQueueMu.Lock()
	rcvClosed := r.ep.RcvClosed || r.closed
	r.ep.rcvQueueMu.Unlock()

	// If we are in one of the shutdown states then we need to do
	// additional checks before we try and process the segment.
	switch state {
	case StateCloseWait, StateClosing, StateLastAck:
		if !s.sequenceNumber.LessThanEq(r.RcvNxt) {
			// Just drop the segment as we have
			// already received a FIN and this
			// segment is after the sequence number
			// for the FIN.
			return true, nil
		}
		fallthrough
	case StateFinWait1, StateFinWait2:
		// If the ACK acks something not yet sent then we send an ACK.
		//
		// RFC793, page 37: If the connection is in a synchronized state,
		// (ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK,
		// TIME-WAIT), any unacceptable segment (out of window sequence number
		// or unacceptable acknowledgment number) must elicit only an empty
		// acknowledgment segment containing the current send-sequence number
		// and an acknowledgment indicating the next sequence number expected
		// to be received, and the connection remains in the same state.
		//
		// Just as on Linux, we do not apply this behavior when state is
		// ESTABLISHED.
		// Linux receive processing for all states except ESTABLISHED and
		// TIME_WAIT is here where if the ACK check fails, we attempt to
		// reply back with an ACK with correct seq/ack numbers.
		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L6186
		// The ESTABLISHED state processing is here where if the ACK check
		// fails, we ignore the packet:
		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591
		if r.ep.snd.SndNxt.LessThan(s.ackNumber) {
			r.ep.snd.maybeSendOutOfWindowAck(s)
			return true, nil
		}

		// If we are closed for reads (either due to an incoming FIN or
		// the user calling shutdown(.., SHUT_RD)) then any data past
		// RcvNxt should trigger a RST.
		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.payloadSize()))
		if state != StateCloseWait && rcvClosed && r.RcvNxt.LessThan(endDataSeq) {
			return true, &tcpip.ErrConnectionAborted{}
		}
		// In FIN-WAIT-1 none of the remaining checks in this case apply;
		// continue with the receive-closed check after the switch.
		if state == StateFinWait1 {
			break
		}

		// If it's a retransmission of an old data segment
		// or a pure ACK then allow it.
		if s.sequenceNumber.Add(s.logicalLen()).LessThanEq(r.RcvNxt) ||
			s.logicalLen() == 0 {
			break
		}

		// In FIN-WAIT2 if the socket is fully closed (not owned by the
		// application on our end) then the only acceptable segment is
		// a FIN. Since FIN can technically also carry data we verify
		// that the segment carrying a FIN ends at exactly r.RcvNxt+1.
		//
		// From RFC793 page 25.
		//
		// For sequence number purposes, the SYN is
		// considered to occur before the first actual
		// data octet of the segment in which it occurs,
		// while the FIN is considered to occur after
		// the last actual data octet in a segment in
		// which it occurs.
		if closed && (!s.flags.Contains(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.RcvNxt+1) {
			return true, &tcpip.ErrConnectionAborted{}
		}
	}

	// We don't care about receive processing anymore if the receive side
	// is closed.
	//
	// NOTE: We still want to permit a FIN as it's possible only our
	// end has closed and the peer is yet to send a FIN. Hence we
	// compare only the payload.
	segEnd := s.sequenceNumber.Add(seqnum.Size(s.payloadSize()))
	if rcvClosed && !segEnd.LessThanEq(r.RcvNxt) {
		return true, nil
	}
	return false, nil
}
   451  
   452  // handleRcvdSegment handles TCP segments directed at the connection managed by
   453  // r as they arrive. It is called by the protocol main loop.
   454  // +checklocks:r.ep.mu
   455  // +checklocksalias:r.ep.snd.ep.mu=r.ep.mu
   456  func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err tcpip.Error) {
   457  	state := r.ep.EndpointState()
   458  	closed := r.ep.closed
   459  
   460  	segLen := seqnum.Size(s.payloadSize())
   461  	segSeq := s.sequenceNumber
   462  
   463  	// If the sequence number range is outside the acceptable range, just
   464  	// send an ACK and stop further processing of the segment.
   465  	// This is according to RFC 793, page 68.
   466  	if !r.acceptable(segSeq, segLen) {
   467  		r.ep.snd.maybeSendOutOfWindowAck(s)
   468  		return true, nil
   469  	}
   470  
   471  	if state != StateEstablished {
   472  		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
   473  		if drop || err != nil {
   474  			return drop, err
   475  		}
   476  	}
   477  
   478  	// Store the time of the last ack.
   479  	r.lastRcvdAckTime = r.ep.stack.Clock().NowMonotonic()
   480  
   481  	// Defer segment processing if it can't be consumed now.
   482  	if !r.consumeSegment(s, segSeq, segLen) {
   483  		if segLen > 0 || s.flags.Contains(header.TCPFlagFin) {
   484  			// We only store the segment if it's within our buffer
   485  			// size limit.
   486  			//
   487  			// Only use 75% of the receive buffer queue for
   488  			// out-of-order segments. This ensures that we always
   489  			// leave some space for the inorder segments to arrive
   490  			// allowing pending segments to be processed and
   491  			// delivered to the user.
   492  			//
   493  			// The ratio must be at least 50% (the size of rwnd) to
   494  			// leave space for retransmitted dropped packets. 51%
   495  			// would make recovery slow when there are multiple
   496  			// drops by necessitating multiple round trips. 100%
   497  			// would enable the buffer to be totally full of
   498  			// out-of-order data and stall the connection.
   499  			//
   500  			// An ideal solution is to ensure that there are at
   501  			// least N bytes free when N bytes are missing, but we
   502  			// don't have that computed at this point in the stack.
   503  			if rcvBufSize := r.ep.ops.GetReceiveBufferSize(); rcvBufSize > 0 && (r.PendingBufUsed+int(segLen)) < int(rcvBufSize-rcvBufSize/4) {
   504  				r.ep.rcvQueueMu.Lock()
   505  				r.PendingBufUsed += s.segMemSize()
   506  				r.ep.rcvQueueMu.Unlock()
   507  				s.IncRef()
   508  				heap.Push(&r.pendingRcvdSegments, s)
   509  				UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.RcvNxt)
   510  			}
   511  
   512  			// Immediately send an ack so that the peer knows it may
   513  			// have to retransmit.
   514  			r.ep.snd.sendAck()
   515  		}
   516  		return false, nil
   517  	}
   518  
   519  	// Since we consumed a segment update the receiver's RTT estimate
   520  	// if required.
   521  	if segLen > 0 {
   522  		r.updateRTT()
   523  	}
   524  
   525  	// By consuming the current segment, we may have filled a gap in the
   526  	// sequence number domain that allows pending segments to be consumed
   527  	// now. So try to do it.
   528  	for !r.closed && r.pendingRcvdSegments.Len() > 0 {
   529  		s := r.pendingRcvdSegments[0]
   530  		segLen := seqnum.Size(s.payloadSize())
   531  		segSeq := s.sequenceNumber
   532  
   533  		// Skip segment altogether if it has already been acknowledged.
   534  		if !segSeq.Add(segLen-1).LessThan(r.RcvNxt) &&
   535  			!r.consumeSegment(s, segSeq, segLen) {
   536  			break
   537  		}
   538  
   539  		heap.Pop(&r.pendingRcvdSegments)
   540  		r.ep.rcvQueueMu.Lock()
   541  		r.PendingBufUsed -= s.segMemSize()
   542  		r.ep.rcvQueueMu.Unlock()
   543  		s.DecRef()
   544  	}
   545  	return false, nil
   546  }
   547  
   548  // handleTimeWaitSegment handles inbound segments received when the endpoint
   549  // has entered the TIME_WAIT state.
   550  // +checklocks:r.ep.mu
   551  // +checklocksalias:r.ep.snd.ep.mu=r.ep.mu
   552  func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn bool) {
   553  	segSeq := s.sequenceNumber
   554  	segLen := seqnum.Size(s.payloadSize())
   555  
   556  	// Just silently drop any RST packets in TIME_WAIT. We do not support
   557  	// TIME_WAIT assasination as a result we confirm w/ fix 1 as described
   558  	// in https://tools.ietf.org/html/rfc1337#section-3.
   559  	//
   560  	// This behavior overrides RFC793 page 70 where we transition to CLOSED
   561  	// on receiving RST, which is also default Linux behavior.
   562  	// On Linux the RST can be ignored by setting sysctl net.ipv4.tcp_rfc1337.
   563  	//
   564  	// As we do not yet support PAWS, we are being conservative in ignoring
   565  	// RSTs by default.
   566  	if s.flags.Contains(header.TCPFlagRst) {
   567  		return false, false
   568  	}
   569  
   570  	// If it's a SYN and the sequence number is higher than any seen before
   571  	// for this connection then try and redirect it to a listening endpoint
   572  	// if available.
   573  	//
   574  	// RFC 1122:
   575  	//   "When a connection is [...] on TIME-WAIT state [...]
   576  	//   [a TCP] MAY accept a new SYN from the remote TCP to
   577  	//   reopen the connection directly, if it:
   578  
   579  	//    (1) assigns its initial sequence number for the new
   580  	//     connection to be larger than the largest sequence
   581  	//     number it used on the previous connection incarnation,
   582  	//     and
   583  
   584  	//    (2) returns to TIME-WAIT state if the SYN turns out
   585  	//      to be an old duplicate".
   586  	if s.flags.Contains(header.TCPFlagSyn) && r.RcvNxt.LessThan(segSeq) {
   587  		return false, true
   588  	}
   589  
   590  	// Drop the segment if it does not contain an ACK.
   591  	if !s.flags.Contains(header.TCPFlagAck) {
   592  		return false, false
   593  	}
   594  
   595  	// Update Timestamp if required. See RFC7323, section-4.3.
   596  	if r.ep.SendTSOk && s.parsedOptions.TS {
   597  		r.ep.updateRecentTimestamp(s.parsedOptions.TSVal, r.ep.snd.MaxSentAck, segSeq)
   598  	}
   599  
   600  	if segSeq.Add(1) == r.RcvNxt && s.flags.Contains(header.TCPFlagFin) {
   601  		// If it's a FIN-ACK then resetTimeWait and send an ACK, as it
   602  		// indicates our final ACK could have been lost.
   603  		r.ep.snd.sendAck()
   604  		return true, false
   605  	}
   606  
   607  	// If the sequence number range is outside the acceptable range or
   608  	// carries data then just send an ACK. This is according to RFC 793,
   609  	// page 37.
   610  	//
   611  	// NOTE: In TIME_WAIT the only acceptable sequence number is RcvNxt.
   612  	if segSeq != r.RcvNxt || segLen != 0 {
   613  		r.ep.snd.sendAck()
   614  	}
   615  	return false, false
   616  }