gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/rack.go

// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// wcDelayedACKTimeout is the recommended maximum delayed ACK timer
	// value as defined in the RFC. It stands for worst case delayed ACK
	// timer (WCDelAckT). When FlightSize is 1, PTO is inflated by
	// WCDelAckT time to compensate for a potentially long delayed ACK
	// timer at the receiver.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.
	wcDelayedACKTimeout = 200 * time.Millisecond

	// tcpRACKRecoveryThreshold is the number of loss recoveries for which
	// the reorder window is inflated; after that many recoveries the
	// reorder window is reset to its initial value of minRTT/4.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2.
	tcpRACKRecoveryThreshold = 16
)

// RACK is a loss detection algorithm used in TCP to detect packet loss and
// reordering using the transmission timestamps of packets instead of packet
// or sequence counts. To use RACK, SACK must be enabled on the connection.
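
// A rough sketch of how the pieces in this file fit together (an editorial
// summary, not part of the upstream comments): update and detectReorder are
// invoked from the ACK-processing path to refresh the RACK state as segments
// are (S)ACKed, detectLoss scans the write list and arms the reorder timer,
// and reorderTimerExpired/DoRecovery retransmit whatever RACK has marked
// lost. schedulePTO, probeTimerExpired and detectTLPRecovery implement the
// tail loss probe (TLP) side.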

// rackControl stores the RACK related fields.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1
//
// +stateify savable
type rackControl struct {
	stack.TCPRACKState

	// exitedRecovery indicates if the connection is exiting loss recovery.
	// This flag is set when the sender leaves recovery after receiving an
	// ACK and is reset while updating the reorder window.
	exitedRecovery bool

	// minRTT is the estimated minimum RTT of the connection.
	minRTT time.Duration

	// tlpRxtOut indicates whether there is an unacknowledged
	// TLP retransmission.
	tlpRxtOut bool

	// tlpHighRxt is the value of sender.SndNxt at the time of sending
	// a TLP retransmission.
	tlpHighRxt seqnum.Value

	// snd is a reference to the sender.
	snd *sender
}

// init initializes RACK specific fields.
func (rc *rackControl) init(snd *sender, iss seqnum.Value) {
	rc.FACK = iss
	rc.ReoWndIncr = 1
	rc.snd = snd
}

// update updates the RACK related fields when an ACK has been received.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2
func (rc *rackControl) update(seg *segment, ackSeg *segment) {
	rtt := rc.snd.ep.stack.Clock().NowMonotonic().Sub(seg.xmitTime)

	// If the ACK is for a retransmitted packet, do not update if it is a
	// spurious inference, which is determined by the checks below:
	// 1. When the timestamp option is available, if the echoed timestamp
	// (TSEcr) is less than the timestamp value corresponding to the
	// transmit time of the most recent retransmission.
	// 2. When the RTT calculated for the packet is less than the minimum
	// RTT observed on the connection.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
	// step 2
	if seg.xmitCount > 1 {
		if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 {
			if ackSeg.parsedOptions.TSEcr < rc.snd.ep.tsVal(seg.xmitTime) {
				return
			}
		}
		if rtt < rc.minRTT {
			return
		}
	}

	rc.RTT = rtt

	// The sender can either track a simple global minimum of all RTT
	// measurements from the connection, or a windowed min-filtered value
	// of recent RTT measurements. This implementation keeps track of the
	// simple global minimum of all RTTs for the connection.
	if rtt < rc.minRTT || rc.minRTT == 0 {
		rc.minRTT = rtt
	}

	// Update rc.XmitTime and rc.EndSequence to the transmit time and
	// ending sequence number of the packet which has been acknowledged
	// most recently.
	endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
	if rc.XmitTime.Before(seg.xmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) {
		rc.XmitTime = seg.xmitTime
		rc.EndSequence = endSeq
	}
}
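
// Illustrative example (hypothetical numbers, not from the code above): if a
// segment was retransmitted (xmitCount > 1) and the measured rtt is 5ms while
// rc.minRTT is 20ms, the sample almost certainly reflects the original
// transmission rather than the retransmission, so update returns without
// touching the RACK state. A 30ms sample would instead update rc.RTT and,
// if it were a new minimum, rc.minRTT as well.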

// detectReorder detects if packet reordering has been observed.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
//   - Step 3: Detect data segment reordering.
//     To detect reordering, the sender looks for original data segments being
//     delivered out of order. To detect such cases, the sender tracks the
//     highest sequence selectively or cumulatively acknowledged in the RACK.fack
//     variable. The name "fack" stands for the most "Forward ACK" (this term is
//     adopted from [FACK]). If a never retransmitted segment that's below
//     RACK.fack is (selectively or cumulatively) acknowledged, it has been
//     delivered out of order. The sender sets RACK.reord to TRUE if such a
//     segment is identified.
func (rc *rackControl) detectReorder(seg *segment) {
	endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
	if rc.FACK.LessThan(endSeq) {
		rc.FACK = endSeq
		return
	}

	if endSeq.LessThan(rc.FACK) && seg.xmitCount == 1 {
		rc.Reord = true
	}
}
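
// Illustrative example (hypothetical sequence numbers): if rc.FACK is 3000 and
// an ACK newly (S)ACKs a segment ending at 2000 that was transmitted only once
// (xmitCount == 1), that original segment was delivered after data above it,
// so rc.Reord is set to true. An ACK for a segment ending at 3500 would simply
// advance rc.FACK to 3500.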

// setDSACKSeen records whether a DSACK block has been seen.
func (rc *rackControl) setDSACKSeen(dsackSeen bool) {
	rc.DSACKSeen = dsackSeen
}

// shouldSchedulePTO dictates whether we should schedule a PTO or not.
// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
func (s *sender) shouldSchedulePTO() bool {
	// Schedule PTO only if RACK loss detection is enabled.
	return s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 &&
		// The connection supports SACK.
		s.ep.SACKPermitted &&
		// The connection is not in loss recovery.
		(s.state != tcpip.RTORecovery && s.state != tcpip.SACKRecovery) &&
		// The connection has no SACKed sequences in the SACK scoreboard.
		s.ep.scoreboard.Sacked() == 0
}

// schedulePTO schedules the probe timeout as defined in
// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
func (s *sender) schedulePTO() {
	pto := time.Second
	s.rtt.Lock()
	if s.rtt.TCPRTTState.SRTTInited && s.rtt.TCPRTTState.SRTT > 0 {
		pto = s.rtt.TCPRTTState.SRTT * 2
		if s.Outstanding == 1 {
			pto += wcDelayedACKTimeout
		}
	}
	s.rtt.Unlock()

	now := s.ep.stack.Clock().NowMonotonic()
	if s.resendTimer.enabled() {
		if now.Add(pto).After(s.resendTimer.target) {
			pto = s.resendTimer.target.Sub(now)
		}
		s.resendTimer.disable()
	}

	s.probeTimer.enable(pto)
}
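
// Illustrative example (hypothetical values): with SRTT = 50ms the PTO starts
// at 2*SRTT = 100ms; if exactly one segment is outstanding, wcDelayedACKTimeout
// (200ms) is added, giving 300ms. If the resend (RTO) timer is already armed
// and would fire sooner than now+PTO, the PTO is clamped to that earlier
// deadline before the probe timer is enabled.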

// probeTimerExpired is the same as TLP_send_probe() as defined in
// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.2.
//
// +checklocks:s.ep.mu
func (s *sender) probeTimerExpired() tcpip.Error {
	if s.probeTimer.isUninitialized() || !s.probeTimer.checkExpiration() {
		return nil
	}

	var dataSent bool
	if s.writeNext != nil && s.writeNext.xmitCount == 0 && s.Outstanding < s.SndCwnd {
		dataSent = s.maybeSendSegment(s.writeNext, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd))
		if dataSent {
			s.Outstanding += s.pCount(s.writeNext, s.MaxPayloadSize)
			s.updateWriteNext(s.writeNext.Next())
		}
	}

	if !dataSent && !s.rc.tlpRxtOut {
		var highestSeqXmit *segment
		for highestSeqXmit = s.writeList.Front(); highestSeqXmit != nil; highestSeqXmit = highestSeqXmit.Next() {
			if highestSeqXmit.xmitCount == 0 {
				// Nothing in writeList is transmitted, no need to send a probe.
				highestSeqXmit = nil
				break
			}
			if highestSeqXmit.Next() == nil || highestSeqXmit.Next().xmitCount == 0 {
				// Either everything in writeList has been transmitted or the next
				// sequence has not been transmitted. Either way this is the highest
				// sequence segment that was transmitted.
				break
			}
		}

		if highestSeqXmit != nil {
			dataSent = s.maybeSendSegment(highestSeqXmit, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd))
			if dataSent {
				s.rc.tlpRxtOut = true
				s.rc.tlpHighRxt = s.SndNxt
			}
		}
	}

	// Whether or not the probe was sent, the sender must arm the resend timer,
	// not the probe timer. This ensures that the sender does not send repeated,
	// back-to-back tail loss probes.
	s.postXmit(dataSent, false /* shouldScheduleProbe */)
	return nil
}
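
// Illustrative example (hypothetical state): if unsent data exists in the
// write list and the congestion window allows it, the probe sends new data.
// Otherwise the loop above picks the highest-sequence segment that has already
// been transmitted and retransmits it, recording tlpHighRxt = SndNxt so that
// detectTLPRecovery can later tell whether the probe itself repaired the loss.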

// detectTLPRecovery detects if recovery was accomplished by the loss probes
// and updates TLP state accordingly.
// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.3.
func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) {
	if !(s.ep.SACKPermitted && s.rc.tlpRxtOut) {
		return
	}

	// Step 1.
	if s.isDupAck(rcvdSeg) && ack == s.rc.tlpHighRxt {
		var sbAboveTLPHighRxt bool
		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
			if s.rc.tlpHighRxt.LessThan(sb.End) {
				sbAboveTLPHighRxt = true
				break
			}
		}
		if !sbAboveTLPHighRxt {
			// TLP episode is complete.
			s.rc.tlpRxtOut = false
		}
	}

	if s.rc.tlpRxtOut && s.rc.tlpHighRxt.LessThanEq(ack) {
		// TLP episode is complete.
		s.rc.tlpRxtOut = false
		if !checkDSACK(rcvdSeg) {
			// Step 2. Either the original packet or the retransmission (in the
			// form of a probe) was lost. Invoke a congestion control response
			// equivalent to fast recovery.
			s.cc.HandleLossDetected()
			s.enterRecovery()
			s.leaveRecovery()
		}
	}
}
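
// Illustrative example (hypothetical sequence numbers): suppose a probe
// retransmitted the last segment and tlpHighRxt is 5000. A later ACK with
// ack == 5000 carrying a DSACK for the probed range means the original segment
// arrived and only the probe was a duplicate, so no congestion response is
// needed. The same ACK without a DSACK means either the original or the probe
// was lost, so the sender invokes HandleLossDetected and performs the
// enterRecovery/leaveRecovery sequence above.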

// updateRACKReorderWindow updates the reorder window.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
//   - Step 4: Update RACK reordering window
//     To handle the prevalent small degree of reordering, RACK.reo_wnd serves as
//     an allowance for settling time before marking a packet lost. RACK starts
//     initially with a conservative window of min_RTT/4. If no reordering has
//     been observed, RACK uses a reo_wnd of zero during loss recovery, in order
//     to retransmit quickly, or when the number of DUPACKs exceeds the classic
//     DUPACKthreshold.
func (rc *rackControl) updateRACKReorderWindow() {
	dsackSeen := rc.DSACKSeen
	snd := rc.snd

	// React to DSACK once per round trip.
	// If SND.UNA < RACK.rtt_seq:
	//   RACK.dsack = false
	if snd.SndUna.LessThan(rc.RTTSeq) {
		dsackSeen = false
	}

	// If RACK.dsack:
	//   RACK.reo_wnd_incr += 1
	//   RACK.dsack = false
	//   RACK.rtt_seq = SND.NXT
	//   RACK.reo_wnd_persist = 16
	if dsackSeen {
		rc.ReoWndIncr++
		dsackSeen = false
		rc.RTTSeq = snd.SndNxt
		rc.ReoWndPersist = tcpRACKRecoveryThreshold
	} else if rc.exitedRecovery {
		// Else if exiting loss recovery:
		//   RACK.reo_wnd_persist -= 1
		//   If RACK.reo_wnd_persist <= 0:
		//      RACK.reo_wnd_incr = 1
		rc.ReoWndPersist--
		if rc.ReoWndPersist <= 0 {
			rc.ReoWndIncr = 1
		}
		rc.exitedRecovery = false
	}

	// Reorder window is zero during loss recovery, or when the number of
	// DUPACKs exceeds the classic DUPACKthreshold.
	// If RACK.reord is FALSE:
	//   If in loss recovery:  (If in fast or timeout recovery)
	//      RACK.reo_wnd = 0
	//      Return
	//   Else if RACK.pkts_sacked >= RACK.dupthresh:
	//     RACK.reo_wnd = 0
	//     return
	if !rc.Reord {
		if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery {
			rc.ReoWnd = 0
			return
		}

		if snd.SackedOut >= nDupAckThreshold {
			rc.ReoWnd = 0
			return
		}
	}

	// Calculate reorder window.
	// RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr
	// RACK.reo_wnd = min(RACK.reo_wnd, SRTT)
	snd.rtt.Lock()
	srtt := snd.rtt.TCPRTTState.SRTT
	snd.rtt.Unlock()
	rc.ReoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.ReoWndIncr))
	if srtt < rc.ReoWnd {
		rc.ReoWnd = srtt
	}
}
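
// Illustrative example (hypothetical values): with minRTT = 40ms and
// ReoWndIncr = 1 the reorder window is 40ms/4 = 10ms. If a DSACK was seen this
// round trip, ReoWndIncr becomes 2 and the window grows to 20ms for the next
// tcpRACKRecoveryThreshold (16) loss recoveries. The result is always capped
// at SRTT, and it is forced to zero when no reordering has been observed and
// the sender is in RTO/SACK recovery or has at least nDupAckThreshold SACKed
// packets.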

// exitRecovery records that the connection is exiting loss recovery; the flag
// is consumed the next time the reorder window is updated.
func (rc *rackControl) exitRecovery() {
	rc.exitedRecovery = true
}

// detectLoss marks segments as lost if the reordering window has elapsed and
// they have not yet been acknowledged. It will also arm the reorder timer.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 Step 5.
func (rc *rackControl) detectLoss(rcvTime tcpip.MonotonicTime) int {
	var timeout time.Duration
	numLost := 0
	for seg := rc.snd.writeList.Front(); seg != nil && seg.xmitCount != 0; seg = seg.Next() {
		if rc.snd.ep.scoreboard.IsSACKED(seg.sackBlock()) {
			continue
		}

		// A segment that was already marked lost and has not been
		// retransmitted yet remains lost; count it without
		// re-evaluating it.
		if seg.lost && seg.xmitCount == 1 {
			numLost++
			continue
		}

		endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
		if seg.xmitTime.Before(rc.XmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) {
			timeRemaining := seg.xmitTime.Sub(rcvTime) + rc.RTT + rc.ReoWnd
			if timeRemaining <= 0 {
				seg.lost = true
				numLost++
			} else if timeRemaining > timeout {
				timeout = timeRemaining
			}
		}
	}

	if timeout != 0 && !rc.snd.reorderTimer.enabled() {
		rc.snd.reorderTimer.enable(timeout)
	}
	return numLost
}
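
// Illustrative example (hypothetical values): take rc.RTT = 50ms and
// rc.ReoWnd = 10ms. A still-unSACKed segment that was sent before the most
// recently delivered one, 70ms before the ACK arrival time, has
// timeRemaining = -70ms + 50ms + 10ms = -10ms <= 0, so it is marked lost. One
// sent 40ms ago has timeRemaining = 20ms, so instead of being marked lost it
// contributes to the timeout used to arm the reorder timer.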

// reorderTimerExpired will retransmit the segments which have not been acked
// before the reorder timer expired.
//
// +checklocks:rc.snd.ep.mu
func (rc *rackControl) reorderTimerExpired() tcpip.Error {
	if rc.snd.reorderTimer.isUninitialized() || !rc.snd.reorderTimer.checkExpiration() {
		return nil
	}

	numLost := rc.detectLoss(rc.snd.ep.stack.Clock().NowMonotonic())
	if numLost == 0 {
		return nil
	}

	fastRetransmit := false
	if !rc.snd.FastRecovery.Active {
		rc.snd.cc.HandleLossDetected()
		rc.snd.enterRecovery()
		fastRetransmit = true
	}

	rc.DoRecovery(nil, fastRetransmit)
	return nil
}

// DoRecovery implements lossRecovery.DoRecovery.
//
// +checklocks:rc.snd.ep.mu
func (rc *rackControl) DoRecovery(_ *segment, fastRetransmit bool) {
	snd := rc.snd
	if fastRetransmit {
		snd.resendSegment()
	}

	var dataSent bool
	// Iterate the writeList and retransmit the segments which are marked
	// as lost by RACK.
	for seg := snd.writeList.Front(); seg != nil && seg.xmitCount > 0; seg = seg.Next() {
		if seg == snd.writeNext {
			break
		}

		if !seg.lost {
			continue
		}

		// Reset seg.lost as it is already SACKed.
		if snd.ep.scoreboard.IsSACKED(seg.sackBlock()) {
			seg.lost = false
			continue
		}

		// Check the congestion window after entering recovery.
		if snd.Outstanding >= snd.SndCwnd {
			break
		}

		if sent := snd.maybeSendSegment(seg, int(snd.ep.scoreboard.SMSS()), snd.SndUna.Add(snd.SndWnd)); !sent {
			break
		}
		dataSent = true
		snd.Outstanding += snd.pCount(seg, snd.MaxPayloadSize)
	}

	snd.postXmit(dataSent, true /* shouldScheduleProbe */)
}