github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/transport/tcp/snd.go

github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/transport/tcp/snd.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"sort"
    21  	"time"
    22  
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/seqnum"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack"
    28  )
    29  
    30  const (
    31  	// MinRTO is the minimum allowed value for the retransmit timeout.
    32  	MinRTO = 200 * time.Millisecond
    33  
    34  	// MaxRTO is the maximum allowed value for the retransmit timeout.
    35  	MaxRTO = 120 * time.Second
    36  
    37  	// MinSRTT is the minimum allowed value for smoothed RTT.
    38  	MinSRTT = 1 * time.Millisecond
    39  
    40  	// InitialCwnd is the initial congestion window.
    41  	InitialCwnd = 10
    42  
    43  	// nDupAckThreshold is the number of duplicate ACK's required
    44  	// before fast-retransmit is entered.
    45  	nDupAckThreshold = 3
    46  
    47  	// MaxRetries is the maximum number of probe retries sender does
    48  	// before timing out the connection.
    49  	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
    50  	MaxRetries = 15
    51  )
    52  
// congestionControl is an interface that must be implemented by any supported
// congestion control algorithm. The sender keeps the selected implementation
// in sender.cc and calls into it as loss, timeout and acknowledgement events
// are processed.
type congestionControl interface {
	// HandleLossDetected is invoked when loss is detected by RACK, or when
	// sender.dupAckCount >= nDupAckThreshold, just before entering fast
	// retransmit.
	HandleLossDetected()

	// HandleRTOExpired is invoked when the retransmit timer expires.
	HandleRTOExpired()

	// Update is invoked when processing inbound acks. It is passed the
	// number of packets that were acked by the most recent cumulative
	// acknowledgement.
	Update(packetsAcked int)

	// PostRecovery is invoked when the sender is exiting a fast retransmit/
	// recovery phase. This gives congestion control algorithms a chance to
	// adjust their state when leaving recovery.
	PostRecovery()
}
    74  
// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm. The sender keeps the selected implementation in
// sender.lr.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted.
	//
	// rcvdSeg is the segment carrying the cumulative or selective ACK.
	// fastRetransmit reports whether the connection entered fast
	// retransmit with this ACK, in which case the first unacknowledged
	// segment is to be retransmitted.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}
    85  
// sender holds the state necessary to send TCP segments.
//
// +stateify savable
type sender struct {
	stack.TCPSenderState

	// ep is the endpoint this sender was created for (see newSender).
	ep *endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeList is the list of segments queued by the sender; its front is
	// treated as the first unacknowledged segment. writeNext is the next
	// segment in writeList to be sent. resendTimer is the retransmit timer,
	// whose expiry is handled by retransmitTimerExpired.
	writeNext   *segment
	writeList   segmentList
	resendTimer timer `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer `state:"nosave"`

	// probeTimer is used to schedule PTO for RACK TLP algorithm.
	probeTimer timer `state:"nosave"`

	// spuriousRecovery indicates whether the sender entered recovery
	// spuriously as described in RFC3522 Section 3.2.
	spuriousRecovery bool

	// retransmitTS is the timestamp at which the sender sends retransmitted
	// segment after entering an RTO for the first time as described in
	// RFC3522 Section 3.2.
	retransmitTS uint32
}
   155  
// rtt is a synchronization wrapper used to appease stateify. It pairs the
// RTT estimator state with the mutex that updateRTO holds while reading and
// updating it. See the comment in sender, where it is used.
//
// +stateify savable
type rtt struct {
	// Mutex guards TCPRTTState; it is excluded from saved state.
	sync.Mutex `state:"nosave"`

	stack.TCPRTTState
}
   165  
   166  // +checklocks:ep.mu
   167  func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
   168  	// The sender MUST reduce the TCP data length to account for any IP or
   169  	// TCP options that it is including in the packets that it sends.
   170  	// See: https://tools.ietf.org/html/rfc6691#section-2
   171  	maxPayloadSize := int(mss) - ep.maxOptionSize()
   172  
   173  	s := &sender{
   174  		ep: ep,
   175  		TCPSenderState: stack.TCPSenderState{
   176  			SndWnd:           sndWnd,
   177  			SndUna:           iss + 1,
   178  			SndNxt:           iss + 1,
   179  			RTTMeasureSeqNum: iss + 1,
   180  			LastSendTime:     ep.stack.Clock().NowMonotonic(),
   181  			MaxPayloadSize:   maxPayloadSize,
   182  			MaxSentAck:       irs + 1,
   183  			FastRecovery: stack.TCPFastRecoveryState{
   184  				// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
   185  				Last:      iss,
   186  				HighRxt:   iss,
   187  				RescueRxt: iss,
   188  			},
   189  			RTO: 1 * time.Second,
   190  		},
   191  		gso: ep.gso.Type != stack.GSONone,
   192  	}
   193  
   194  	if s.gso {
   195  		s.ep.gso.MSS = uint16(maxPayloadSize)
   196  	}
   197  
   198  	s.cc = s.initCongestionControl(ep.cc)
   199  	s.lr = s.initLossRecovery()
   200  	s.rc.init(s, iss)
   201  
   202  	// A negative sndWndScale means that no scaling is in use, otherwise we
   203  	// store the scaling value.
   204  	if sndWndScale > 0 {
   205  		s.SndWndScale = uint8(sndWndScale)
   206  	}
   207  
   208  	s.resendTimer.init(s.ep.stack.Clock(), maybeFailTimerHandler(s.ep, s.retransmitTimerExpired))
   209  	s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired))
   210  	s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired))
   211  
   212  	s.ep.AssertLockHeld(ep)
   213  	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
   214  	// Initialize SACK Scoreboard after updating max payload size as we use
   215  	// the maxPayloadSize as the smss when determining if a segment is lost
   216  	// etc.
   217  	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)
   218  
   219  	// Get Stack wide config.
   220  	var minRTO tcpip.TCPMinRTOOption
   221  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
   222  		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
   223  	}
   224  	s.minRTO = time.Duration(minRTO)
   225  
   226  	var maxRTO tcpip.TCPMaxRTOOption
   227  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
   228  		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
   229  	}
   230  	s.maxRTO = time.Duration(maxRTO)
   231  
   232  	var maxRetries tcpip.TCPMaxRetriesOption
   233  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
   234  		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
   235  	}
   236  	s.maxRetries = uint32(maxRetries)
   237  
   238  	return s
   239  }
   240  
   241  // initCongestionControl initializes the specified congestion control module and
   242  // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to
   243  // their initial values.
   244  func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
   245  	s.SndCwnd = InitialCwnd
   246  	// Set sndSsthresh to the maximum int value, which depends on the
   247  	// platform.
   248  	s.Ssthresh = int(^uint(0) >> 1)
   249  
   250  	switch congestionControlName {
   251  	case ccCubic:
   252  		return newCubicCC(s)
   253  	case ccReno:
   254  		fallthrough
   255  	default:
   256  		return newRenoCC(s)
   257  	}
   258  }
   259  
   260  // initLossRecovery initiates the loss recovery algorithm for the sender.
   261  func (s *sender) initLossRecovery() lossRecovery {
   262  	if s.ep.SACKPermitted {
   263  		return newSACKRecovery(s)
   264  	}
   265  	return newRenoRecovery(s)
   266  }
   267  
   268  // updateMaxPayloadSize updates the maximum payload size based on the given
   269  // MTU. If this is in response to "packet too big" control packets (indicated
   270  // by the count argument), it also reduces the number of outstanding packets and
   271  // attempts to retransmit the first packet above the MTU size.
   272  // +checklocks:s.ep.mu
   273  func (s *sender) updateMaxPayloadSize(mtu, count int) {
   274  	m := mtu - header.TCPMinimumSize
   275  
   276  	m -= s.ep.maxOptionSize()
   277  
   278  	// We don't adjust up for now.
   279  	if m >= s.MaxPayloadSize {
   280  		return
   281  	}
   282  
   283  	// Make sure we can transmit at least one byte.
   284  	if m <= 0 {
   285  		m = 1
   286  	}
   287  
   288  	oldMSS := s.MaxPayloadSize
   289  	s.MaxPayloadSize = m
   290  	if s.gso {
   291  		s.ep.gso.MSS = uint16(m)
   292  	}
   293  
   294  	if count == 0 {
   295  		// updateMaxPayloadSize is also called when the sender is created.
   296  		// and there is no data to send in such cases. Return immediately.
   297  		return
   298  	}
   299  
   300  	// Update the scoreboard's smss to reflect the new lowered
   301  	// maxPayloadSize.
   302  	s.ep.scoreboard.smss = uint16(m)
   303  
   304  	s.Outstanding -= count
   305  	if s.Outstanding < 0 {
   306  		s.Outstanding = 0
   307  	}
   308  
   309  	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
   310  	// if it is already before such a packet.
   311  	nextSeg := s.writeNext
   312  	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
   313  		if seg == s.writeNext {
   314  			// We got to writeNext before we could find a segment
   315  			// exceeding the MTU.
   316  			break
   317  		}
   318  
   319  		if nextSeg == s.writeNext && seg.payloadSize() > m {
   320  			// We found a segment exceeding the MTU. Rewind
   321  			// writeNext and try to retransmit it.
   322  			nextSeg = seg
   323  		}
   324  
   325  		if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   326  			// Update sackedOut for new maximum payload size.
   327  			s.SackedOut -= s.pCount(seg, oldMSS)
   328  			s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
   329  		}
   330  	}
   331  
   332  	// Since we likely reduced the number of outstanding packets, we may be
   333  	// ready to send some more.
   334  	s.updateWriteNext(nextSeg)
   335  	s.sendData()
   336  }
   337  
   338  // sendAck sends an ACK segment.
   339  // +checklocks:s.ep.mu
   340  func (s *sender) sendAck() {
   341  	s.sendEmptySegment(header.TCPFlagAck, s.SndNxt)
   342  }
   343  
   344  // updateRTO updates the retransmit timeout when a new roud-trip time is
   345  // available. This is done in accordance with section 2 of RFC 6298.
   346  func (s *sender) updateRTO(rtt time.Duration) {
   347  	s.rtt.Lock()
   348  	if !s.rtt.TCPRTTState.SRTTInited {
   349  		s.rtt.TCPRTTState.RTTVar = rtt / 2
   350  		s.rtt.TCPRTTState.SRTT = rtt
   351  		s.rtt.TCPRTTState.SRTTInited = true
   352  	} else {
   353  		diff := s.rtt.TCPRTTState.SRTT - rtt
   354  		if diff < 0 {
   355  			diff = -diff
   356  		}
   357  		// Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when
   358  		// no timestamps are available.
   359  		if !s.ep.SendTSOk {
   360  			s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4
   361  			s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8
   362  		} else {
   363  			// When we are taking RTT measurements of every ACK then
   364  			// we need to use a modified method as specified in
   365  			// https://tools.ietf.org/html/rfc7323#appendix-G
   366  			if s.Outstanding == 0 {
   367  				s.rtt.Unlock()
   368  				return
   369  			}
   370  			// Netstack measures congestion window/inflight all in
   371  			// terms of packets and not bytes. This is similar to
   372  			// how linux also does cwnd and inflight. In practice
   373  			// this approximation works as expected.
   374  			expectedSamples := math.Ceil(float64(s.Outstanding) / 2)
   375  
   376  			// alpha & beta values are the original values as recommended in
   377  			// https://tools.ietf.org/html/rfc6298#section-2.3.
   378  			const alpha = 0.125
   379  			const beta = 0.25
   380  
   381  			alphaPrime := alpha / expectedSamples
   382  			betaPrime := beta / expectedSamples
   383  			rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds()
   384  			srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds()
   385  			s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second))
   386  			s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second))
   387  		}
   388  	}
   389  
   390  	if s.rtt.TCPRTTState.SRTT < MinSRTT {
   391  		s.rtt.TCPRTTState.SRTT = MinSRTT
   392  	}
   393  
   394  	s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar
   395  	s.rtt.Unlock()
   396  	if s.RTO < s.minRTO {
   397  		s.RTO = s.minRTO
   398  	}
   399  	if s.RTO > s.maxRTO {
   400  		s.RTO = s.maxRTO
   401  	}
   402  }
   403  
   404  // resendSegment resends the first unacknowledged segment.
   405  // +checklocks:s.ep.mu
   406  func (s *sender) resendSegment() {
   407  	// Don't use any segments we already sent to measure RTT as they may
   408  	// have been affected by packets being lost.
   409  	s.RTTMeasureSeqNum = s.SndNxt
   410  
   411  	// Resend the segment.
   412  	if seg := s.writeList.Front(); seg != nil {
   413  		if seg.payloadSize() > s.MaxPayloadSize {
   414  			s.splitSeg(seg, s.MaxPayloadSize)
   415  		}
   416  
   417  		// See: RFC 6675 section 5 Step 4.3
   418  		//
   419  		// To prevent retransmission, set both the HighRXT and RescueRXT
   420  		// to the highest sequence number in the retransmitted segment.
   421  		s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1
   422  		s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1
   423  		s.sendSegment(seg)
   424  		s.ep.stack.Stats().TCP.FastRetransmit.Increment()
   425  		s.ep.stats.SendErrors.FastRetransmit.Increment()
   426  
   427  		// Run SetPipe() as per RFC 6675 section 5 Step 4.4
   428  		s.SetPipe()
   429  	}
   430  }
   431  
   432  // retransmitTimerExpired is called when the retransmit timer expires, and
   433  // unacknowledged segments are assumed lost, and thus need to be resent.
   434  // Returns true if the connection is still usable, or false if the connection
   435  // is deemed lost.
   436  // +checklocks:s.ep.mu
   437  func (s *sender) retransmitTimerExpired() tcpip.Error {
   438  	// Check if the timer actually expired or if it's a spurious wake due
   439  	// to a previously orphaned runtime timer.
   440  	if s.resendTimer.isZero() || !s.resendTimer.checkExpiration() {
   441  		return nil
   442  	}
   443  
   444  	// Initialize the variables used to detect spurious recovery after
   445  	// entering RTO.
   446  	//
   447  	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
   448  	s.spuriousRecovery = false
   449  	s.retransmitTS = 0
   450  
   451  	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
   452  	// when writeList is empty. Remove this once we have a proper fix for this
   453  	// issue.
   454  	if s.writeList.Front() == nil {
   455  		return nil
   456  	}
   457  
   458  	s.ep.stack.Stats().TCP.Timeouts.Increment()
   459  	s.ep.stats.SendErrors.Timeouts.Increment()
   460  
   461  	// Set TLPRxtOut to false according to
   462  	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
   463  	s.rc.tlpRxtOut = false
   464  
   465  	// Give up if we've waited more than a minute since the last resend or
   466  	// if a user time out is set and we have exceeded the user specified
   467  	// timeout since the first retransmission.
   468  	uto := s.ep.userTimeout
   469  
   470  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
   471  		// We store the original xmitTime of the segment that we are
   472  		// about to retransmit as the retransmission time. This is
   473  		// required as by the time the retransmitTimer has expired the
   474  		// segment has already been sent and unacked for the RTO at the
   475  		// time the segment was sent.
   476  		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
   477  	}
   478  
   479  	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
   480  	remaining := s.maxRTO
   481  	if uto != 0 {
   482  		// Cap to the user specified timeout if one is specified.
   483  		remaining = uto - elapsed
   484  	}
   485  
   486  	// Always honor the user-timeout irrespective of whether the zero
   487  	// window probes were acknowledged.
   488  	// net/ipv4/tcp_timer.c::tcp_probe_timer()
   489  	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
   490  		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
   491  		return &tcpip.ErrTimeout{}
   492  	}
   493  
   494  	// Set new timeout. The timer will be restarted by the call to sendData
   495  	// below.
   496  	s.RTO *= 2
   497  	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
   498  	if s.RTO > s.maxRTO {
   499  		s.RTO = s.maxRTO
   500  	}
   501  
   502  	// Cap RTO to remaining time.
   503  	if s.RTO > remaining {
   504  		s.RTO = remaining
   505  	}
   506  
   507  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
   508  	//
   509  	// Retransmit timeouts:
   510  	//     After a retransmit timeout, record the highest sequence number
   511  	//     transmitted in the variable recover, and exit the fast recovery
   512  	//     procedure if applicable.
   513  	s.FastRecovery.Last = s.SndNxt - 1
   514  
   515  	if s.FastRecovery.Active {
   516  		// We were attempting fast recovery but were not successful.
   517  		// Leave the state. We don't need to update ssthresh because it
   518  		// has already been updated when entered fast-recovery.
   519  		s.leaveRecovery()
   520  	}
   521  
   522  	// Record retransmitTS if the sender is not in recovery as per:
   523  	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
   524  	s.recordRetransmitTS()
   525  
   526  	s.state = tcpip.RTORecovery
   527  	s.cc.HandleRTOExpired()
   528  
   529  	// Mark the next segment to be sent as the first unacknowledged one and
   530  	// start sending again. Set the number of outstanding packets to 0 so
   531  	// that we'll be able to retransmit.
   532  	//
   533  	// We'll keep on transmitting (or retransmitting) as we get acks for
   534  	// the data we transmit.
   535  	s.Outstanding = 0
   536  
   537  	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
   538  	//
   539  	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
   540  	//  discard data that has already been selectively acknowledged. As a
   541  	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
   542  	//  information gathered from a receiver upon a retransmission timeout
   543  	//  (RTO) "since the timeout might indicate that the data receiver has
   544  	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
   545  	//  information in determining which data to retransmit."
   546  	//
   547  	// NOTE: We take the stricter interpretation and just expunge all
   548  	// information as we lack more rigorous checks to validate if the SACK
   549  	// information is usable after an RTO.
   550  	s.ep.scoreboard.Reset()
   551  	s.updateWriteNext(s.writeList.Front())
   552  
   553  	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
   554  	// zero receive window after retransmission interval and we have data to
   555  	// send.
   556  	if s.zeroWindowProbing {
   557  		s.sendZeroWindowProbe()
   558  		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
   559  		// indefinitely.  As long as the receiving TCP continues to send
   560  		// acknowledgments in response to the probe segments, the sending TCP
   561  		// MUST allow the connection to stay open.
   562  		return nil
   563  	}
   564  
   565  	seg := s.writeNext
   566  	// RFC 1122 4.2.3.5: Close the connection when the number of
   567  	// retransmissions for this segment is beyond a limit.
   568  	if seg != nil && seg.xmitCount > s.maxRetries {
   569  		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
   570  		return &tcpip.ErrTimeout{}
   571  	}
   572  
   573  	s.sendData()
   574  
   575  	return nil
   576  }
   577  
   578  // pCount returns the number of packets in the segment. Due to GSO, a segment
   579  // can be composed of multiple packets.
   580  func (s *sender) pCount(seg *segment, maxPayloadSize int) int {
   581  	size := seg.payloadSize()
   582  	if size == 0 {
   583  		return 1
   584  	}
   585  
   586  	return (size-1)/maxPayloadSize + 1
   587  }
   588  
   589  // splitSeg splits a given segment at the size specified and inserts the
   590  // remainder as a new segment after the current one in the write list.
   591  func (s *sender) splitSeg(seg *segment, size int) {
   592  	if seg.payloadSize() <= size {
   593  		return
   594  	}
   595  	// Split this segment up.
   596  	nSeg := seg.clone()
   597  	nSeg.pkt.Data().TrimFront(size)
   598  	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
   599  	s.writeList.InsertAfter(seg, nSeg)
   600  
   601  	// The segment being split does not carry PUSH flag because it is
   602  	// followed by the newly split segment.
   603  	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
   604  	// segment (i.e., when there is no more queued data to be sent).
   605  	// Linux removes PSH flag only when the segment is being split over MSS
   606  	// and retains it when we are splitting the segment over lack of sender
   607  	// window space.
   608  	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
   609  	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
   610  	if seg.payloadSize() > s.MaxPayloadSize {
   611  		seg.flags ^= header.TCPFlagPsh
   612  	}
   613  	seg.pkt.Data().CapLength(size)
   614  }
   615  
   616  // NextSeg implements the RFC6675 NextSeg() operation.
   617  //
   618  // NextSeg starts scanning the writeList starting from nextSegHint and returns
   619  // the hint to be passed on the next call to NextSeg. This is required to avoid
   620  // iterating the write list repeatedly when NextSeg is invoked in a loop during
   621  // recovery. The returned hint will be nil if there are no more segments that
   622  // can match rules defined by NextSeg operation in RFC6675.
   623  //
   624  // rescueRtx will be true only if nextSeg is a rescue retransmission as
   625  // described by Step 4) of the NextSeg algorithm.
   626  func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
   627  	var s3 *segment
   628  	var s4 *segment
   629  	// Step 1.
   630  	for seg := nextSegHint; seg != nil; seg = seg.Next() {
   631  		// Stop iteration if we hit a segment that has never been
   632  		// transmitted (i.e. either it has no assigned sequence number
   633  		// or if it does have one, it's >= the next sequence number
   634  		// to be sent [i.e. >= s.sndNxt]).
   635  		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
   636  			hint = nil
   637  			break
   638  		}
   639  		segSeq := seg.sequenceNumber
   640  		if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) {
   641  			s.splitSeg(seg, int(smss))
   642  		}
   643  
   644  		// See RFC 6675 Section 4
   645  		//
   646  		//     1. If there exists a smallest unSACKED sequence number
   647  		//     'S2' that meets the following 3 criteria for determinig
   648  		//     loss, the sequence range of one segment of up to SMSS
   649  		//     octects starting with S2 MUST be returned.
   650  		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
   651  			// NextSeg():
   652  			//
   653  			//    (1.a) S2 is greater than HighRxt
   654  			//    (1.b) S2 is less than highest octect covered by
   655  			//    any received SACK.
   656  			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
   657  				// NextSeg():
   658  				//     (1.c) IsLost(S2) returns true.
   659  				if s.ep.scoreboard.IsLost(segSeq) {
   660  					return seg, seg.Next(), false
   661  				}
   662  
   663  				// NextSeg():
   664  				//
   665  				// (3): If the conditions for rules (1) and (2)
   666  				// fail, but there exists an unSACKed sequence
   667  				// number S3 that meets the criteria for
   668  				// detecting loss given in steps 1.a and 1.b
   669  				// above (specifically excluding (1.c)) then one
   670  				// segment of upto SMSS octets starting with S3
   671  				// SHOULD be returned.
   672  				if s3 == nil {
   673  					s3 = seg
   674  					hint = seg.Next()
   675  				}
   676  			}
   677  			// NextSeg():
   678  			//
   679  			//     (4) If the conditions for (1), (2) and (3) fail,
   680  			//     but there exists outstanding unSACKED data, we
   681  			//     provide the opportunity for a single "rescue"
   682  			//     retransmission per entry into loss recovery. If
   683  			//     HighACK is greater than RescueRxt (or RescueRxt
   684  			//     is undefined), then one segment of upto SMSS
   685  			//     octects that MUST include the highest outstanding
   686  			//     unSACKed sequence number SHOULD be returned, and
   687  			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
   688  			//     be updated.
   689  			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
   690  				if s4 != nil {
   691  					if s4.sequenceNumber.LessThan(segSeq) {
   692  						s4 = seg
   693  					}
   694  				} else {
   695  					s4 = seg
   696  				}
   697  			}
   698  		}
   699  	}
   700  
   701  	// If we got here then no segment matched step (1).
   702  	// Step (2): "If no sequence number 'S2' per rule (1)
   703  	// exists but there exists available unsent data and the
   704  	// receiver's advertised window allows, the sequence
   705  	// range of one segment of up to SMSS octets of
   706  	// previously unsent data starting with sequence number
   707  	// HighData+1 MUST be returned."
   708  	for seg := s.writeNext; seg != nil; seg = seg.Next() {
   709  		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
   710  			continue
   711  		}
   712  		// We do not split the segment here to <= smss as it has
   713  		// potentially not been assigned a sequence number yet.
   714  		return seg, nil, false
   715  	}
   716  
   717  	if s3 != nil {
   718  		return s3, hint, false
   719  	}
   720  
   721  	return s4, nil, true
   722  }
   723  
// maybeSendSegment tries to send the specified segment and either coalesces
// other segments into this one or splits the specified segment based on the
// lower of the specified limit value or the receivers window size specified by
// end.
//
// It returns true when seg was handed to sendSegment, and false when the
// segment was held back (Nagle, TCP_CORK, or not enough room before end).
// +checklocks:s.ep.mu
func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
	// We abuse the flags field to determine if we have already
	// assigned a sequence number to this segment.
	if !s.isAssignedSequenceNumber(seg) {
		// Merge segments if allowed.
		if seg.payloadSize() != 0 {
			// Space left for this segment: the distance from SndNxt to
			// end, capped by limit.
			available := int(s.SndNxt.Size(end))
			if available > limit {
				available = limit
			}

			// nextTooBig indicates that the next segment was too
			// large to entirely fit in the current segment. It
			// would be possible to split the next segment and merge
			// the portion that fits, but unexpectedly splitting
			// segments can have user visible side-effects which can
			// break applications. For example, RFC 7766 section 8
			// says that the length and data of a DNS response
			// should be sent in the same TCP segment to avoid
			// triggering bugs in poorly written DNS
			// implementations.
			var nextTooBig bool
			// Each merged neighbour is removed from the write list, so
			// seg.Next() yields a fresh candidate on every iteration.
			for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() {
				if seg.payloadSize()+nSeg.payloadSize() > available {
					nextTooBig = true
					break
				}
				seg.merge(nSeg)
				s.writeList.Remove(nSeg)
				nSeg.DecRef()
			}
			if !nextTooBig && seg.payloadSize() < available {
				// Segment is not full.
				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
					// Nagle's algorithm. From Wikipedia:
					//   Nagle's algorithm works by
					//   combining a number of small
					//   outgoing messages and sending them
					//   all at once. Specifically, as long
					//   as there is a sent packet for which
					//   the sender has received no
					//   acknowledgment, the sender should
					//   keep buffering its output until it
					//   has a full packet's worth of
					//   output, thus allowing output to be
					//   sent all at once.
					return false
				}
				// With TCP_CORK, hold back until minimum of the available
				// send space and MSS.
				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
				// timeout.
				if seg.payloadSize() < s.MaxPayloadSize && s.ep.ops.GetCorkOption() {
					return false
				}
			}
		}

		// Assign flags. We don't do it above so that we can merge
		// additional data if Nagle holds the segment.
		seg.sequenceNumber = s.SndNxt
		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
	}

	// segEnd is the sequence number just past this segment; it is used
	// below to advance SndNxt.
	var segEnd seqnum.Value
	if seg.payloadSize() == 0 {
		// A segment without payload is treated as the FIN.
		if s.writeList.Back() != seg {
			panic("FIN segments must be the final segment in the write list.")
		}
		seg.flags = header.TCPFlagAck | header.TCPFlagFin
		// The FIN consumes one sequence number.
		segEnd = seg.sequenceNumber.Add(1)
		// Update the state to reflect that we have now
		// queued a FIN.
		s.ep.updateConnDirectionState(connDirectionStateSndClosed)
		switch s.ep.EndpointState() {
		case StateCloseWait:
			s.ep.setEndpointState(StateLastAck)
		default:
			s.ep.setEndpointState(StateFinWait1)
		}
	} else {
		// We're sending a non-FIN segment.
		if seg.flags&header.TCPFlagFin != 0 {
			panic("Netstack queues FIN segments without data.")
		}

		// The segment starts at or beyond end: nothing can be sent.
		if !seg.sequenceNumber.LessThan(end) {
			return false
		}

		available := int(seg.sequenceNumber.Size(end))
		if available == 0 {
			return false
		}

		// If the whole segment or at least 1MSS sized segment cannot
		// be accommodated in the receiver advertised window, skip
		// splitting and sending of the segment. ref:
		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
		//
		// Linux checks this for all segment transmits not triggered by
		// a probe timer. On this condition, it defers the segment split
		// and transmit to a short probe timer.
		//
		// ref: include/net/tcp.h::tcp_check_probe_timer()
		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
		//
		// Instead of defining a new transmit timer, we attempt to split
		// the segment right here if there are no pending segments. If
		// there are pending segments, segment transmits are deferred to
		// the retransmit timer handler.
		if s.SndUna != s.SndNxt {
			switch {
			case available >= seg.payloadSize():
				// OK to send, the whole segments fits in the
				// receiver's advertised window.
			case available >= s.MaxPayloadSize:
				// OK to send, at least 1 MSS sized segment fits
				// in the receiver's advertised window.
			default:
				return false
			}
		}

		// The segment size limit is computed as a function of sender
		// congestion window and MSS. When sender congestion window is >
		// 1, this limit can be larger than MSS. Ensure that the
		// currently available send space is not greater than minimum of
		// this limit and MSS.
		if available > limit {
			available = limit
		}

		// If GSO is not in use then cap available to
		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
		// the host GSO logic will cap the segment to the correct size.
		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
			available = s.MaxPayloadSize
		}

		if seg.payloadSize() > available {
			// The payload does not fit in the space computed above, so
			// split the segment at available bytes.
			// NOTE(review): an earlier comment here described panicking
			// before the split when available is negative, but no such
			// check exists in this function; a negative value is left
			// for splitSeg to deal with.
			s.splitSeg(seg, available)
		}

		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
	}

	s.sendSegment(seg)

	// Update sndNxt if we actually sent new data (as opposed to
	// retransmitting some previously sent data).
	if s.SndNxt.LessThan(segEnd) {
		s.SndNxt = segEnd
	}

	return true
}
   888  
   889  // +checklocks:s.ep.mu
   890  func (s *sender) sendZeroWindowProbe() {
   891  	s.unackZeroWindowProbes++
   892  	// Send a zero window probe with sequence number pointing to
   893  	// the last acknowledged byte.
   894  	s.sendEmptySegment(header.TCPFlagAck, s.SndUna-1)
   895  	// Rearm the timer to continue probing.
   896  	s.resendTimer.enable(s.RTO)
   897  }
   898  
   899  func (s *sender) enableZeroWindowProbing() {
   900  	s.zeroWindowProbing = true
   901  	// We piggyback the probing on the retransmit timer with the
   902  	// current retranmission interval, as we may start probing while
   903  	// segment retransmissions.
   904  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
   905  		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
   906  	}
   907  	s.resendTimer.enable(s.RTO)
   908  }
   909  
   910  func (s *sender) disableZeroWindowProbing() {
   911  	s.zeroWindowProbing = false
   912  	s.unackZeroWindowProbes = 0
   913  	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
   914  	s.resendTimer.disable()
   915  }
   916  
   917  func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
   918  	if dataSent {
   919  		// We sent data, so we should stop the keepalive timer to ensure
   920  		// that no keepalives are sent while there is pending data.
   921  		s.ep.disableKeepaliveTimer()
   922  	}
   923  
   924  	// If the sender has advertized zero receive window and we have
   925  	// data to be sent out, start zero window probing to query the
   926  	// the remote for it's receive window size.
   927  	if s.writeNext != nil && s.SndWnd == 0 {
   928  		s.enableZeroWindowProbing()
   929  	}
   930  
   931  	// If we have no more pending data, start the keepalive timer.
   932  	if s.SndUna == s.SndNxt {
   933  		s.ep.resetKeepaliveTimer(false)
   934  	} else {
   935  		// Enable timers if we have pending data.
   936  		if shouldScheduleProbe && s.shouldSchedulePTO() {
   937  			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
   938  			s.schedulePTO()
   939  		} else if !s.resendTimer.enabled() {
   940  			s.probeTimer.disable()
   941  			if s.Outstanding > 0 {
   942  				// Enable the resend timer if it's not enabled yet and there is
   943  				// outstanding data.
   944  				s.resendTimer.enable(s.RTO)
   945  			}
   946  		}
   947  	}
   948  }
   949  
   950  // sendData sends new data segments. It is called when data becomes available or
   951  // when the send window opens up.
   952  // +checklocks:s.ep.mu
   953  func (s *sender) sendData() {
   954  	limit := s.MaxPayloadSize
   955  	if s.gso {
   956  		limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize)
   957  	}
   958  	end := s.SndUna.Add(s.SndWnd)
   959  
   960  	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
   961  	// "A TCP SHOULD set cwnd to no more than RW before beginning
   962  	// transmission if the TCP has not sent data in the interval exceeding
   963  	// the retrasmission timeout."
   964  	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
   965  		if s.SndCwnd > InitialCwnd {
   966  			s.SndCwnd = InitialCwnd
   967  		}
   968  	}
   969  
   970  	var dataSent bool
   971  	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
   972  		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
   973  		if cwndLimit < limit {
   974  			limit = cwndLimit
   975  		}
   976  		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   977  			// Move writeNext along so that we don't try and scan data that
   978  			// has already been SACKED.
   979  			s.updateWriteNext(seg.Next())
   980  			continue
   981  		}
   982  		if sent := s.maybeSendSegment(seg, limit, end); !sent {
   983  			break
   984  		}
   985  		dataSent = true
   986  		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
   987  		s.updateWriteNext(seg.Next())
   988  	}
   989  
   990  	s.postXmit(dataSent, true /* shouldScheduleProbe */)
   991  }
   992  
   993  func (s *sender) enterRecovery() {
   994  	// Initialize the variables used to detect spurious recovery after
   995  	// entering recovery.
   996  	//
   997  	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
   998  	s.spuriousRecovery = false
   999  	s.retransmitTS = 0
  1000  
  1001  	s.FastRecovery.Active = true
  1002  	// Save state to reflect we're now in fast recovery.
  1003  	//
  1004  	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
  1005  	// We inflate the cwnd by 3 to account for the 3 packets which triggered
  1006  	// the 3 duplicate ACKs and are now not in flight.
  1007  	s.SndCwnd = s.Ssthresh + 3
  1008  	s.SackedOut = 0
  1009  	s.DupAckCount = 0
  1010  	s.FastRecovery.First = s.SndUna
  1011  	s.FastRecovery.Last = s.SndNxt - 1
  1012  	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
  1013  	s.FastRecovery.HighRxt = s.SndUna
  1014  	s.FastRecovery.RescueRxt = s.SndUna
  1015  
  1016  	// Record retransmitTS if the sender is not in recovery as per:
  1017  	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1018  	s.recordRetransmitTS()
  1019  
  1020  	if s.ep.SACKPermitted {
  1021  		s.state = tcpip.SACKRecovery
  1022  		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
  1023  		// Set TLPRxtOut to false according to
  1024  		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
  1025  		if s.rc.tlpRxtOut {
  1026  			// The tail loss probe triggered recovery.
  1027  			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
  1028  		}
  1029  		s.rc.tlpRxtOut = false
  1030  		return
  1031  	}
  1032  	s.state = tcpip.FastRecovery
  1033  	s.ep.stack.Stats().TCP.FastRecovery.Increment()
  1034  }
  1035  
  1036  func (s *sender) leaveRecovery() {
  1037  	s.FastRecovery.Active = false
  1038  	s.FastRecovery.MaxCwnd = 0
  1039  	s.DupAckCount = 0
  1040  
  1041  	// Deflate cwnd. It had been artificially inflated when new dups arrived.
  1042  	s.SndCwnd = s.Ssthresh
  1043  	s.cc.PostRecovery()
  1044  }
  1045  
  1046  // isAssignedSequenceNumber relies on the fact that we only set flags once a
  1047  // sequencenumber is assigned and that is only done right before we send the
  1048  // segment. As a result any segment that has a non-zero flag has a valid
  1049  // sequence number assigned to it.
  1050  func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
  1051  	return seg.flags != 0
  1052  }
  1053  
// SetPipe implements the SetPipe() function described in RFC6675. Netstack
// maintains the congestion window in number of packets and not bytes, so
// SetPipe() here measures number of outstanding packets rather than actual
// outstanding bytes in the network.
//
// The result is stored in s.Outstanding. The call is a no-op unless SACK is
// permitted and fast recovery is active.
func (s *sender) SetPipe() {
	// If SACK isn't permitted or it is permitted but recovery is not active
	// then ignore pipe calculations.
	if !s.ep.SACKPermitted || !s.FastRecovery.Active {
		return
	}
	pipe := 0
	smss := seqnum.Size(s.ep.scoreboard.SMSS())
	// Walk the write list from the front, stopping at the first segment
	// that has no payload or has not been assigned a sequence number yet.
	for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
		// With GSO each segment can be much larger than SMSS. So check the segment
		// in SMSS sized ranges.
		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize()))
		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
			// Clamp the last range to the end of the segment.
			endSeq := startSeq.Add(smss)
			if segEnd.LessThan(endSeq) {
				endSeq = segEnd
			}
			sb := header.SACKBlock{Start: startSeq, End: endSeq}
			// SetPipe():
			//
			// After initializing pipe to zero, the following steps are
			// taken for each octet 'S1' in the sequence space between
			// HighACK and HighData that has not been SACKed:
			//
			// This break only leaves the inner (per-range) loop, and the
			// test uses the segment's starting sequence number rather
			// than startSeq.
			if !s1.sequenceNumber.LessThan(s.SndNxt) {
				break
			}
			if s.ep.scoreboard.IsSACKED(sb) {
				continue
			}

			// SetPipe():
			//
			//    (a) If IsLost(S1) returns false, Pipe is incremented by 1.
			//
			// NOTE: loss is evaluated for the whole range sb. We do not
			// try and test every byte in our write buffer as we maintain
			// our pipe in terms of outstanding packets and not bytes.
			if !s.ep.scoreboard.IsRangeLost(sb) {
				pipe++
			}
			// SetPipe():
			//    (b) If S1 <= HighRxt, Pipe is incremented by 1.
			//
			// The comparison uses the segment's starting sequence
			// number, so every range of the same segment gets the same
			// answer.
			if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) {
				pipe++
			}
		}
	}
	s.Outstanding = pipe
}
  1107  
  1108  // shouldEnterRecovery returns true if the sender should enter fast recovery
  1109  // based on dupAck count and sack scoreboard.
  1110  // See RFC 6675 section 5.
  1111  func (s *sender) shouldEnterRecovery() bool {
  1112  	return s.DupAckCount >= nDupAckThreshold ||
  1113  		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
  1114  }
  1115  
  1116  // detectLoss is called when an ack is received and returns whether a loss is
  1117  // detected. It manages the state related to duplicate acks and determines if
  1118  // a retransmit is needed according to the rules in RFC 6582 (NewReno).
  1119  func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
  1120  	// We're not in fast recovery yet.
  1121  
  1122  	// If RACK is enabled and there is no reordering we should honor the
  1123  	// three duplicate ACK rule to enter recovery.
  1124  	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
  1125  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1126  		if s.rc.Reord {
  1127  			return false
  1128  		}
  1129  	}
  1130  
  1131  	if !s.isDupAck(seg) {
  1132  		s.DupAckCount = 0
  1133  		return false
  1134  	}
  1135  
  1136  	s.DupAckCount++
  1137  
  1138  	// Do not enter fast recovery until we reach nDupAckThreshold or the
  1139  	// first unacknowledged byte is considered lost as per SACK scoreboard.
  1140  	if !s.shouldEnterRecovery() {
  1141  		// RFC 6675 Step 3.
  1142  		s.FastRecovery.HighRxt = s.SndUna - 1
  1143  		// Do run SetPipe() to calculate the outstanding segments.
  1144  		s.SetPipe()
  1145  		s.state = tcpip.Disorder
  1146  		return false
  1147  	}
  1148  
  1149  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
  1150  	//
  1151  	// We only do the check here, the incrementing of last to the highest
  1152  	// sequence number transmitted till now is done when enterRecovery
  1153  	// is invoked.
  1154  	//
  1155  	// Note that we only enter recovery when at least one more byte of data
  1156  	// beyond s.fr.last (the highest byte that was outstanding when fast
  1157  	// retransmit was last entered) is acked.
  1158  	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
  1159  		s.DupAckCount = 0
  1160  		return false
  1161  	}
  1162  	s.cc.HandleLossDetected()
  1163  	s.enterRecovery()
  1164  	return true
  1165  }
  1166  
  1167  // isDupAck determines if seg is a duplicate ack as defined in
  1168  // https://tools.ietf.org/html/rfc5681#section-2.
  1169  func (s *sender) isDupAck(seg *segment) bool {
  1170  	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
  1171  	// can leverage the SACK information to determine when an incoming ACK is a
  1172  	// "duplicate" (e.g., if the ACK contains previously unknown SACK
  1173  	// information).
  1174  	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
  1175  		return false
  1176  	}
  1177  
  1178  	// (a) The receiver of the ACK has outstanding data.
  1179  	return s.SndUna != s.SndNxt &&
  1180  		// (b) The incoming acknowledgment carries no data.
  1181  		seg.logicalLen() == 0 &&
  1182  		// (c) The SYN and FIN bits are both off.
  1183  		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
  1184  		// (d) the ACK number is equal to the greatest acknowledgment received on
  1185  		// the given connection (TCP.UNA from RFC793).
  1186  		seg.ackNumber == s.SndUna &&
  1187  		// (e) the advertised window in the incoming acknowledgment equals the
  1188  		// advertised window in the last incoming acknowledgment.
  1189  		s.SndWnd == seg.window
  1190  }
  1191  
  1192  // Iterate the writeList and update RACK for each segment which is newly acked
  1193  // either cumulatively or selectively. Loop through the segments which are
  1194  // sacked, and update the RACK related variables and check for reordering.
  1195  // Returns true when the DSACK block has been detected in the received ACK.
  1196  //
  1197  // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1198  // steps 2 and 3.
  1199  func (s *sender) walkSACK(rcvdSeg *segment) bool {
  1200  	s.rc.setDSACKSeen(false)
  1201  
  1202  	// Look for DSACK block.
  1203  	hasDSACK := false
  1204  	idx := 0
  1205  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1206  	if checkDSACK(rcvdSeg) {
  1207  		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
  1208  		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
  1209  		// numDSACK can be zero when DSACK is sent for subsegments.
  1210  		if numDSACK < 1 {
  1211  			numDSACK = 1
  1212  		}
  1213  		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
  1214  		s.rc.setDSACKSeen(true)
  1215  		idx = 1
  1216  		n--
  1217  		hasDSACK = true
  1218  	}
  1219  
  1220  	if n == 0 {
  1221  		return hasDSACK
  1222  	}
  1223  
  1224  	// Sort the SACK blocks. The first block is the most recent unacked
  1225  	// block. The following blocks can be in arbitrary order.
  1226  	sackBlocks := make([]header.SACKBlock, n)
  1227  	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
  1228  	sort.Slice(sackBlocks, func(i, j int) bool {
  1229  		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
  1230  	})
  1231  
  1232  	seg := s.writeList.Front()
  1233  	for _, sb := range sackBlocks {
  1234  		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
  1235  			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
  1236  				s.rc.update(seg, rcvdSeg)
  1237  				s.rc.detectReorder(seg)
  1238  				seg.acked = true
  1239  				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
  1240  			}
  1241  			seg = seg.Next()
  1242  		}
  1243  	}
  1244  	return hasDSACK
  1245  }
  1246  
  1247  // checkDSACK checks if a DSACK is reported.
  1248  func checkDSACK(rcvdSeg *segment) bool {
  1249  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1250  	if n == 0 {
  1251  		return false
  1252  	}
  1253  
  1254  	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
  1255  	// Check if SACK block is invalid.
  1256  	if sb.End.LessThan(sb.Start) {
  1257  		return false
  1258  	}
  1259  
  1260  	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
  1261  	// at most one SACK block. DSACK is detected in the below two cases:
  1262  	//	* If the SACK sequence space is less than this cumulative ACK, it is
  1263  	//		an indication that the segment identified by the SACK block has
  1264  	//		been received more than once by the receiver.
  1265  	//	* If the sequence space in the first SACK block is greater than the
  1266  	//		cumulative ACK, then the sender next compares the sequence space
  1267  	//		in the first SACK block with the sequence space in the second SACK
  1268  	//		block, if there is one. This comparison can determine if the first
  1269  	//		SACK block is reporting duplicate data that lies above the
  1270  	//		cumulative ACK.
  1271  	if sb.Start.LessThan(rcvdSeg.ackNumber) {
  1272  		return true
  1273  	}
  1274  
  1275  	if n > 1 {
  1276  		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
  1277  		if sb1.End.LessThan(sb1.Start) {
  1278  			return false
  1279  		}
  1280  
  1281  		// If the first SACK block is fully covered by second SACK
  1282  		// block, then the first block is a DSACK block.
  1283  		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
  1284  			return true
  1285  		}
  1286  	}
  1287  
  1288  	return false
  1289  }
  1290  
  1291  func (s *sender) recordRetransmitTS() {
  1292  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
  1293  	//
  1294  	// The Eifel detection algorithm is used, only upon initiation of loss
  1295  	// recovery, i.e., when either the timeout-based retransmit or the fast
  1296  	// retransmit is sent. The Eifel detection algorithm MUST NOT be
  1297  	// reinitiated after loss recovery has already started. In particular,
  1298  	// it must not be reinitiated upon subsequent timeouts for the same
  1299  	// segment, and not upon retransmitting segments other than the oldest
  1300  	// outstanding segment, e.g., during selective loss recovery.
  1301  	if s.inRecovery() {
  1302  		return
  1303  	}
  1304  
  1305  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1306  	//
  1307  	// Set a "RetransmitTS" variable to the value of the Timestamp Value
  1308  	// field of the Timestamps option included in the retransmit sent when
  1309  	// loss recovery is initiated. A TCP sender must ensure that
  1310  	// RetransmitTS does not get overwritten as loss recovery progresses,
  1311  	// e.g., in case of a second timeout and subsequent second retransmit of
  1312  	// the same octet.
  1313  	s.retransmitTS = s.ep.tsValNow()
  1314  }
  1315  
  1316  func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
  1317  	// Return if the sender has already detected spurious recovery.
  1318  	if s.spuriousRecovery {
  1319  		return
  1320  	}
  1321  
  1322  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
  1323  	//
  1324  	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
  1325  	// Timestamps option is smaller than the value of RetransmitTS, then
  1326  	// proceed to next step, else return.
  1327  	if tsEchoReply >= s.retransmitTS {
  1328  		return
  1329  	}
  1330  
  1331  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1332  	//
  1333  	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
  1334  	if hasDSACK {
  1335  		return
  1336  	}
  1337  
  1338  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1339  	//
  1340  	// If during the lifetime of the TCP connection the TCP sender has
  1341  	// previously received an ACK with a DSACK option, or the acceptable ACK
  1342  	// does not acknowledge all outstanding data, then proceed to next step,
  1343  	// else return.
  1344  	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
  1345  	if numDSACK == 0 && s.SndUna == s.SndNxt {
  1346  		return
  1347  	}
  1348  
  1349  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
  1350  	//
  1351  	// If the loss recovery has been initiated with a timeout-based
  1352  	// retransmit, then set
  1353  	//    SpuriousRecovery <- SPUR_TO (equal 1),
  1354  	// else set
  1355  	//    SpuriousRecovery <- dupacks+1
  1356  	// Set the spurious recovery variable to true as we do not differentiate
  1357  	// between fast, SACK or RTO recovery.
  1358  	s.spuriousRecovery = true
  1359  	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()
  1360  
  1361  	// RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and
  1362  	// timeout). Increment the metric for RTO only as we want to track the
  1363  	// number of timeout recoveries.
  1364  	if s.state == tcpip.RTORecovery {
  1365  		s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment()
  1366  	}
  1367  }
  1368  
  1369  // Check if the sender is in RTORecovery, FastRecovery or SACKRecovery state.
  1370  func (s *sender) inRecovery() bool {
  1371  	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
  1372  		return true
  1373  	}
  1374  	return false
  1375  }
  1376  
  1377  // handleRcvdSegment is called when a segment is received; it is responsible for
  1378  // updating the send-related state.
  1379  // +checklocks:s.ep.mu
  1380  // +checklocksalias:s.rc.snd.ep.mu=s.ep.mu
  1381  func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
  1382  	// Check if we can extract an RTT measurement from this ack.
  1383  	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
  1384  		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
  1385  		s.RTTMeasureSeqNum = s.SndNxt
  1386  	}
  1387  
  1388  	// Update Timestamp if required. See RFC7323, section-4.3.
  1389  	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
  1390  		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
  1391  	}
  1392  
  1393  	// Insert SACKBlock information into our scoreboard.
  1394  	hasDSACK := false
  1395  	if s.ep.SACKPermitted {
  1396  		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
  1397  			// Only insert the SACK block if the following holds
  1398  			// true:
  1399  			//  * SACK block acks data after the ack number in the
  1400  			//    current segment.
  1401  			//  * SACK block represents a sequence
  1402  			//    between sndUna and sndNxt (i.e. data that is
  1403  			//    currently unacked and in-flight).
  1404  			//  * SACK block that has not been SACKed already.
  1405  			//
  1406  			// NOTE: This check specifically excludes DSACK blocks
  1407  			// which have start/end before sndUna and are used to
  1408  			// indicate spurious retransmissions.
  1409  			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
  1410  				s.ep.scoreboard.Insert(sb)
  1411  				rcvdSeg.hasNewSACKInfo = true
  1412  			}
  1413  		}
  1414  
  1415  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
  1416  		// section-7.2
  1417  		//	* Step 2: Update RACK stats.
  1418  		//		If the ACK is not ignored as invalid, update the RACK.rtt
  1419  		//		to be the RTT sample calculated using this ACK, and
  1420  		//		continue.  If this ACK or SACK was for the most recently
  1421  		//		sent packet, then record the RACK.xmit_ts timestamp and
  1422  		//		RACK.end_seq sequence implied by this ACK.
  1423  		//	* Step 3: Detect packet reordering.
  1424  		//		If the ACK selectively or cumulatively acknowledges an
  1425  		//		unacknowledged and also never retransmitted sequence below
  1426  		//		RACK.fack, then the corresponding packet has been
  1427  		//		reordered and RACK.reord is set to TRUE.
  1428  		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1429  			hasDSACK = s.walkSACK(rcvdSeg)
  1430  		}
  1431  		s.SetPipe()
  1432  	}
  1433  
  1434  	ack := rcvdSeg.ackNumber
  1435  	fastRetransmit := false
  1436  	// Do not leave fast recovery, if the ACK is out of range.
  1437  	if s.FastRecovery.Active {
  1438  		// Leave fast recovery if it acknowledges all the data covered by
  1439  		// this fast recovery session.
  1440  		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
  1441  			s.leaveRecovery()
  1442  		}
  1443  	} else {
  1444  		// Detect loss by counting the duplicates and enter recovery.
  1445  		fastRetransmit = s.detectLoss(rcvdSeg)
  1446  	}
  1447  
  1448  	// See if TLP based recovery was successful.
  1449  	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1450  		s.detectTLPRecovery(ack, rcvdSeg)
  1451  	}
  1452  
  1453  	// Stash away the current window size.
  1454  	s.SndWnd = rcvdSeg.window
  1455  
  1456  	// Disable zero window probing if remote advertizes a non-zero receive
  1457  	// window. This can be with an ACK to the zero window probe (where the
  1458  	// acknumber refers to the already acknowledged byte) OR to any previously
  1459  	// unacknowledged segment.
  1460  	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
  1461  		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
  1462  		s.disableZeroWindowProbing()
  1463  	}
  1464  
  1465  	// On receiving the ACK for the zero window probe, account for it and
  1466  	// skip trying to send any segment as we are still probing for
  1467  	// receive window to become non-zero.
  1468  	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
  1469  		s.unackZeroWindowProbes--
  1470  		return
  1471  	}
  1472  
  1473  	// Ignore ack if it doesn't acknowledge any new data.
  1474  	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
  1475  		s.DupAckCount = 0
  1476  
  1477  		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
  1478  		// Specifically we should only update the RTO using TSEcr if the
  1479  		// following condition holds:
  1480  		//
  1481  		//    A TSecr value received in a segment is used to update the
  1482  		//    averaged RTT measurement only if the segment acknowledges
  1483  		//    some new data, i.e., only if it advances the left edge of
  1484  		//    the send window.
  1485  		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
  1486  			s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr))
  1487  		}
  1488  
  1489  		if s.shouldSchedulePTO() {
  1490  			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
  1491  			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
  1492  			s.schedulePTO()
  1493  		} else {
  1494  			// When an ack is received we must rearm the timer.
  1495  			// RFC 6298 5.3
  1496  			s.probeTimer.disable()
  1497  			s.resendTimer.enable(s.RTO)
  1498  		}
  1499  
  1500  		// Remove all acknowledged data from the write list.
  1501  		acked := s.SndUna.Size(ack)
  1502  		s.SndUna = ack
  1503  		ackLeft := acked
  1504  		originalOutstanding := s.Outstanding
  1505  		for ackLeft > 0 {
  1506  			// We use logicalLen here because we can have FIN
  1507  			// segments (which are always at the end of list) that
  1508  			// have no data, but do consume a sequence number.
  1509  			seg := s.writeList.Front()
  1510  			datalen := seg.logicalLen()
  1511  
  1512  			if datalen > ackLeft {
  1513  				prevCount := s.pCount(seg, s.MaxPayloadSize)
  1514  				seg.TrimFront(ackLeft)
  1515  				seg.sequenceNumber.UpdateForward(ackLeft)
  1516  				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
  1517  				break
  1518  			}
  1519  
  1520  			if s.writeNext == seg {
  1521  				s.updateWriteNext(seg.Next())
  1522  			}
  1523  
  1524  			// Update the RACK fields if SACK is enabled.
  1525  			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1526  				s.rc.update(seg, rcvdSeg)
  1527  				s.rc.detectReorder(seg)
  1528  			}
  1529  
  1530  			s.writeList.Remove(seg)
  1531  
  1532  			// If SACK is enabled then only reduce outstanding if
  1533  			// the segment was not previously SACKED as these have
  1534  			// already been accounted for in SetPipe().
  1535  			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
  1536  				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
  1537  			} else {
  1538  				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
  1539  			}
  1540  			seg.DecRef()
  1541  			ackLeft -= datalen
  1542  		}
  1543  
  1544  		// Clear SACK information for all acked data.
  1545  		s.ep.scoreboard.Delete(s.SndUna)
  1546  
  1547  		// Detect if the sender entered recovery spuriously.
  1548  		if s.inRecovery() {
  1549  			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
  1550  		}
  1551  
  1552  		// If we are not in fast recovery then update the congestion
  1553  		// window based on the number of acknowledged packets.
  1554  		if !s.FastRecovery.Active {
  1555  			s.cc.Update(originalOutstanding - s.Outstanding)
  1556  			if s.FastRecovery.Last.LessThan(s.SndUna) {
  1557  				s.state = tcpip.Open
  1558  				// Update RACK when we are exiting fast or RTO
  1559  				// recovery as described in the RFC
  1560  				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
  1561  				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1562  					s.rc.exitRecovery()
  1563  				}
  1564  				s.reorderTimer.disable()
  1565  			}
  1566  		}
  1567  
  1568  		// Update the send buffer usage and notify potential waiters.
  1569  		s.ep.updateSndBufferUsage(int(acked))
  1570  
  1571  		// It is possible for s.Outstanding to drop below zero if we get
  1572  		// a retransmit timeout, reset Outstanding to zero, but later
  1573  		// get an ack that covers previously sent data.
  1574  		if s.Outstanding < 0 {
  1575  			s.Outstanding = 0
  1576  		}
  1577  
  1578  		s.SetPipe()
  1579  
  1580  		// If all outstanding data was acknowledged then disable the timer.
  1581  		// RFC 6298 Rule 5.3
  1582  		if s.SndUna == s.SndNxt {
  1583  			s.Outstanding = 0
  1584  			// Reset firstRetransmittedSegXmitTime to the zero value.
  1585  			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
  1586  			s.resendTimer.disable()
  1587  			s.probeTimer.disable()
  1588  		}
  1589  	}
  1590  
  1591  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1592  		// Update RACK reorder window.
  1593  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1594  		//	* Upon receiving an ACK:
  1595  		//	* Step 4: Update RACK reordering window
  1596  		s.rc.updateRACKReorderWindow()
  1597  
  1598  		// After the reorder window is calculated, detect any loss by checking
  1599  		// if the time elapsed after the segments are sent is greater than the
  1600  		// reorder window.
  1601  		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
  1602  			// If any segment is marked as lost by
  1603  			// RACK, enter recovery and retransmit
  1604  			// the lost segments.
  1605  			s.cc.HandleLossDetected()
  1606  			s.enterRecovery()
  1607  			fastRetransmit = true
  1608  		}
  1609  
  1610  		if s.FastRecovery.Active {
  1611  			s.rc.DoRecovery(nil, fastRetransmit)
  1612  		}
  1613  	}
  1614  
  1615  	// Now that we've popped all acknowledged data from the retransmit
  1616  	// queue, retransmit if needed.
  1617  	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
  1618  		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
  1619  		// When SACK is enabled data sending is governed by steps in
  1620  		// RFC 6675 Section 5 recovery steps  A-C.
  1621  		// See: https://tools.ietf.org/html/rfc6675#section-5.
  1622  		if s.ep.SACKPermitted {
  1623  			return
  1624  		}
  1625  	}
  1626  
  1627  	// Send more data now that some of the pending data has been ack'd, or
  1628  	// that the window opened up, or the congestion window was inflated due
  1629  	// to a duplicate ack during fast recovery. This will also re-enable
  1630  	// the retransmit timer if needed.
  1631  	s.sendData()
  1632  }
  1633  
  1634  // sendSegment sends the specified segment.
  1635  // +checklocks:s.ep.mu
  1636  func (s *sender) sendSegment(seg *segment) tcpip.Error {
  1637  	if seg.xmitCount > 0 {
  1638  		s.ep.stack.Stats().TCP.Retransmits.Increment()
  1639  		s.ep.stats.SendErrors.Retransmits.Increment()
  1640  		if s.SndCwnd < s.Ssthresh {
  1641  			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
  1642  		}
  1643  	}
  1644  	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
  1645  	seg.xmitCount++
  1646  	seg.lost = false
  1647  
  1648  	err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber)
  1649  
  1650  	// Every time a packet containing data is sent (including a
  1651  	// retransmission), if SACK is enabled and we are retransmitting data
  1652  	// then use the conservative timer described in RFC6675 Section 6.0,
  1653  	// otherwise follow the standard time described in RFC6298 Section 5.1.
  1654  	if err != nil && seg.payloadSize() != 0 {
  1655  		if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted {
  1656  			s.resendTimer.enable(s.RTO)
  1657  		} else {
  1658  			if !s.resendTimer.enabled() {
  1659  				s.resendTimer.enable(s.RTO)
  1660  			}
  1661  		}
  1662  	}
  1663  
  1664  	return err
  1665  }
  1666  
  1667  // sendSegmentFromPacketBuffer sends a new segment containing the given payload,
  1668  // flags and sequence number.
  1669  // +checklocks:s.ep.mu
  1670  // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
  1671  func (s *sender) sendSegmentFromPacketBuffer(pkt stack.PacketBufferPtr, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1672  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1673  	if seq == s.RTTMeasureSeqNum {
  1674  		s.RTTMeasureTime = s.LastSendTime
  1675  	}
  1676  
  1677  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1678  
  1679  	// Remember the max sent ack.
  1680  	s.MaxSentAck = rcvNxt
  1681  
  1682  	// We need to clone the packet because sendRaw takes ownership of pkt,
  1683  	// and pkt could be reprocessed later on (i.e retrasmission).
  1684  	pkt = pkt.Clone()
  1685  	defer pkt.DecRef()
  1686  
  1687  	return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd)
  1688  }
  1689  
  1690  // sendEmptySegment sends a new empty segment, flags and sequence number.
  1691  // +checklocks:s.ep.mu
  1692  // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
  1693  func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1694  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1695  	if seq == s.RTTMeasureSeqNum {
  1696  		s.RTTMeasureTime = s.LastSendTime
  1697  	}
  1698  
  1699  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1700  
  1701  	// Remember the max sent ack.
  1702  	s.MaxSentAck = rcvNxt
  1703  
  1704  	return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd)
  1705  }
  1706  
  1707  // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
  1708  // currently.
  1709  // +checklocks:s.ep.mu
  1710  func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
  1711  	// Data packets are unlikely to be part of an ACK loop. So always send
  1712  	// an ACK for a packet w/ data.
  1713  	if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
  1714  		s.sendAck()
  1715  	}
  1716  }
  1717  
  1718  func (s *sender) updateWriteNext(seg *segment) {
  1719  	if s.writeNext != nil {
  1720  		s.writeNext.DecRef()
  1721  	}
  1722  	if seg != nil {
  1723  		seg.IncRef()
  1724  	}
  1725  	s.writeNext = seg
  1726  }