gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/snd.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"sort"
    21  	"time"
    22  
    23  	"gvisor.dev/gvisor/pkg/buffer"
    24  	"gvisor.dev/gvisor/pkg/sync"
    25  	"gvisor.dev/gvisor/pkg/tcpip"
    26  	"gvisor.dev/gvisor/pkg/tcpip/header"
    27  	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
    28  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    29  )
    30  
    31  const (
    32  	// MinRTO is the minimum allowed value for the retransmit timeout.
    33  	MinRTO = 200 * time.Millisecond
    34  
    35  	// MaxRTO is the maximum allowed value for the retransmit timeout.
    36  	MaxRTO = 120 * time.Second
    37  
    38  	// MinSRTT is the minimum allowed value for smoothed RTT.
    39  	MinSRTT = 1 * time.Millisecond
    40  
    41  	// InitialCwnd is the initial congestion window.
    42  	InitialCwnd = 10
    43  
     44  	// nDupAckThreshold is the number of duplicate ACKs required
    45  	// before fast-retransmit is entered.
    46  	nDupAckThreshold = 3
    47  
     48  	// MaxRetries is the maximum number of probe retries the sender does
    49  	// before timing out the connection.
    50  	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
    51  	MaxRetries = 15
    52  
     53  	// InitialSsthresh is the initial slow start threshold, set to the
     54  	// maximum int value, which depends on the platform.
    55  	InitialSsthresh = math.MaxInt
    56  
    57  	// unknownRTT is used to indicate to congestion control algorithms that we
    58  	// were unable to measure the round-trip time when processing ACKs.
    59  	// Algorithms (such as HyStart) that use the round-trip time should ignore
    60  	// such Updates.
    61  	unknownRTT = time.Duration(-1)
    62  )
    63  
    64  // congestionControl is an interface that must be implemented by any supported
    65  // congestion control algorithm.
    66  type congestionControl interface {
     67  	// HandleLossDetected is invoked when loss is detected by RACK or when
     68  	// sender.dupAckCount >= nDupAckThreshold, just before entering fast
     69  	// retransmit.
    70  	HandleLossDetected()
    71  
    72  	// HandleRTOExpired is invoked when the retransmit timer expires.
    73  	HandleRTOExpired()
    74  
     75  	// Update is invoked when processing inbound ACKs. It's passed the
     76  	// number of packets that were acked by the most recent cumulative
     77  	// acknowledgement. rtt is the round-trip time, or is set to unknownRTT
     78  	// (above) to indicate the time is unknown.
    79  	Update(packetsAcked int, rtt time.Duration)
    80  
    81  	// PostRecovery is invoked when the sender is exiting a fast retransmit/
    82  	// recovery phase. This provides congestion control algorithms a way
    83  	// to adjust their state when exiting recovery.
    84  	PostRecovery()
    85  }
    86  
    87  // lossRecovery is an interface that must be implemented by any supported
    88  // loss recovery algorithm.
    89  type lossRecovery interface {
     90  	// DoRecovery is invoked when loss is detected and segments need
     91  	// to be retransmitted. The segment carrying the cumulative or
     92  	// selective ACK is passed along with a flag indicating whether the
     93  	// connection entered fast retransmit with this ACK, in which case the
     94  	// first unacknowledged segment must be retransmitted.
    95  	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
    96  }
    97  
    98  // sender holds the state necessary to send TCP segments.
    99  //
   100  // +stateify savable
   101  type sender struct {
   102  	stack.TCPSenderState
   103  	ep *Endpoint
   104  
   105  	// lr is the loss recovery algorithm used by the sender.
   106  	lr lossRecovery
   107  
   108  	// firstRetransmittedSegXmitTime is the original transmit time of
   109  	// the first segment that was retransmitted due to RTO expiration.
   110  	firstRetransmittedSegXmitTime tcpip.MonotonicTime
   111  
   112  	// zeroWindowProbing is set if the sender is currently probing
   113  	// for zero receive window.
   114  	zeroWindowProbing bool `state:"nosave"`
   115  
   116  	// unackZeroWindowProbes is the number of unacknowledged zero
   117  	// window probes.
   118  	unackZeroWindowProbes uint32 `state:"nosave"`
   119  
   120  	// writeNext is the next segment to write that hasn't already been
   121  	// written, i.e. the first payload starting at SND.NXT.
   122  	writeNext *segment
   123  
   124  	// writeList holds all writable data: both unsent data and
   125  	// sent-but-unacknowledged data. Alternatively: it holds all bytes
   126  	// starting from SND.UNA.
   127  	writeList segmentList
   128  
   129  	// resendTimer is used for RTOs.
   130  	resendTimer timer `state:"nosave"`
   131  
   132  	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
   133  	// round-trip time", and "round-trip time variation", as defined in
   134  	// section 2 of RFC 6298.
   135  	rtt rtt
   136  
   137  	// minRTO is the minimum permitted value for sender.rto.
   138  	minRTO time.Duration
   139  
   140  	// maxRTO is the maximum permitted value for sender.rto.
   141  	maxRTO time.Duration
   142  
   143  	// maxRetries is the maximum permitted retransmissions.
   144  	maxRetries uint32
   145  
   146  	// gso is set if generic segmentation offload is enabled.
   147  	gso bool
   148  
   149  	// state is the current state of congestion control for this endpoint.
   150  	state tcpip.CongestionControlState
   151  
   152  	// cc is the congestion control algorithm in use for this sender.
   153  	cc congestionControl
   154  
   155  	// rc has the fields needed for implementing RACK loss detection
   156  	// algorithm.
   157  	rc rackControl
   158  
   159  	// reorderTimer is the timer used to retransmit the segments after RACK
   160  	// detects them as lost.
   161  	reorderTimer timer `state:"nosave"`
   162  
   163  	// probeTimer is used to schedule PTO for RACK TLP algorithm.
   164  	probeTimer timer `state:"nosave"`
   165  
   166  	// spuriousRecovery indicates whether the sender entered recovery
   167  	// spuriously as described in RFC3522 Section 3.2.
   168  	spuriousRecovery bool
   169  
   170  	// retransmitTS is the timestamp at which the sender sends retransmitted
   171  	// segment after entering an RTO for the first time as described in
   172  	// RFC3522 Section 3.2.
   173  	retransmitTS uint32
   174  
    175  	// startCork indicates whether the sender has started corking segments.
   176  	startCork bool
   177  
   178  	// corkTimer is used to drain the segments which are held when TCP_CORK
   179  	// option is enabled.
   180  	corkTimer timer `state:"nosave"`
   181  }
   182  
   183  // rtt is a synchronization wrapper used to appease stateify. See the comment
   184  // in sender, where it is used.
   185  //
   186  // +stateify savable
   187  type rtt struct {
   188  	sync.Mutex `state:"nosave"`
   189  
   190  	stack.TCPRTTState
   191  }
   192  
   193  // +checklocks:ep.mu
   194  func newSender(ep *Endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
   195  	// The sender MUST reduce the TCP data length to account for any IP or
   196  	// TCP options that it is including in the packets that it sends.
   197  	// See: https://tools.ietf.org/html/rfc6691#section-2
   198  	maxPayloadSize := int(mss) - ep.maxOptionSize()
   199  
   200  	s := &sender{
   201  		ep: ep,
   202  		TCPSenderState: stack.TCPSenderState{
   203  			SndWnd:           sndWnd,
   204  			SndUna:           iss + 1,
   205  			SndNxt:           iss + 1,
   206  			RTTMeasureSeqNum: iss + 1,
   207  			LastSendTime:     ep.stack.Clock().NowMonotonic(),
   208  			MaxPayloadSize:   maxPayloadSize,
   209  			MaxSentAck:       irs + 1,
   210  			FastRecovery: stack.TCPFastRecoveryState{
   211  				// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
   212  				Last:      iss,
   213  				HighRxt:   iss,
   214  				RescueRxt: iss,
   215  			},
   216  			RTO: 1 * time.Second,
   217  		},
   218  		gso: ep.gso.Type != stack.GSONone,
   219  	}
   220  
   221  	if s.gso {
   222  		s.ep.gso.MSS = uint16(maxPayloadSize)
   223  	}
   224  
   225  	s.cc = s.initCongestionControl(ep.cc)
   226  	s.lr = s.initLossRecovery()
   227  	s.rc.init(s, iss)
   228  
   229  	// A negative sndWndScale means that no scaling is in use, otherwise we
   230  	// store the scaling value.
   231  	if sndWndScale > 0 {
   232  		s.SndWndScale = uint8(sndWndScale)
   233  	}
   234  
   235  	s.resendTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.retransmitTimerExpired))
   236  	s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired))
   237  	s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired))
   238  	s.corkTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.corkTimerExpired))
   239  
   240  	s.ep.AssertLockHeld(ep)
   241  	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
   242  	// Initialize SACK Scoreboard after updating max payload size as we use
   243  	// the maxPayloadSize as the smss when determining if a segment is lost
   244  	// etc.
   245  	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)
   246  
   247  	// Get Stack wide config.
   248  	var minRTO tcpip.TCPMinRTOOption
   249  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
   250  		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
   251  	}
   252  	s.minRTO = time.Duration(minRTO)
   253  
   254  	var maxRTO tcpip.TCPMaxRTOOption
   255  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
   256  		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
   257  	}
   258  	s.maxRTO = time.Duration(maxRTO)
   259  
   260  	var maxRetries tcpip.TCPMaxRetriesOption
   261  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
   262  		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
   263  	}
   264  	s.maxRetries = uint32(maxRetries)
   265  
   266  	return s
   267  }
   268  
   269  // initCongestionControl initializes the specified congestion control module and
   270  // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to
   271  // their initial values.
   272  func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
   273  	s.SndCwnd = InitialCwnd
   274  	s.Ssthresh = InitialSsthresh
   275  
   276  	switch congestionControlName {
   277  	case ccCubic:
   278  		return newCubicCC(s)
   279  	case ccReno:
   280  		fallthrough
   281  	default:
   282  		return newRenoCC(s)
   283  	}
   284  }
   285  
   286  // initLossRecovery initiates the loss recovery algorithm for the sender.
   287  func (s *sender) initLossRecovery() lossRecovery {
   288  	if s.ep.SACKPermitted {
   289  		return newSACKRecovery(s)
   290  	}
   291  	return newRenoRecovery(s)
   292  }
   293  
   294  // updateMaxPayloadSize updates the maximum payload size based on the given
   295  // MTU. If this is in response to "packet too big" control packets (indicated
   296  // by the count argument), it also reduces the number of outstanding packets and
   297  // attempts to retransmit the first packet above the MTU size.
   298  // +checklocks:s.ep.mu
   299  func (s *sender) updateMaxPayloadSize(mtu, count int) {
   300  	m := mtu - header.TCPMinimumSize
   301  
   302  	m -= s.ep.maxOptionSize()
   303  
   304  	// We don't adjust up for now.
   305  	if m >= s.MaxPayloadSize {
   306  		return
   307  	}
   308  
   309  	// Make sure we can transmit at least one byte.
   310  	if m <= 0 {
   311  		m = 1
   312  	}
   313  
   314  	oldMSS := s.MaxPayloadSize
   315  	s.MaxPayloadSize = m
   316  	if s.gso {
   317  		s.ep.gso.MSS = uint16(m)
   318  	}
   319  
   320  	if count == 0 {
    321  		// updateMaxPayloadSize is also called when the sender is created,
    322  		// and there is no data to send in such cases. Return immediately.
   323  		return
   324  	}
   325  
   326  	// Update the scoreboard's smss to reflect the new lowered
   327  	// maxPayloadSize.
   328  	s.ep.scoreboard.smss = uint16(m)
   329  
   330  	s.Outstanding -= count
   331  	if s.Outstanding < 0 {
   332  		s.Outstanding = 0
   333  	}
   334  
   335  	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
   336  	// if it is already before such a packet.
   337  	nextSeg := s.writeNext
   338  	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
   339  		if seg == s.writeNext {
   340  			// We got to writeNext before we could find a segment
   341  			// exceeding the MTU.
   342  			break
   343  		}
   344  
   345  		if seg.payloadSize() > m {
   346  			// xmitCount is used for loss detection, but
    347  			// retransmission doesn't indicate congestion here;
   348  			// it's just PMTUD.
   349  			seg.xmitCount = 0
   350  			if nextSeg == s.writeNext {
   351  				// We found a segment exceeding the MTU. Rewind
   352  				// writeNext and try to retransmit it.
   353  				nextSeg = seg
   354  			}
   355  		}
   356  
   357  		if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   358  			// Update sackedOut for new maximum payload size.
   359  			s.SackedOut -= s.pCount(seg, oldMSS)
   360  			s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
   361  		}
   362  	}
   363  
   364  	// Since we likely reduced the number of outstanding packets, we may be
   365  	// ready to send some more.
   366  	s.updateWriteNext(nextSeg)
   367  	s.sendData()
   368  }
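
         // Worked example (hypothetical numbers, not taken from the code above):
         // if the route MTU drops from 1500 to 1280 and the connection carries
         // 12 bytes of TCP options (timestamps), the new payload limit is
         // m = 1280 - 20 (TCPMinimumSize) - 12 = 1248 bytes. Outstanding is
         // reduced by the reported count, writeNext is rewound to the first
         // queued segment larger than 1248 bytes, and sendData retransmits it
         // using the smaller size.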
   369  
   370  // sendAck sends an ACK segment.
   371  // +checklocks:s.ep.mu
   372  func (s *sender) sendAck() {
   373  	s.sendEmptySegment(header.TCPFlagAck, s.SndNxt)
   374  }
   375  
    376  // updateRTO updates the retransmit timeout when a new round-trip time is
   377  // available. This is done in accordance with section 2 of RFC 6298.
   378  func (s *sender) updateRTO(rtt time.Duration) {
   379  	s.rtt.Lock()
   380  	if !s.rtt.TCPRTTState.SRTTInited {
   381  		s.rtt.TCPRTTState.RTTVar = rtt / 2
   382  		s.rtt.TCPRTTState.SRTT = rtt
   383  		s.rtt.TCPRTTState.SRTTInited = true
   384  	} else {
   385  		diff := s.rtt.TCPRTTState.SRTT - rtt
   386  		if diff < 0 {
   387  			diff = -diff
   388  		}
   389  		// Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when
   390  		// no timestamps are available.
   391  		if !s.ep.SendTSOk {
   392  			s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4
   393  			s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8
   394  		} else {
   395  			// When we are taking RTT measurements of every ACK then
   396  			// we need to use a modified method as specified in
   397  			// https://tools.ietf.org/html/rfc7323#appendix-G
   398  			if s.Outstanding == 0 {
   399  				s.rtt.Unlock()
   400  				return
   401  			}
   402  			// Netstack measures congestion window/inflight all in
   403  			// terms of packets and not bytes. This is similar to
    404  			// how Linux also handles cwnd and inflight. In practice
   405  			// this approximation works as expected.
   406  			expectedSamples := math.Ceil(float64(s.Outstanding) / 2)
   407  
   408  			// alpha & beta values are the original values as recommended in
   409  			// https://tools.ietf.org/html/rfc6298#section-2.3.
   410  			const alpha = 0.125
   411  			const beta = 0.25
   412  
   413  			alphaPrime := alpha / expectedSamples
   414  			betaPrime := beta / expectedSamples
   415  			rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds()
   416  			srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds()
   417  			s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second))
   418  			s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second))
   419  		}
   420  	}
   421  
   422  	if s.rtt.TCPRTTState.SRTT < MinSRTT {
   423  		s.rtt.TCPRTTState.SRTT = MinSRTT
   424  	}
   425  
   426  	s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar
   427  	s.rtt.Unlock()
   428  	if s.RTO < s.minRTO {
   429  		s.RTO = s.minRTO
   430  	}
   431  	if s.RTO > s.maxRTO {
   432  		s.RTO = s.maxRTO
   433  	}
   434  }
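
         // Worked example of the timestamp (RFC 7323 Appendix G) branch above,
         // with hypothetical values: SRTT = 100ms, RTTVar = 20ms, a new sample
         // rtt = 120ms and Outstanding = 8. Then expectedSamples = ceil(8/2) = 4,
         // alphaPrime = 0.125/4 = 0.03125 and betaPrime = 0.25/4 = 0.0625, giving
         // RTTVar' = 0.9375*20ms + 0.0625*20ms = 20ms and
         // SRTT'   = 0.96875*100ms + 0.03125*120ms = 100.625ms.
         // RTO = SRTT' + 4*RTTVar' = 180.625ms, which is then clamped to the
         // configured [minRTO, maxRTO] range (200ms..120s by default).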
   435  
   436  // resendSegment resends the first unacknowledged segment.
   437  // +checklocks:s.ep.mu
   438  func (s *sender) resendSegment() {
   439  	// Don't use any segments we already sent to measure RTT as they may
   440  	// have been affected by packets being lost.
   441  	s.RTTMeasureSeqNum = s.SndNxt
   442  
   443  	// Resend the segment.
   444  	if seg := s.writeList.Front(); seg != nil {
   445  		if seg.payloadSize() > s.MaxPayloadSize {
   446  			s.splitSeg(seg, s.MaxPayloadSize)
   447  		}
   448  
   449  		// See: RFC 6675 section 5 Step 4.3
   450  		//
    451  		// To prevent retransmitting this segment again, set both HighRxt and
    452  		// RescueRxt to the highest sequence number in the retransmitted segment.
   453  		s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1
   454  		s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1
   455  		s.sendSegment(seg)
   456  		s.ep.stack.Stats().TCP.FastRetransmit.Increment()
   457  		s.ep.stats.SendErrors.FastRetransmit.Increment()
   458  
   459  		// Run SetPipe() as per RFC 6675 section 5 Step 4.4
   460  		s.SetPipe()
   461  	}
   462  }
   463  
    464  // retransmitTimerExpired is called when the retransmit timer expires;
    465  // unacknowledged segments are assumed lost and thus need to be resent.
    466  // It returns nil if the connection is still usable, or a non-nil error if
    467  // the connection is deemed lost.
   468  // +checklocks:s.ep.mu
   469  func (s *sender) retransmitTimerExpired() tcpip.Error {
   470  	// Check if the timer actually expired or if it's a spurious wake due
   471  	// to a previously orphaned runtime timer.
   472  	if s.resendTimer.isUninitialized() || !s.resendTimer.checkExpiration() {
   473  		return nil
   474  	}
   475  
   476  	// Initialize the variables used to detect spurious recovery after
   477  	// entering RTO.
   478  	//
   479  	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
   480  	s.spuriousRecovery = false
   481  	s.retransmitTS = 0
   482  
   483  	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
   484  	// when writeList is empty. Remove this once we have a proper fix for this
   485  	// issue.
   486  	if s.writeList.Front() == nil {
   487  		return nil
   488  	}
   489  
   490  	s.ep.stack.Stats().TCP.Timeouts.Increment()
   491  	s.ep.stats.SendErrors.Timeouts.Increment()
   492  
   493  	// Set TLPRxtOut to false according to
   494  	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
   495  	s.rc.tlpRxtOut = false
   496  
    497  	// Give up if a user timeout is set and we have exceeded the user
    498  	// specified timeout since the first retransmission, or if we have sent
    499  	// too many unacknowledged zero window probes.
   500  	uto := s.ep.userTimeout
   501  
   502  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
    503  		// We store the original xmitTime of the segment that we are
    504  		// about to retransmit as the retransmission time. This is
    505  		// required because, by the time the retransmit timer expires,
    506  		// the segment has already been sent and has remained unacked
    507  		// for a full RTO since it was sent.
   508  		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
   509  	}
   510  
   511  	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
   512  	remaining := s.maxRTO
   513  	if uto != 0 {
   514  		// Cap to the user specified timeout if one is specified.
   515  		remaining = uto - elapsed
   516  	}
   517  
   518  	// Always honor the user-timeout irrespective of whether the zero
   519  	// window probes were acknowledged.
   520  	// net/ipv4/tcp_timer.c::tcp_probe_timer()
   521  	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
   522  		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
   523  		return &tcpip.ErrTimeout{}
   524  	}
   525  
   526  	// Set new timeout. The timer will be restarted by the call to sendData
   527  	// below.
   528  	s.RTO *= 2
   529  	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
   530  	if s.RTO > s.maxRTO {
   531  		s.RTO = s.maxRTO
   532  	}
   533  
   534  	// Cap RTO to remaining time.
   535  	if s.RTO > remaining {
   536  		s.RTO = remaining
   537  	}
   538  
   539  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
   540  	//
   541  	// Retransmit timeouts:
   542  	//     After a retransmit timeout, record the highest sequence number
   543  	//     transmitted in the variable recover, and exit the fast recovery
   544  	//     procedure if applicable.
   545  	s.FastRecovery.Last = s.SndNxt - 1
   546  
   547  	if s.FastRecovery.Active {
   548  		// We were attempting fast recovery but were not successful.
    549  		// Leave the state. We don't need to update ssthresh because it
    550  		// has already been updated when we entered fast recovery.
   551  		s.leaveRecovery()
   552  	}
   553  
   554  	// Record retransmitTS if the sender is not in recovery as per:
   555  	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
   556  	s.recordRetransmitTS()
   557  
   558  	s.state = tcpip.RTORecovery
   559  	s.cc.HandleRTOExpired()
   560  
   561  	// Mark the next segment to be sent as the first unacknowledged one and
   562  	// start sending again. Set the number of outstanding packets to 0 so
   563  	// that we'll be able to retransmit.
   564  	//
   565  	// We'll keep on transmitting (or retransmitting) as we get acks for
   566  	// the data we transmit.
   567  	s.Outstanding = 0
   568  
   569  	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
   570  	//
   571  	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
   572  	//  discard data that has already been selectively acknowledged. As a
   573  	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
   574  	//  information gathered from a receiver upon a retransmission timeout
   575  	//  (RTO) "since the timeout might indicate that the data receiver has
   576  	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
   577  	//  information in determining which data to retransmit."
   578  	//
   579  	// NOTE: We take the stricter interpretation and just expunge all
   580  	// information as we lack more rigorous checks to validate if the SACK
   581  	// information is usable after an RTO.
   582  	s.ep.scoreboard.Reset()
   583  	s.updateWriteNext(s.writeList.Front())
   584  
   585  	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
   586  	// zero receive window after retransmission interval and we have data to
   587  	// send.
   588  	if s.zeroWindowProbing {
   589  		s.sendZeroWindowProbe()
   590  		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
   591  		// indefinitely.  As long as the receiving TCP continues to send
   592  		// acknowledgments in response to the probe segments, the sending TCP
   593  		// MUST allow the connection to stay open.
   594  		return nil
   595  	}
   596  
   597  	seg := s.writeNext
   598  	// RFC 1122 4.2.3.5: Close the connection when the number of
   599  	// retransmissions for this segment is beyond a limit.
   600  	if seg != nil && seg.xmitCount > s.maxRetries {
   601  		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
   602  		return &tcpip.ErrTimeout{}
   603  	}
   604  
   605  	s.sendData()
   606  
   607  	return nil
   608  }
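
         // Illustration of the backoff above (hypothetical timeline): with an
         // initial RTO of 1s, successive expirations double the timeout to
         // 2s, 4s, 8s, ... until it hits maxRTO (120s by default) or, when a
         // user timeout is set, the time remaining until that deadline,
         // whichever is smaller.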
   609  
   610  // pCount returns the number of packets in the segment. Due to GSO, a segment
   611  // can be composed of multiple packets.
   612  func (s *sender) pCount(seg *segment, maxPayloadSize int) int {
   613  	size := seg.payloadSize()
   614  	if size == 0 {
   615  		return 1
   616  	}
   617  
   618  	return (size-1)/maxPayloadSize + 1
   619  }
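
         // For example (hypothetical sizes): with maxPayloadSize = 1460, a
         // 4000-byte GSO segment counts as (4000-1)/1460 + 1 = 3 packets, a
         // 1460-byte segment counts as 1, and a zero-length (FIN-only) segment
         // also counts as 1.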
   620  
   621  // splitSeg splits a given segment at the size specified and inserts the
   622  // remainder as a new segment after the current one in the write list.
   623  func (s *sender) splitSeg(seg *segment, size int) {
   624  	if seg.payloadSize() <= size {
   625  		return
   626  	}
   627  	// Split this segment up.
   628  	nSeg := seg.clone()
   629  	nSeg.pkt.Data().TrimFront(size)
   630  	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
   631  	s.writeList.InsertAfter(seg, nSeg)
   632  
   633  	// The segment being split does not carry PUSH flag because it is
   634  	// followed by the newly split segment.
   635  	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
   636  	// segment (i.e., when there is no more queued data to be sent).
   637  	// Linux removes PSH flag only when the segment is being split over MSS
   638  	// and retains it when we are splitting the segment over lack of sender
   639  	// window space.
   640  	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
   641  	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
   642  	if seg.payloadSize() > s.MaxPayloadSize {
   643  		seg.flags ^= header.TCPFlagPsh
   644  	}
   645  	seg.pkt.Data().CapLength(size)
   646  }
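
         // For example (hypothetical segment): splitting a 3000-byte segment
         // with sequence number 1000 at size 1460 leaves the original segment
         // covering [1000, 2460) and inserts a new 1540-byte segment starting
         // at 2460 immediately after it in the write list.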
   647  
   648  // NextSeg implements the RFC6675 NextSeg() operation.
   649  //
   650  // NextSeg starts scanning the writeList starting from nextSegHint and returns
   651  // the hint to be passed on the next call to NextSeg. This is required to avoid
   652  // iterating the write list repeatedly when NextSeg is invoked in a loop during
   653  // recovery. The returned hint will be nil if there are no more segments that
    654  // can match the rules defined by the NextSeg operation in RFC 6675.
   655  //
   656  // rescueRtx will be true only if nextSeg is a rescue retransmission as
   657  // described by Step 4) of the NextSeg algorithm.
   658  func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
   659  	var s3 *segment
   660  	var s4 *segment
   661  	// Step 1.
   662  	for seg := nextSegHint; seg != nil; seg = seg.Next() {
   663  		// Stop iteration if we hit a segment that has never been
   664  		// transmitted (i.e. either it has no assigned sequence number
   665  		// or if it does have one, it's >= the next sequence number
   666  		// to be sent [i.e. >= s.sndNxt]).
   667  		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
   668  			hint = nil
   669  			break
   670  		}
   671  		segSeq := seg.sequenceNumber
   672  		if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) {
   673  			s.splitSeg(seg, int(smss))
   674  		}
   675  
   676  		// See RFC 6675 Section 4
   677  		//
   678  		//     1. If there exists a smallest unSACKED sequence number
    679  		//     'S2' that meets the following 3 criteria for determining
   680  		//     loss, the sequence range of one segment of up to SMSS
   681  		//     octets starting with S2 MUST be returned.
   682  		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
   683  			// NextSeg():
   684  			//
   685  			//    (1.a) S2 is greater than HighRxt
   686  			//    (1.b) S2 is less than highest octet covered by
   687  			//    any received SACK.
   688  			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
   689  				// NextSeg():
   690  				//     (1.c) IsLost(S2) returns true.
   691  				if s.ep.scoreboard.IsLost(segSeq) {
   692  					return seg, seg.Next(), false
   693  				}
   694  
   695  				// NextSeg():
   696  				//
   697  				// (3): If the conditions for rules (1) and (2)
   698  				// fail, but there exists an unSACKed sequence
   699  				// number S3 that meets the criteria for
   700  				// detecting loss given in steps 1.a and 1.b
   701  				// above (specifically excluding (1.c)) then one
    702  				// segment of up to SMSS octets starting with S3
   703  				// SHOULD be returned.
   704  				if s3 == nil {
   705  					s3 = seg
   706  					hint = seg.Next()
   707  				}
   708  			}
   709  			// NextSeg():
   710  			//
   711  			//     (4) If the conditions for (1), (2) and (3) fail,
   712  			//     but there exists outstanding unSACKED data, we
   713  			//     provide the opportunity for a single "rescue"
   714  			//     retransmission per entry into loss recovery. If
   715  			//     HighACK is greater than RescueRxt (or RescueRxt
    716  			//     is undefined), then one segment of up to SMSS
   717  			//     octets that MUST include the highest outstanding
   718  			//     unSACKed sequence number SHOULD be returned, and
   719  			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
   720  			//     be updated.
   721  			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
   722  				if s4 != nil {
   723  					if s4.sequenceNumber.LessThan(segSeq) {
   724  						s4 = seg
   725  					}
   726  				} else {
   727  					s4 = seg
   728  				}
   729  			}
   730  		}
   731  	}
   732  
   733  	// If we got here then no segment matched step (1).
   734  	// Step (2): "If no sequence number 'S2' per rule (1)
   735  	// exists but there exists available unsent data and the
   736  	// receiver's advertised window allows, the sequence
   737  	// range of one segment of up to SMSS octets of
   738  	// previously unsent data starting with sequence number
   739  	// HighData+1 MUST be returned."
   740  	for seg := s.writeNext; seg != nil; seg = seg.Next() {
   741  		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
   742  			continue
   743  		}
   744  		// We do not split the segment here to <= smss as it has
   745  		// potentially not been assigned a sequence number yet.
   746  		return seg, nil, false
   747  	}
   748  
   749  	if s3 != nil {
   750  		return s3, hint, false
   751  	}
   752  
   753  	return s4, nil, true
   754  }
   755  
   756  // maybeSendSegment tries to send the specified segment and either coalesces
   757  // other segments into this one or splits the specified segment based on the
    758  // lower of the specified limit value or the receiver's window size specified
    759  // by end.
   760  // +checklocks:s.ep.mu
   761  func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
   762  	// We abuse the flags field to determine if we have already
   763  	// assigned a sequence number to this segment.
   764  	if !s.isAssignedSequenceNumber(seg) {
   765  		// Merge segments if allowed.
   766  		if seg.payloadSize() != 0 {
   767  			available := int(s.SndNxt.Size(end))
   768  			if available > limit {
   769  				available = limit
   770  			}
   771  
   772  			// nextTooBig indicates that the next segment was too
   773  			// large to entirely fit in the current segment. It
   774  			// would be possible to split the next segment and merge
   775  			// the portion that fits, but unexpectedly splitting
   776  			// segments can have user visible side-effects which can
   777  			// break applications. For example, RFC 7766 section 8
   778  			// says that the length and data of a DNS response
   779  			// should be sent in the same TCP segment to avoid
   780  			// triggering bugs in poorly written DNS
   781  			// implementations.
   782  			var nextTooBig bool
   783  			for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() {
   784  				if seg.payloadSize()+nSeg.payloadSize() > available {
   785  					nextTooBig = true
   786  					break
   787  				}
   788  				seg.merge(nSeg)
   789  				s.writeList.Remove(nSeg)
   790  				nSeg.DecRef()
   791  			}
   792  			if !nextTooBig && seg.payloadSize() < available {
   793  				// Segment is not full.
   794  				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
   795  					// Nagle's algorithm. From Wikipedia:
   796  					//   Nagle's algorithm works by
   797  					//   combining a number of small
   798  					//   outgoing messages and sending them
   799  					//   all at once. Specifically, as long
   800  					//   as there is a sent packet for which
   801  					//   the sender has received no
   802  					//   acknowledgment, the sender should
   803  					//   keep buffering its output until it
   804  					//   has a full packet's worth of
   805  					//   output, thus allowing output to be
   806  					//   sent all at once.
   807  					return false
   808  				}
    809  				// With TCP_CORK, hold back sub-MSS segments until a full
    810  				// MSS of data accumulates or the cork timer fires.
   811  				if s.ep.ops.GetCorkOption() {
   812  					if seg.payloadSize() < s.MaxPayloadSize {
   813  						if !s.startCork {
   814  							s.startCork = true
   815  							// Enable the timer for
   816  							// 200ms, after which
   817  							// the segments are drained.
   818  							s.corkTimer.enable(MinRTO)
   819  						}
   820  						return false
   821  					}
   822  					// Disable the TCP_CORK timer.
   823  					s.startCork = false
   824  					s.corkTimer.disable()
   825  				}
   826  			}
   827  		}
   828  
   829  		// Assign flags. We don't do it above so that we can merge
   830  		// additional data if Nagle holds the segment.
   831  		seg.sequenceNumber = s.SndNxt
   832  		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
   833  	}
   834  
   835  	var segEnd seqnum.Value
   836  	if seg.payloadSize() == 0 {
   837  		if s.writeList.Back() != seg {
   838  			panic("FIN segments must be the final segment in the write list.")
   839  		}
   840  		seg.flags = header.TCPFlagAck | header.TCPFlagFin
   841  		segEnd = seg.sequenceNumber.Add(1)
   842  		// Update the state to reflect that we have now
   843  		// queued a FIN.
   844  		s.ep.updateConnDirectionState(connDirectionStateSndClosed)
   845  		switch s.ep.EndpointState() {
   846  		case StateCloseWait:
   847  			s.ep.setEndpointState(StateLastAck)
   848  		default:
   849  			s.ep.setEndpointState(StateFinWait1)
   850  		}
   851  	} else {
   852  		// We're sending a non-FIN segment.
   853  		if seg.flags&header.TCPFlagFin != 0 {
   854  			panic("Netstack queues FIN segments without data.")
   855  		}
   856  
   857  		if !seg.sequenceNumber.LessThan(end) {
   858  			return false
   859  		}
   860  
   861  		available := int(seg.sequenceNumber.Size(end))
   862  		if available == 0 {
   863  			return false
   864  		}
   865  
   866  		// If the whole segment or at least 1MSS sized segment cannot
   867  		// be accommodated in the receiver advertised window, skip
   868  		// splitting and sending of the segment. ref:
   869  		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
   870  		//
   871  		// Linux checks this for all segment transmits not triggered by
   872  		// a probe timer. On this condition, it defers the segment split
   873  		// and transmit to a short probe timer.
   874  		//
   875  		// ref: include/net/tcp.h::tcp_check_probe_timer()
   876  		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
   877  		//
   878  		// Instead of defining a new transmit timer, we attempt to split
   879  		// the segment right here if there are no pending segments. If
   880  		// there are pending segments, segment transmits are deferred to
   881  		// the retransmit timer handler.
   882  		if s.SndUna != s.SndNxt {
   883  			switch {
   884  			case available >= seg.payloadSize():
    885  				// OK to send, the whole segment fits in the
   886  				// receiver's advertised window.
   887  			case available >= s.MaxPayloadSize:
   888  				// OK to send, at least 1 MSS sized segment fits
   889  				// in the receiver's advertised window.
   890  			default:
   891  				return false
   892  			}
   893  		}
   894  
   895  		// The segment size limit is computed as a function of sender
   896  		// congestion window and MSS. When sender congestion window is >
   897  		// 1, this limit can be larger than MSS. Ensure that the
   898  		// currently available send space is not greater than minimum of
   899  		// this limit and MSS.
   900  		if available > limit {
   901  			available = limit
   902  		}
   903  
   904  		// If GSO is not in use then cap available to
   905  		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
   906  		// the host GSO logic will cap the segment to the correct size.
   907  		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
   908  			available = s.MaxPayloadSize
   909  		}
   910  
   911  		if seg.payloadSize() > available {
    912  			// Split the segment so that it fits within the available
    913  			// window and the size limits computed above.
   914  			s.splitSeg(seg, available)
   915  		}
   916  
   917  		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
   918  	}
   919  
   920  	s.sendSegment(seg)
   921  
   922  	// Update sndNxt if we actually sent new data (as opposed to
   923  	// retransmitting some previously sent data).
   924  	if s.SndNxt.LessThan(segEnd) {
   925  		s.SndNxt = segEnd
   926  	}
   927  
   928  	return true
   929  }
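
         // A sketch of the hold-back behaviour above (hypothetical values): with
         // unacked data outstanding and delayed writes enabled (Nagle), a
         // 100-byte segment that does not fill the available window is held
         // back until the in-flight data is acked. With TCP_CORK set instead,
         // a sub-MSS segment is held until a full MSS accumulates or the cork
         // timer (armed for MinRTO, 200ms) drains it.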
   930  
   931  // zeroProbeJunk is data sent during zero window probes. Its value is
   932  // irrelevant; since the sequence number has already been acknowledged it will
   933  // be discarded. It's only here to avoid allocating.
   934  var zeroProbeJunk = []byte{0}
   935  
   936  // +checklocks:s.ep.mu
   937  func (s *sender) sendZeroWindowProbe() {
   938  	s.unackZeroWindowProbes++
   939  
   940  	// Send a zero window probe with sequence number pointing to the last
   941  	// acknowledged byte. Note that, like Linux, this isn't quite what RFC
   942  	// 9293 3.8.6.1 describes: we don't send the next byte in the stream,
   943  	// we re-send an ACKed byte to goad the receiver into responding.
   944  	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   945  		Payload: buffer.MakeWithData(zeroProbeJunk),
   946  	})
   947  	defer pkt.DecRef()
   948  	s.sendSegmentFromPacketBuffer(pkt, header.TCPFlagAck, s.SndUna-1)
   949  
   950  	// Rearm the timer to continue probing.
   951  	s.resendTimer.enable(s.RTO)
   952  }
   953  
   954  func (s *sender) enableZeroWindowProbing() {
   955  	s.zeroWindowProbing = true
   956  	// We piggyback the probing on the retransmit timer with the
	// current retransmission interval, as we may start probing while
	// segments are being retransmitted.
   959  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
   960  		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
   961  	}
   962  	s.resendTimer.enable(s.RTO)
   963  }
   964  
   965  func (s *sender) disableZeroWindowProbing() {
   966  	s.zeroWindowProbing = false
   967  	s.unackZeroWindowProbes = 0
   968  	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
   969  	s.resendTimer.disable()
   970  }
   971  
   972  func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
   973  	if dataSent {
   974  		// We sent data, so we should stop the keepalive timer to ensure
   975  		// that no keepalives are sent while there is pending data.
   976  		s.ep.disableKeepaliveTimer()
   977  	}
   978  
    979  	// If the peer has advertised a zero receive window and we have
    980  	// data to be sent out, start zero window probing to query the
    981  	// remote for its receive window size.
   982  	if s.writeNext != nil && s.SndWnd == 0 {
   983  		s.enableZeroWindowProbing()
   984  	}
   985  
   986  	// If we have no more pending data, start the keepalive timer.
   987  	if s.SndUna == s.SndNxt {
   988  		s.ep.resetKeepaliveTimer(false)
   989  	} else {
   990  		// Enable timers if we have pending data.
   991  		if shouldScheduleProbe && s.shouldSchedulePTO() {
   992  			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
   993  			s.schedulePTO()
   994  		} else if !s.resendTimer.enabled() {
   995  			s.probeTimer.disable()
   996  			if s.Outstanding > 0 {
   997  				// Enable the resend timer if it's not enabled yet and there is
   998  				// outstanding data.
   999  				s.resendTimer.enable(s.RTO)
  1000  			}
  1001  		}
  1002  	}
  1003  }
  1004  
  1005  // sendData sends new data segments. It is called when data becomes available or
  1006  // when the send window opens up.
  1007  // +checklocks:s.ep.mu
  1008  func (s *sender) sendData() {
  1009  	limit := s.MaxPayloadSize
  1010  	if s.gso {
  1011  		limit = int(s.ep.gso.MaxSize - header.TCPTotalHeaderMaximumSize - 1)
  1012  	}
  1013  	end := s.SndUna.Add(s.SndWnd)
  1014  
  1015  	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
  1016  	// "A TCP SHOULD set cwnd to no more than RW before beginning
  1017  	// transmission if the TCP has not sent data in the interval exceeding
   1018  	// the retransmission timeout."
  1019  	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
  1020  		if s.SndCwnd > InitialCwnd {
  1021  			s.SndCwnd = InitialCwnd
  1022  		}
  1023  	}
  1024  
  1025  	var dataSent bool
  1026  	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
  1027  		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
  1028  		if cwndLimit < limit {
  1029  			limit = cwndLimit
  1030  		}
  1031  		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
  1032  			// Move writeNext along so that we don't try and scan data that
  1033  			// has already been SACKED.
  1034  			s.updateWriteNext(seg.Next())
  1035  			continue
  1036  		}
  1037  		if sent := s.maybeSendSegment(seg, limit, end); !sent {
  1038  			break
  1039  		}
  1040  		dataSent = true
  1041  		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
  1042  		s.updateWriteNext(seg.Next())
  1043  	}
  1044  
  1045  	s.postXmit(dataSent, true /* shouldScheduleProbe */)
  1046  }
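
         // Restart-window example for the cwnd reset above (hypothetical
         // values): if the connection has been idle for longer than the RTO
         // and cwnd had grown to 40 packets, cwnd is reset to InitialCwnd (10)
         // before transmission resumes, so the first burst after the idle
         // period is limited to at most 10 segments.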
  1047  
  1048  func (s *sender) enterRecovery() {
  1049  	// Initialize the variables used to detect spurious recovery after
  1050  	// entering recovery.
  1051  	//
  1052  	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
  1053  	s.spuriousRecovery = false
  1054  	s.retransmitTS = 0
  1055  
  1056  	s.FastRecovery.Active = true
  1057  	// Save state to reflect we're now in fast recovery.
  1058  	//
  1059  	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
  1060  	// We inflate the cwnd by 3 to account for the 3 packets which triggered
  1061  	// the 3 duplicate ACKs and are now not in flight.
  1062  	s.SndCwnd = s.Ssthresh + 3
  1063  	s.SackedOut = 0
  1064  	s.DupAckCount = 0
  1065  	s.FastRecovery.First = s.SndUna
  1066  	s.FastRecovery.Last = s.SndNxt - 1
  1067  	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
  1068  	s.FastRecovery.HighRxt = s.SndUna
  1069  	s.FastRecovery.RescueRxt = s.SndUna
  1070  
  1071  	// Record retransmitTS if the sender is not in recovery as per:
  1072  	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1073  	s.recordRetransmitTS()
  1074  
  1075  	if s.ep.SACKPermitted {
  1076  		s.state = tcpip.SACKRecovery
  1077  		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
  1078  		// Set TLPRxtOut to false according to
  1079  		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
  1080  		if s.rc.tlpRxtOut {
  1081  			// The tail loss probe triggered recovery.
  1082  			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
  1083  		}
  1084  		s.rc.tlpRxtOut = false
  1085  		return
  1086  	}
  1087  	s.state = tcpip.FastRecovery
  1088  	s.ep.stack.Stats().TCP.FastRecovery.Increment()
  1089  }
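
         // Example of the window arithmetic above (hypothetical values,
         // assuming the Reno-style halving done by HandleLossDetected): with
         // 20 packets outstanding when loss is detected, Ssthresh becomes 10
         // and cwnd is set to Ssthresh + 3 = 13 packets on entering recovery.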
  1090  
  1091  func (s *sender) leaveRecovery() {
  1092  	s.FastRecovery.Active = false
  1093  	s.FastRecovery.MaxCwnd = 0
  1094  	s.DupAckCount = 0
  1095  
  1096  	// Deflate cwnd. It had been artificially inflated when new dups arrived.
  1097  	s.SndCwnd = s.Ssthresh
  1098  	s.cc.PostRecovery()
  1099  }
  1100  
  1101  // isAssignedSequenceNumber relies on the fact that we only set flags once a
   1102  // sequence number is assigned, and that is only done right before we send the
   1103  // segment. As a result, any segment that has a non-zero flag has a valid
  1104  // sequence number assigned to it.
  1105  func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
  1106  	return seg.flags != 0
  1107  }
  1108  
  1109  // SetPipe implements the SetPipe() function described in RFC6675. Netstack
  1110  // maintains the congestion window in number of packets and not bytes, so
  1111  // SetPipe() here measures number of outstanding packets rather than actual
  1112  // outstanding bytes in the network.
  1113  func (s *sender) SetPipe() {
  1114  	// If SACK isn't permitted or it is permitted but recovery is not active
  1115  	// then ignore pipe calculations.
  1116  	if !s.ep.SACKPermitted || !s.FastRecovery.Active {
  1117  		return
  1118  	}
  1119  	pipe := 0
  1120  	smss := seqnum.Size(s.ep.scoreboard.SMSS())
  1121  	for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
  1122  		// With GSO each segment can be much larger than SMSS. So check the segment
  1123  		// in SMSS sized ranges.
  1124  		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize()))
  1125  		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
  1126  			endSeq := startSeq.Add(smss)
  1127  			if segEnd.LessThan(endSeq) {
  1128  				endSeq = segEnd
  1129  			}
  1130  			sb := header.SACKBlock{Start: startSeq, End: endSeq}
  1131  			// SetPipe():
  1132  			//
  1133  			// After initializing pipe to zero, the following steps are
  1134  			// taken for each octet 'S1' in the sequence space between
  1135  			// HighACK and HighData that has not been SACKed:
  1136  			if !s1.sequenceNumber.LessThan(s.SndNxt) {
  1137  				break
  1138  			}
  1139  			if s.ep.scoreboard.IsSACKED(sb) {
  1140  				continue
  1141  			}
  1142  
  1143  			// SetPipe():
  1144  			//
   1145  			//    (a) If IsLost(S1) returns false, Pipe is incremented by 1.
  1146  			//
  1147  			// NOTE: here we mark the whole segment as lost. We do not try
  1148  			// and test every byte in our write buffer as we maintain our
  1149  			// pipe in terms of outstanding packets and not bytes.
  1150  			if !s.ep.scoreboard.IsRangeLost(sb) {
  1151  				pipe++
  1152  			}
  1153  			// SetPipe():
  1154  			//    (b) If S1 <= HighRxt, Pipe is incremented by 1.
  1155  			if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) {
  1156  				pipe++
  1157  			}
  1158  		}
  1159  	}
  1160  	s.Outstanding = pipe
  1161  }
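
         // Pipe example (hypothetical scoreboard): with 10 SMSS-sized ranges
         // between HighACK and HighData, of which 3 are SACKed and 2 are
         // marked lost (and not yet retransmitted, i.e. above HighRxt), the
         // remaining 5 ranges each add one to pipe, so Outstanding becomes 5.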
  1162  
  1163  // shouldEnterRecovery returns true if the sender should enter fast recovery
  1164  // based on dupAck count and sack scoreboard.
  1165  // See RFC 6675 section 5.
  1166  func (s *sender) shouldEnterRecovery() bool {
  1167  	return s.DupAckCount >= nDupAckThreshold ||
  1168  		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
  1169  }
  1170  
  1171  // detectLoss is called when an ack is received and returns whether a loss is
  1172  // detected. It manages the state related to duplicate acks and determines if
  1173  // a retransmit is needed according to the rules in RFC 6582 (NewReno).
  1174  func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
  1175  	// We're not in fast recovery yet.
  1176  
  1177  	// If RACK is enabled and there is no reordering we should honor the
  1178  	// three duplicate ACK rule to enter recovery.
  1179  	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
  1180  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1181  		if s.rc.Reord {
  1182  			return false
  1183  		}
  1184  	}
  1185  
  1186  	if !s.isDupAck(seg) {
  1187  		s.DupAckCount = 0
  1188  		return false
  1189  	}
  1190  
  1191  	s.DupAckCount++
  1192  
  1193  	// Do not enter fast recovery until we reach nDupAckThreshold or the
  1194  	// first unacknowledged byte is considered lost as per SACK scoreboard.
  1195  	if !s.shouldEnterRecovery() {
  1196  		// RFC 6675 Step 3.
  1197  		s.FastRecovery.HighRxt = s.SndUna - 1
   1198  		// Run SetPipe() to recalculate the number of outstanding segments.
  1199  		s.SetPipe()
  1200  		s.state = tcpip.Disorder
  1201  		return false
  1202  	}
  1203  
  1204  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
  1205  	//
  1206  	// We only do the check here, the incrementing of last to the highest
  1207  	// sequence number transmitted till now is done when enterRecovery
  1208  	// is invoked.
  1209  	//
  1210  	// Note that we only enter recovery when at least one more byte of data
   1211  	// beyond s.FastRecovery.Last (the highest byte that was outstanding when
   1212  	// fast retransmit was last entered) is acked.
  1213  	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
  1214  		s.DupAckCount = 0
  1215  		return false
  1216  	}
  1217  	s.cc.HandleLossDetected()
  1218  	s.enterRecovery()
  1219  	return true
  1220  }
  1221  
  1222  // isDupAck determines if seg is a duplicate ack as defined in
  1223  // https://tools.ietf.org/html/rfc5681#section-2.
  1224  func (s *sender) isDupAck(seg *segment) bool {
  1225  	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
  1226  	// can leverage the SACK information to determine when an incoming ACK is a
  1227  	// "duplicate" (e.g., if the ACK contains previously unknown SACK
  1228  	// information).
  1229  	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
  1230  		return false
  1231  	}
  1232  
  1233  	// (a) The receiver of the ACK has outstanding data.
  1234  	return s.SndUna != s.SndNxt &&
  1235  		// (b) The incoming acknowledgment carries no data.
  1236  		seg.logicalLen() == 0 &&
  1237  		// (c) The SYN and FIN bits are both off.
  1238  		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
  1239  		// (d) the ACK number is equal to the greatest acknowledgment received on
  1240  		// the given connection (TCP.UNA from RFC793).
  1241  		seg.ackNumber == s.SndUna &&
  1242  		// (e) the advertised window in the incoming acknowledgment equals the
  1243  		// advertised window in the last incoming acknowledgment.
  1244  		s.SndWnd == seg.window
  1245  }
  1246  
   1247  // walkSACK iterates the writeList and updates RACK for each segment that is
   1248  // newly acked either cumulatively or selectively. It loops through the sacked
   1249  // segments, updates the RACK related variables and checks for reordering.
   1250  // Returns true when a DSACK block has been detected in the received ACK.
  1251  //
  1252  // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1253  // steps 2 and 3.
  1254  func (s *sender) walkSACK(rcvdSeg *segment) bool {
  1255  	s.rc.setDSACKSeen(false)
  1256  
  1257  	// Look for DSACK block.
  1258  	hasDSACK := false
  1259  	idx := 0
  1260  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1261  	if checkDSACK(rcvdSeg) {
  1262  		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
  1263  		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
  1264  		// numDSACK can be zero when DSACK is sent for subsegments.
  1265  		if numDSACK < 1 {
  1266  			numDSACK = 1
  1267  		}
  1268  		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
  1269  		s.rc.setDSACKSeen(true)
  1270  		idx = 1
  1271  		n--
  1272  		hasDSACK = true
  1273  	}
  1274  
  1275  	if n == 0 {
  1276  		return hasDSACK
  1277  	}
  1278  
  1279  	// Sort the SACK blocks. The first block is the most recent unacked
  1280  	// block. The following blocks can be in arbitrary order.
  1281  	sackBlocks := make([]header.SACKBlock, n)
  1282  	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
  1283  	sort.Slice(sackBlocks, func(i, j int) bool {
  1284  		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
  1285  	})
  1286  
  1287  	seg := s.writeList.Front()
  1288  	for _, sb := range sackBlocks {
  1289  		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
  1290  			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
  1291  				s.rc.update(seg, rcvdSeg)
  1292  				s.rc.detectReorder(seg)
  1293  				seg.acked = true
  1294  				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
  1295  			}
  1296  			seg = seg.Next()
  1297  		}
  1298  	}
  1299  	return hasDSACK
  1300  }
  1301  
  1302  // checkDSACK checks if a DSACK is reported.
  1303  func checkDSACK(rcvdSeg *segment) bool {
  1304  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1305  	if n == 0 {
  1306  		return false
  1307  	}
  1308  
  1309  	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
  1310  	// Check if SACK block is invalid.
  1311  	if sb.End.LessThan(sb.Start) {
  1312  		return false
  1313  	}
  1314  
  1315  	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
  1316  	// at most one SACK block. DSACK is detected in the below two cases:
  1317  	//	* If the SACK sequence space is less than this cumulative ACK, it is
  1318  	//		an indication that the segment identified by the SACK block has
  1319  	//		been received more than once by the receiver.
  1320  	//	* If the sequence space in the first SACK block is greater than the
  1321  	//		cumulative ACK, then the sender next compares the sequence space
  1322  	//		in the first SACK block with the sequence space in the second SACK
  1323  	//		block, if there is one. This comparison can determine if the first
  1324  	//		SACK block is reporting duplicate data that lies above the
  1325  	//		cumulative ACK.
  1326  	if sb.Start.LessThan(rcvdSeg.ackNumber) {
  1327  		return true
  1328  	}
  1329  
  1330  	if n > 1 {
  1331  		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
  1332  		if sb1.End.LessThan(sb1.Start) {
  1333  			return false
  1334  		}
  1335  
  1336  		// If the first SACK block is fully covered by second SACK
  1337  		// block, then the first block is a DSACK block.
  1338  		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
  1339  			return true
  1340  		}
  1341  	}
  1342  
  1343  	return false
  1344  }
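
         // DSACK examples (hypothetical sequence numbers): if the cumulative
         // ACK is 1500 and the first SACK block is [500, 1000), the block lies
         // below the cumulative ACK and therefore reports data received more
         // than once. Similarly, a first block [2000, 2500) that is fully
         // covered by a second block [1800, 3000) is reported as a DSACK even
         // though it lies above the cumulative ACK.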
  1345  
  1346  func (s *sender) recordRetransmitTS() {
  1347  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
  1348  	//
  1349  	// The Eifel detection algorithm is used, only upon initiation of loss
  1350  	// recovery, i.e., when either the timeout-based retransmit or the fast
  1351  	// retransmit is sent. The Eifel detection algorithm MUST NOT be
  1352  	// reinitiated after loss recovery has already started. In particular,
  1353  	// it must not be reinitiated upon subsequent timeouts for the same
  1354  	// segment, and not upon retransmitting segments other than the oldest
  1355  	// outstanding segment, e.g., during selective loss recovery.
  1356  	if s.inRecovery() {
  1357  		return
  1358  	}
  1359  
  1360  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1361  	//
  1362  	// Set a "RetransmitTS" variable to the value of the Timestamp Value
  1363  	// field of the Timestamps option included in the retransmit sent when
  1364  	// loss recovery is initiated. A TCP sender must ensure that
  1365  	// RetransmitTS does not get overwritten as loss recovery progresses,
  1366  	// e.g., in case of a second timeout and subsequent second retransmit of
  1367  	// the same octet.
  1368  	s.retransmitTS = s.ep.tsValNow()
  1369  }
  1370  
  1371  func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
  1372  	// Return if the sender has already detected spurious recovery.
  1373  	if s.spuriousRecovery {
  1374  		return
  1375  	}
  1376  
  1377  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
  1378  	//
  1379  	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
  1380  	// Timestamps option is smaller than the value of RetransmitTS, then
  1381  	// proceed to next step, else return.
  1382  	if tsEchoReply >= s.retransmitTS {
  1383  		return
  1384  	}
  1385  
  1386  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1387  	//
  1388  	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
  1389  	if hasDSACK {
  1390  		return
  1391  	}
  1392  
  1393  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1394  	//
  1395  	// If during the lifetime of the TCP connection the TCP sender has
  1396  	// previously received an ACK with a DSACK option, or the acceptable ACK
  1397  	// does not acknowledge all outstanding data, then proceed to next step,
  1398  	// else return.
  1399  	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
  1400  	if numDSACK == 0 && s.SndUna == s.SndNxt {
  1401  		return
  1402  	}
  1403  
  1404  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
  1405  	//
  1406  	// If the loss recovery has been initiated with a timeout-based
  1407  	// retransmit, then set
  1408  	//    SpuriousRecovery <- SPUR_TO (equal 1),
  1409  	// else set
  1410  	//    SpuriousRecovery <- dupacks+1
  1411  	// Set the spurious recovery variable to true as we do not differentiate
  1412  	// between fast, SACK or RTO recovery.
  1413  	s.spuriousRecovery = true
  1414  	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()
  1415  
  1416  	// RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and
  1417  	// timeout). Increment the metric for RTO only as we want to track the
  1418  	// number of timeout recoveries.
  1419  	if s.state == tcpip.RTORecovery {
  1420  		s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment()
  1421  	}
  1422  }
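        // For illustration, a hypothetical Eifel timeline for the checks above: a
        // segment is sent carrying TSval=100, the sender times out and retransmits
        // it with TSval=150, so recordRetransmitTS stores retransmitTS=150. If the
        // next acceptable ACK echoes TSecr=100, it must have been generated for the
        // original transmission and (absent a DSACK) the recovery is marked
        // spurious; if it echoes TSecr=150 or later, the Step 4 check returns early.
        //
        //	// Sketch of the core comparison with the hypothetical values above.
        //	retransmitTS := uint32(150)
        //	tsEchoReply := uint32(100)
        //	spurious := tsEchoReply < retransmitTS // true
        //	_ = spurious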
  1423  
  1424  // inRecovery returns true if the sender is in RTORecovery, FastRecovery or SACKRecovery state.
  1425  func (s *sender) inRecovery() bool {
  1426  	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
  1427  		return true
  1428  	}
  1429  	return false
  1430  }
  1431  
  1432  // handleRcvdSegment is called when a segment is received; it is responsible for
  1433  // updating the send-related state.
  1434  // +checklocks:s.ep.mu
  1435  // +checklocksalias:s.rc.snd.ep.mu=s.ep.mu
  1436  func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
  1437  	bestRTT := unknownRTT
  1438  
  1439  	// Check if we can extract an RTT measurement from this ack.
  1440  	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
  1441  		bestRTT = s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime)
  1442  		s.updateRTO(bestRTT)
  1443  		s.RTTMeasureSeqNum = s.SndNxt
  1444  	}
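        	// For illustration, with hypothetical values: if RTTMeasureSeqNum is 5000
        	// and an ACK for 6000 arrives 40ms after RTTMeasureTime, a 40ms sample is
        	// fed to updateRTO and RTTMeasureSeqNum is advanced to SndNxt, so at most
        	// one such sample is in flight at a time when timestamps are not in use.
        	//
        	//	// Sketch (hypothetical times):
        	//	//   RTTMeasureTime = T, ACK covering 6000 processed at T+40ms
        	//	//   sample = (T+40ms) - T = 40ms  ->  updateRTO(40ms)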
  1445  
  1446  	// Update Timestamp if required. See RFC7323, section-4.3.
  1447  	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
  1448  		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
  1449  	}
  1450  
  1451  	// Insert SACKBlock information into our scoreboard.
  1452  	hasDSACK := false
  1453  	if s.ep.SACKPermitted {
  1454  		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
  1455  			// Only insert the SACK block if the following holds
  1456  			// true:
  1457  			//  * SACK block acks data after the ack number in the
  1458  			//    current segment.
  1459  			//  * SACK block represents a sequence
  1460  			//    between sndUna and sndNxt (i.e. data that is
  1461  			//    currently unacked and in-flight).
  1462  			//  * SACK block has not been SACKed already.
  1463  			//
  1464  			// NOTE: This check specifically excludes DSACK blocks
  1465  			// which have start/end before sndUna and are used to
  1466  			// indicate spurious retransmissions.
  1467  			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
  1468  				s.ep.scoreboard.Insert(sb)
  1469  				rcvdSeg.hasNewSACKInfo = true
  1470  			}
  1471  		}
  1472  
  1473  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
  1474  		// section-7.2
  1475  		//	* Step 2: Update RACK stats.
  1476  		//		If the ACK is not ignored as invalid, update the RACK.rtt
  1477  		//		to be the RTT sample calculated using this ACK, and
  1478  		//		continue.  If this ACK or SACK was for the most recently
  1479  		//		sent packet, then record the RACK.xmit_ts timestamp and
  1480  		//		RACK.end_seq sequence implied by this ACK.
  1481  		//	* Step 3: Detect packet reordering.
  1482  		//		If the ACK selectively or cumulatively acknowledges an
  1483  		//		unacknowledged and also never retransmitted sequence below
  1484  		//		RACK.fack, then the corresponding packet has been
  1485  		//		reordered and RACK.reord is set to TRUE.
  1486  		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1487  			hasDSACK = s.walkSACK(rcvdSeg)
  1488  		}
  1489  		s.SetPipe()
  1490  	}
  1491  
  1492  	ack := rcvdSeg.ackNumber
  1493  	fastRetransmit := false
  1494  	// Do not leave fast recovery if the ACK is out of range.
  1495  	if s.FastRecovery.Active {
  1496  		// Leave fast recovery if it acknowledges all the data covered by
  1497  		// this fast recovery session.
  1498  		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
  1499  			s.leaveRecovery()
  1500  		}
  1501  	} else {
  1502  		// Detect loss by counting the duplicates and enter recovery.
  1503  		fastRetransmit = s.detectLoss(rcvdSeg)
  1504  	}
  1505  
  1506  	// See if TLP based recovery was successful.
  1507  	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1508  		s.detectTLPRecovery(ack, rcvdSeg)
  1509  	}
  1510  
  1511  	// Stash away the current window size.
  1512  	s.SndWnd = rcvdSeg.window
  1513  
  1514  	// Disable zero window probing if remote advertises a non-zero receive
  1515  	// window. This can be with an ACK to the zero window probe (where the
  1516  	// ack number refers to the already acknowledged byte) or to any previously
  1517  	// unacknowledged segment.
  1518  	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
  1519  		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
  1520  		s.disableZeroWindowProbing()
  1521  	}
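        	// For illustration, a hypothetical probe exchange: with SndUna == SndNxt
        	// == 1000 and a one byte zero window probe outstanding, an ACK carrying
        	// ack number 1000 and window 65535 satisfies the condition above, so
        	// probing is disabled and normal transmission resumes once the advertised
        	// window is stashed below.
        	//
        	//	// Sketch of the check with the hypothetical values above:
        	//	//   ack(1000) == SndUna(1000) && window(65535) > 0  =>  disable probing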
  1522  
  1523  	// On receiving the ACK for the zero window probe, account for it and
  1524  	// skip trying to send any segment as we are still probing for
  1525  	// receive window to become non-zero.
  1526  	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
  1527  		s.unackZeroWindowProbes--
  1528  		return
  1529  	}
  1530  
  1531  	// Process the ack only if it acknowledges new data; otherwise ignore it.
  1532  	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
  1533  		s.DupAckCount = 0
  1534  
  1535  		// See: https://tools.ietf.org/html/rfc1323#section-3.3.
  1536  		// Specifically we should only update the RTO using TSEcr if the
  1537  		// following condition holds:
  1538  		//
  1539  		//    A TSecr value received in a segment is used to update the
  1540  		//    averaged RTT measurement only if the segment acknowledges
  1541  		//    some new data, i.e., only if it advances the left edge of
  1542  		//    the send window.
  1543  		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
  1544  			tsRTT := s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr)
  1545  			s.updateRTO(tsRTT)
  1546  			// Following Linux, prefer RTT computed from ACKs to TSEcr because,
  1547  			// "broken middle-boxes or peers may corrupt TS-ECR fields"
  1548  			// https://github.com/torvalds/linux/blob/39cd87c4eb2b893354f3b850f916353f2658ae6f/net/ipv4/tcp_input.c#L3141C1-L3144C24
  1549  			if bestRTT == unknownRTT {
  1550  				bestRTT = tsRTT
  1551  			}
  1552  		}
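        		// For illustration, assuming a timestamp clock that ticks once per
        		// millisecond (the granularity is an assumption of this sketch, not
        		// taken from this file): an ACK echoing TSecr=1000 processed when the
        		// local timestamp clock reads 1045 yields a timestamp-based RTT of
        		// roughly 45ms, used only if no ACK-based sample was available above.
        		//
        		//	// Sketch with the hypothetical values above:
        		//	tsRTT := time.Duration(1045-1000) * time.Millisecond // ~45ms
        		//	_ = tsRTT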
  1553  
  1554  		if s.shouldSchedulePTO() {
  1555  			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
  1556  			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
  1557  			s.schedulePTO()
  1558  		} else {
  1559  			// When an ack is received we must rearm the timer.
  1560  			// RFC 6298 5.3
  1561  			s.probeTimer.disable()
  1562  			s.resendTimer.enable(s.RTO)
  1563  		}
  1564  
  1565  		// Remove all acknowledged data from the write list.
  1566  		acked := s.SndUna.Size(ack)
  1567  		s.SndUna = ack
  1568  		ackLeft := acked
  1569  		originalOutstanding := s.Outstanding
  1570  		for ackLeft > 0 {
  1571  			// We use logicalLen here because we can have FIN
  1572  			// segments (which are always at the end of the list) that
  1573  			// have no data, but do consume a sequence number.
  1574  			seg := s.writeList.Front()
  1575  			datalen := seg.logicalLen()
  1576  
  1577  			if datalen > ackLeft {
  1578  				prevCount := s.pCount(seg, s.MaxPayloadSize)
  1579  				seg.TrimFront(ackLeft)
  1580  				seg.sequenceNumber.UpdateForward(ackLeft)
  1581  				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
  1582  				break
  1583  			}
  1584  
  1585  			if s.writeNext == seg {
  1586  				s.updateWriteNext(seg.Next())
  1587  			}
  1588  
  1589  			// Update the RACK fields if SACK is enabled.
  1590  			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1591  				s.rc.update(seg, rcvdSeg)
  1592  				s.rc.detectReorder(seg)
  1593  			}
  1594  
  1595  			s.writeList.Remove(seg)
  1596  
  1597  			// If SACK is enabled then only reduce outstanding if
  1598  			// the segment was not previously SACKED as these have
  1599  			// already been accounted for in SetPipe().
  1600  			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
  1601  				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
  1602  			} else {
  1603  				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
  1604  			}
  1605  			seg.DecRef()
  1606  			ackLeft -= datalen
  1607  		}
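        		// For illustration, a hypothetical partial ack, assuming pCount rounds
        		// the payload up to whole MaxPayloadSize-sized packets: with a single
        		// 1000 byte segment outstanding, MaxPayloadSize of 500, and an ack
        		// covering only 600 bytes, datalen (1000) exceeds ackLeft (600), so the
        		// loop above trims the first 600 bytes, advances the sequence number by
        		// 600, and leaves the segment on the write list.
        		//
        		//	// Sketch (hypothetical sizes):
        		//	before := (1000-1)/500 + 1 // pCount before the trim: 2
        		//	after := (400-1)/500 + 1   // pCount after trimming 600 bytes: 1
        		//	// Outstanding -= before - after, i.e. Outstanding -= 1
        		//	_, _ = before, after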
  1608  
  1609  		// Clear SACK information for all acked data.
  1610  		s.ep.scoreboard.Delete(s.SndUna)
  1611  
  1612  		// Detect if the sender entered recovery spuriously.
  1613  		if s.inRecovery() {
  1614  			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
  1615  		}
  1616  
  1617  		// If we are not in fast recovery then update the congestion
  1618  		// window based on the number of acknowledged packets.
  1619  		if !s.FastRecovery.Active {
  1620  			s.cc.Update(originalOutstanding-s.Outstanding, bestRTT)
  1621  			if s.FastRecovery.Last.LessThan(s.SndUna) {
  1622  				s.state = tcpip.Open
  1623  				// Update RACK when we are exiting fast or RTO
  1624  				// recovery as described in the RFC
  1625  				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
  1626  				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1627  					s.rc.exitRecovery()
  1628  				}
  1629  				s.reorderTimer.disable()
  1630  			}
  1631  		}
  1632  
  1633  		// Update the send buffer usage and notify potential waiters.
  1634  		s.ep.updateSndBufferUsage(int(acked))
  1635  
  1636  		// It is possible for s.Outstanding to drop below zero if we get
  1637  		// a retransmit timeout, reset Outstanding to zero, but later
  1638  		// get an ack that covers previously sent data.
  1639  		if s.Outstanding < 0 {
  1640  			s.Outstanding = 0
  1641  		}
  1642  
  1643  		s.SetPipe()
  1644  
  1645  		// If all outstanding data was acknowledged then disable the timer.
  1646  		// RFC 6298 Rule 5.3
  1647  		if s.SndUna == s.SndNxt {
  1648  			s.Outstanding = 0
  1649  			// Reset firstRetransmittedSegXmitTime to the zero value.
  1650  			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
  1651  			s.resendTimer.disable()
  1652  			s.probeTimer.disable()
  1653  		}
  1654  	}
  1655  
  1656  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1657  		// Update RACK reorder window.
  1658  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1659  		//	* Upon receiving an ACK:
  1660  		//	* Step 4: Update RACK reordering window
  1661  		s.rc.updateRACKReorderWindow()
  1662  
  1663  		// After the reorder window is calculated, detect any loss by checking
  1664  		// if the time elapsed after the segments are sent is greater than the
  1665  		// reorder window.
  1666  		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
  1667  			// If any segment is marked as lost by
  1668  			// RACK, enter recovery and retransmit
  1669  			// the lost segments.
  1670  			s.cc.HandleLossDetected()
  1671  			s.enterRecovery()
  1672  			fastRetransmit = true
  1673  		}
  1674  
  1675  		if s.FastRecovery.Active {
  1676  			s.rc.DoRecovery(nil, fastRetransmit)
  1677  		}
  1678  	}
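        	// For illustration of the RACK step above, with hypothetical timings: if
        	// the computed reorder window is 10ms and an un-SACKed segment was sent
        	// well over an RTT plus 10ms before rcvdSeg.rcvdTime while later data has
        	// already been SACKed, detectLoss counts it as lost, the sender enters
        	// recovery, and the lost data is retransmitted via DoRecovery.
        	//
        	//	// Sketch of the draft's timeout test (values hypothetical):
        	//	//   now - seg.xmitTime > RACK.rtt + reorderWindow  =>  mark seg lost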
  1679  
  1680  	// Now that we've popped all acknowledged data from the retransmit
  1681  	// queue, retransmit if needed.
  1682  	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
  1683  		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
  1684  		// When SACK is enabled, data sending is governed by
  1685  		// RFC 6675 Section 5 recovery steps A-C.
  1686  		// See: https://tools.ietf.org/html/rfc6675#section-5.
  1687  		if s.ep.SACKPermitted {
  1688  			return
  1689  		}
  1690  	}
  1691  
  1692  	// Send more data now that some of the pending data has been ack'd, or
  1693  	// the window opened up, or the congestion window was inflated due
  1694  	// to a duplicate ack during fast recovery. This will also re-enable
  1695  	// the retransmit timer if needed.
  1696  	s.sendData()
  1697  }
  1698  
  1699  // sendSegment sends the specified segment.
  1700  // +checklocks:s.ep.mu
  1701  func (s *sender) sendSegment(seg *segment) tcpip.Error {
  1702  	if seg.xmitCount > 0 {
  1703  		s.ep.stack.Stats().TCP.Retransmits.Increment()
  1704  		s.ep.stats.SendErrors.Retransmits.Increment()
  1705  		if s.SndCwnd < s.Ssthresh {
  1706  			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
  1707  		}
  1708  	}
  1709  	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
  1710  	seg.xmitCount++
  1711  	seg.lost = false
  1712  
  1713  	err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber)
  1714  
  1715  	// Every time a packet containing data is sent (including a
  1716  	// retransmission), if SACK is enabled and we are retransmitting data
  1717  	// then use the conservative timer described in RFC6675 Section 6.0,
  1718  	// otherwise follow the standard timer described in RFC6298 Section 5.1.
  1719  	if err != nil && seg.payloadSize() != 0 {
  1720  		if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted {
  1721  			s.resendTimer.enable(s.RTO)
  1722  		} else {
  1723  			if !s.resendTimer.enabled() {
  1724  				s.resendTimer.enable(s.RTO)
  1725  			}
  1726  		}
  1727  	}
  1728  
  1729  	return err
  1730  }
  1731  
  1732  // sendSegmentFromPacketBuffer sends a new segment containing the given payload,
  1733  // flags and sequence number.
  1734  // +checklocks:s.ep.mu
  1735  // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
  1736  func (s *sender) sendSegmentFromPacketBuffer(pkt *stack.PacketBuffer, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1737  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1738  	if seq == s.RTTMeasureSeqNum {
  1739  		s.RTTMeasureTime = s.LastSendTime
  1740  	}
  1741  
  1742  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1743  
  1744  	// Remember the max sent ack.
  1745  	s.MaxSentAck = rcvNxt
  1746  
  1747  	// We need to clone the packet because sendRaw takes ownership of pkt,
  1748  	// and pkt could be reprocessed later on (i.e. retransmission).
  1749  	pkt = pkt.Clone()
  1750  	defer pkt.DecRef()
  1751  
  1752  	return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd)
  1753  }
  1754  
  1755  // sendEmptySegment sends a new empty segment with the given flags and sequence number.
  1756  // +checklocks:s.ep.mu
  1757  // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
  1758  func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1759  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1760  	if seq == s.RTTMeasureSeqNum {
  1761  		s.RTTMeasureTime = s.LastSendTime
  1762  	}
  1763  
  1764  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1765  
  1766  	// Remember the max sent ack.
  1767  	s.MaxSentAck = rcvNxt
  1768  
  1769  	return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd)
  1770  }
  1771  
  1772  // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
  1773  // currently.
  1774  // +checklocks:s.ep.mu
  1775  func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
  1776  	// Data packets are unlikely to be part of an ACK loop. So always send
  1777  	// an ACK for a packet w/ data.
  1778  	if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
  1779  		s.sendAck()
  1780  	}
  1781  }
  1782  
  1783  func (s *sender) updateWriteNext(seg *segment) {
  1784  	if s.writeNext != nil {
  1785  		s.writeNext.DecRef()
  1786  	}
  1787  	if seg != nil {
  1788  		seg.IncRef()
  1789  	}
  1790  	s.writeNext = seg
  1791  }
  1792  
  1793  // corkTimerExpired drains all the segments when TCP_CORK is enabled.
  1794  // +checklocks:s.ep.mu
  1795  func (s *sender) corkTimerExpired() tcpip.Error {
  1796  	// Check if the timer actually expired or if it's a spurious wake due
  1797  	// to a previously orphaned runtime timer.
  1798  	if s.corkTimer.isUninitialized() || !s.corkTimer.checkExpiration() {
  1799  		return nil
  1800  	}
  1801  
  1802  	// Assign sequence number and flags to the segment.
  1803  	seg := s.writeNext
  1804  	if seg == nil {
  1805  		return nil
  1806  	}
  1807  	seg.sequenceNumber = s.SndNxt
  1808  	seg.flags = header.TCPFlagAck | header.TCPFlagPsh
  1809  	// Drain all the segments.
  1810  	s.sendData()
  1811  	return nil
  1812  }