github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/transport/tcp/snd.go

github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/transport/tcp/snd.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"sort"
    21  	"time"
    22  
    23  	"github.com/metacubex/gvisor/pkg/sync"
    24  	"github.com/metacubex/gvisor/pkg/tcpip"
    25  	"github.com/metacubex/gvisor/pkg/tcpip/header"
    26  	"github.com/metacubex/gvisor/pkg/tcpip/seqnum"
    27  	"github.com/metacubex/gvisor/pkg/tcpip/stack"
    28  )
    29  
const (
	// MinRTO is the minimum allowed value for the retransmit timeout. It
	// is also the duration used when arming the TCP_CORK drain timer.
	MinRTO = 200 * time.Millisecond

	// MaxRTO is the maximum allowed value for the retransmit timeout.
	MaxRTO = 120 * time.Second

	// MinSRTT is the minimum allowed value for smoothed RTT. updateRTO
	// raises any smaller smoothed RTT to this value.
	MinSRTT = 1 * time.Millisecond

	// InitialCwnd is the initial congestion window, in packets.
	InitialCwnd = 10

	// nDupAckThreshold is the number of duplicate ACKs required before
	// fast-retransmit is entered.
	nDupAckThreshold = 3

	// MaxRetries is the maximum number of probe retries the sender does
	// before timing out the connection. It matches the Linux default for
	// TCP_RETR2 (net.ipv4.tcp_retries2).
	MaxRetries = 15
)
    52  
// congestionControl is an interface that must be implemented by any supported
// congestion control algorithm. The sender invokes these hooks at the points
// described on each method.
type congestionControl interface {
	// HandleLossDetected is invoked when loss is detected by RACK or when
	// sender.dupAckCount >= nDupAckThreshold, just before entering fast
	// retransmit.
	HandleLossDetected()

	// HandleRTOExpired is invoked when the retransmit timer expires.
	HandleRTOExpired()

	// Update is invoked when processing inbound acks. It is passed the
	// number of packets that were acked by the most recent cumulative
	// acknowledgement.
	Update(packetsAcked int)

	// PostRecovery is invoked when the sender is exiting a fast retransmit/
	// recovery phase. This provides congestion control algorithms a way
	// to adjust their state when exiting recovery.
	PostRecovery()
}
    74  
// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm. The implementation in use is selected by
// sender.initLossRecovery.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted. rcvdSeg is the cumulative or selective ACK that
	// triggered the call. fastRetransmit reports whether the connection
	// entered fast retransmit with this ACK, in which case the first
	// unacknowledged segment is to be retransmitted.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}
    85  
// sender holds the state necessary to send TCP segments.
//
// +stateify savable
type sender struct {
	stack.TCPSenderState

	// ep is the endpoint this sender transmits on behalf of.
	ep *Endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeList holds the queued segments; its front is the first
	// unacknowledged segment. writeNext is the next segment in writeList to
	// be handed to the send path. resendTimer is the retransmit timer whose
	// expiry is handled by retransmitTimerExpired.
	writeNext   *segment
	writeList   segmentList
	resendTimer timer `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer `state:"nosave"`

	// probeTimer is used to schedule PTO for RACK TLP algorithm.
	probeTimer timer `state:"nosave"`

	// spuriousRecovery indicates whether the sender entered recovery
	// spuriously as described in RFC3522 Section 3.2.
	spuriousRecovery bool

	// retransmitTS is the timestamp at which the sender sends retransmitted
	// segment after entering an RTO for the first time as described in
	// RFC3522 Section 3.2.
	retransmitTS uint32

	// startCork is set while segments are being held back because the
	// TCP_CORK option is enabled and corkTimer has been armed.
	startCork bool

	// corkTimer is used to drain the segments which are held when TCP_CORK
	// option is enabled.
	corkTimer timer `state:"nosave"`
}
   162  
// rtt is a synchronization wrapper used to appease stateify. It pairs the
// round-trip time estimator state with the mutex that sender.updateRTO holds
// while modifying that state. See the comment in sender, where it is used.
//
// +stateify savable
type rtt struct {
	sync.Mutex `state:"nosave"`

	stack.TCPRTTState
}
   172  
   173  // +checklocks:ep.mu
   174  func newSender(ep *Endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
   175  	// The sender MUST reduce the TCP data length to account for any IP or
   176  	// TCP options that it is including in the packets that it sends.
   177  	// See: https://tools.ietf.org/html/rfc6691#section-2
   178  	maxPayloadSize := int(mss) - ep.maxOptionSize()
   179  
   180  	s := &sender{
   181  		ep: ep,
   182  		TCPSenderState: stack.TCPSenderState{
   183  			SndWnd:           sndWnd,
   184  			SndUna:           iss + 1,
   185  			SndNxt:           iss + 1,
   186  			RTTMeasureSeqNum: iss + 1,
   187  			LastSendTime:     ep.stack.Clock().NowMonotonic(),
   188  			MaxPayloadSize:   maxPayloadSize,
   189  			MaxSentAck:       irs + 1,
   190  			FastRecovery: stack.TCPFastRecoveryState{
   191  				// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
   192  				Last:      iss,
   193  				HighRxt:   iss,
   194  				RescueRxt: iss,
   195  			},
   196  			RTO: 1 * time.Second,
   197  		},
   198  		gso: ep.gso.Type != stack.GSONone,
   199  	}
   200  
   201  	if s.gso {
   202  		s.ep.gso.MSS = uint16(maxPayloadSize)
   203  	}
   204  
   205  	s.cc = s.initCongestionControl(ep.cc)
   206  	s.lr = s.initLossRecovery()
   207  	s.rc.init(s, iss)
   208  
   209  	// A negative sndWndScale means that no scaling is in use, otherwise we
   210  	// store the scaling value.
   211  	if sndWndScale > 0 {
   212  		s.SndWndScale = uint8(sndWndScale)
   213  	}
   214  
   215  	s.resendTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.retransmitTimerExpired))
   216  	s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired))
   217  	s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired))
   218  	s.corkTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.corkTimerExpired))
   219  
   220  	s.ep.AssertLockHeld(ep)
   221  	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
   222  	// Initialize SACK Scoreboard after updating max payload size as we use
   223  	// the maxPayloadSize as the smss when determining if a segment is lost
   224  	// etc.
   225  	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)
   226  
   227  	// Get Stack wide config.
   228  	var minRTO tcpip.TCPMinRTOOption
   229  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
   230  		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
   231  	}
   232  	s.minRTO = time.Duration(minRTO)
   233  
   234  	var maxRTO tcpip.TCPMaxRTOOption
   235  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
   236  		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
   237  	}
   238  	s.maxRTO = time.Duration(maxRTO)
   239  
   240  	var maxRetries tcpip.TCPMaxRetriesOption
   241  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
   242  		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
   243  	}
   244  	s.maxRetries = uint32(maxRetries)
   245  
   246  	return s
   247  }
   248  
   249  // initCongestionControl initializes the specified congestion control module and
   250  // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to
   251  // their initial values.
   252  func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
   253  	s.SndCwnd = InitialCwnd
   254  	// Set sndSsthresh to the maximum int value, which depends on the
   255  	// platform.
   256  	s.Ssthresh = int(^uint(0) >> 1)
   257  
   258  	switch congestionControlName {
   259  	case ccCubic:
   260  		return newCubicCC(s)
   261  	case ccReno:
   262  		fallthrough
   263  	default:
   264  		return newRenoCC(s)
   265  	}
   266  }
   267  
   268  // initLossRecovery initiates the loss recovery algorithm for the sender.
   269  func (s *sender) initLossRecovery() lossRecovery {
   270  	if s.ep.SACKPermitted {
   271  		return newSACKRecovery(s)
   272  	}
   273  	return newRenoRecovery(s)
   274  }
   275  
   276  // updateMaxPayloadSize updates the maximum payload size based on the given
   277  // MTU. If this is in response to "packet too big" control packets (indicated
   278  // by the count argument), it also reduces the number of outstanding packets and
   279  // attempts to retransmit the first packet above the MTU size.
   280  // +checklocks:s.ep.mu
   281  func (s *sender) updateMaxPayloadSize(mtu, count int) {
   282  	m := mtu - header.TCPMinimumSize
   283  
   284  	m -= s.ep.maxOptionSize()
   285  
   286  	// We don't adjust up for now.
   287  	if m >= s.MaxPayloadSize {
   288  		return
   289  	}
   290  
   291  	// Make sure we can transmit at least one byte.
   292  	if m <= 0 {
   293  		m = 1
   294  	}
   295  
   296  	oldMSS := s.MaxPayloadSize
   297  	s.MaxPayloadSize = m
   298  	if s.gso {
   299  		s.ep.gso.MSS = uint16(m)
   300  	}
   301  
   302  	if count == 0 {
   303  		// updateMaxPayloadSize is also called when the sender is created.
   304  		// and there is no data to send in such cases. Return immediately.
   305  		return
   306  	}
   307  
   308  	// Update the scoreboard's smss to reflect the new lowered
   309  	// maxPayloadSize.
   310  	s.ep.scoreboard.smss = uint16(m)
   311  
   312  	s.Outstanding -= count
   313  	if s.Outstanding < 0 {
   314  		s.Outstanding = 0
   315  	}
   316  
   317  	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
   318  	// if it is already before such a packet.
   319  	nextSeg := s.writeNext
   320  	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
   321  		if seg == s.writeNext {
   322  			// We got to writeNext before we could find a segment
   323  			// exceeding the MTU.
   324  			break
   325  		}
   326  
   327  		if nextSeg == s.writeNext && seg.payloadSize() > m {
   328  			// We found a segment exceeding the MTU. Rewind
   329  			// writeNext and try to retransmit it.
   330  			nextSeg = seg
   331  		}
   332  
   333  		if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   334  			// Update sackedOut for new maximum payload size.
   335  			s.SackedOut -= s.pCount(seg, oldMSS)
   336  			s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
   337  		}
   338  	}
   339  
   340  	// Since we likely reduced the number of outstanding packets, we may be
   341  	// ready to send some more.
   342  	s.updateWriteNext(nextSeg)
   343  	s.sendData()
   344  }
   345  
   346  // sendAck sends an ACK segment.
   347  // +checklocks:s.ep.mu
   348  func (s *sender) sendAck() {
   349  	s.sendEmptySegment(header.TCPFlagAck, s.SndNxt)
   350  }
   351  
   352  // updateRTO updates the retransmit timeout when a new roud-trip time is
   353  // available. This is done in accordance with section 2 of RFC 6298.
   354  func (s *sender) updateRTO(rtt time.Duration) {
   355  	s.rtt.Lock()
   356  	if !s.rtt.TCPRTTState.SRTTInited {
   357  		s.rtt.TCPRTTState.RTTVar = rtt / 2
   358  		s.rtt.TCPRTTState.SRTT = rtt
   359  		s.rtt.TCPRTTState.SRTTInited = true
   360  	} else {
   361  		diff := s.rtt.TCPRTTState.SRTT - rtt
   362  		if diff < 0 {
   363  			diff = -diff
   364  		}
   365  		// Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when
   366  		// no timestamps are available.
   367  		if !s.ep.SendTSOk {
   368  			s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4
   369  			s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8
   370  		} else {
   371  			// When we are taking RTT measurements of every ACK then
   372  			// we need to use a modified method as specified in
   373  			// https://tools.ietf.org/html/rfc7323#appendix-G
   374  			if s.Outstanding == 0 {
   375  				s.rtt.Unlock()
   376  				return
   377  			}
   378  			// Netstack measures congestion window/inflight all in
   379  			// terms of packets and not bytes. This is similar to
   380  			// how linux also does cwnd and inflight. In practice
   381  			// this approximation works as expected.
   382  			expectedSamples := math.Ceil(float64(s.Outstanding) / 2)
   383  
   384  			// alpha & beta values are the original values as recommended in
   385  			// https://tools.ietf.org/html/rfc6298#section-2.3.
   386  			const alpha = 0.125
   387  			const beta = 0.25
   388  
   389  			alphaPrime := alpha / expectedSamples
   390  			betaPrime := beta / expectedSamples
   391  			rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds()
   392  			srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds()
   393  			s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second))
   394  			s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second))
   395  		}
   396  	}
   397  
   398  	if s.rtt.TCPRTTState.SRTT < MinSRTT {
   399  		s.rtt.TCPRTTState.SRTT = MinSRTT
   400  	}
   401  
   402  	s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar
   403  	s.rtt.Unlock()
   404  	if s.RTO < s.minRTO {
   405  		s.RTO = s.minRTO
   406  	}
   407  	if s.RTO > s.maxRTO {
   408  		s.RTO = s.maxRTO
   409  	}
   410  }
   411  
   412  // resendSegment resends the first unacknowledged segment.
   413  // +checklocks:s.ep.mu
   414  func (s *sender) resendSegment() {
   415  	// Don't use any segments we already sent to measure RTT as they may
   416  	// have been affected by packets being lost.
   417  	s.RTTMeasureSeqNum = s.SndNxt
   418  
   419  	// Resend the segment.
   420  	if seg := s.writeList.Front(); seg != nil {
   421  		if seg.payloadSize() > s.MaxPayloadSize {
   422  			s.splitSeg(seg, s.MaxPayloadSize)
   423  		}
   424  
   425  		// See: RFC 6675 section 5 Step 4.3
   426  		//
   427  		// To prevent retransmission, set both the HighRXT and RescueRXT
   428  		// to the highest sequence number in the retransmitted segment.
   429  		s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1
   430  		s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1
   431  		s.sendSegment(seg)
   432  		s.ep.stack.Stats().TCP.FastRetransmit.Increment()
   433  		s.ep.stats.SendErrors.FastRetransmit.Increment()
   434  
   435  		// Run SetPipe() as per RFC 6675 section 5 Step 4.4
   436  		s.SetPipe()
   437  	}
   438  }
   439  
// retransmitTimerExpired is called when the retransmit timer expires, and
// unacknowledged segments are assumed lost, and thus need to be resent.
//
// It returns nil if the connection is still usable (including when the wakeup
// was spurious or there is nothing queued), or *tcpip.ErrTimeout if the
// connection is deemed lost because the user timeout elapsed, too many zero
// window probes went unacknowledged, or the next segment has been transmitted
// more than maxRetries times.
// +checklocks:s.ep.mu
func (s *sender) retransmitTimerExpired() tcpip.Error {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if s.resendTimer.isUninitialized() || !s.resendTimer.checkExpiration() {
		return nil
	}

	// Initialize the variables used to detect spurious recovery after
	// entering RTO.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
	// when writeList is empty. Remove this once we have a proper fix for this
	// issue.
	if s.writeList.Front() == nil {
		return nil
	}

	s.ep.stack.Stats().TCP.Timeouts.Increment()
	s.ep.stats.SendErrors.Timeouts.Increment()

	// Set TLPRxtOut to false according to
	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
	s.rc.tlpRxtOut = false

	// Give up if a user timeout is set and we have exceeded it since the
	// first retransmission. Without a user timeout the remaining budget is
	// simply maxRTO, which only serves as a cap for the new RTO below.
	uto := s.ep.userTimeout

	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		// We store the original xmitTime of the segment that we are
		// about to retransmit as the retransmission time. This is
		// required as by the time the retransmitTimer has expired the
		// segment has already been sent and unacked for the RTO at the
		// time the segment was sent.
		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
	}

	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
	remaining := s.maxRTO
	if uto != 0 {
		// Cap to the user specified timeout if one is specified.
		remaining = uto - elapsed
	}

	// Always honor the user-timeout irrespective of whether the zero
	// window probes were acknowledged.
	// net/ipv4/tcp_timer.c::tcp_probe_timer()
	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	// Set new timeout (exponential backoff). The timer will be restarted by
	// the call to sendData below.
	s.RTO *= 2
	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
	if s.RTO > s.maxRTO {
		s.RTO = s.maxRTO
	}

	// Cap RTO to remaining time.
	if s.RTO > remaining {
		s.RTO = remaining
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
	//
	// Retransmit timeouts:
	//     After a retransmit timeout, record the highest sequence number
	//     transmitted in the variable recover, and exit the fast recovery
	//     procedure if applicable.
	s.FastRecovery.Last = s.SndNxt - 1

	if s.FastRecovery.Active {
		// We were attempting fast recovery but were not successful.
		// Leave the state. We don't need to update ssthresh because it
		// has already been updated when entered fast-recovery.
		s.leaveRecovery()
	}

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	s.recordRetransmitTS()

	s.state = tcpip.RTORecovery
	s.cc.HandleRTOExpired()

	// Mark the next segment to be sent as the first unacknowledged one and
	// start sending again. Set the number of outstanding packets to 0 so
	// that we'll be able to retransmit.
	//
	// We'll keep on transmitting (or retransmitting) as we get acks for
	// the data we transmit.
	s.Outstanding = 0

	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
	//
	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
	//  discard data that has already been selectively acknowledged. As a
	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
	//  information gathered from a receiver upon a retransmission timeout
	//  (RTO) "since the timeout might indicate that the data receiver has
	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
	//  information in determining which data to retransmit."
	//
	// NOTE: We take the stricter interpretation and just expunge all
	// information as we lack more rigorous checks to validate if the SACK
	// information is usable after an RTO.
	s.ep.scoreboard.Reset()
	s.updateWriteNext(s.writeList.Front())

	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
	// zero receive window after retransmission interval and we have data to
	// send.
	if s.zeroWindowProbing {
		s.sendZeroWindowProbe()
		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
		// indefinitely.  As long as the receiving TCP continues to send
		// acknowledgments in response to the probe segments, the sending TCP
		// MUST allow the connection to stay open.
		return nil
	}

	// writeNext was just rewound to the front of the write list above, so
	// seg is the first unacknowledged segment.
	seg := s.writeNext
	// RFC 1122 4.2.3.5: Close the connection when the number of
	// retransmissions for this segment is beyond a limit.
	if seg != nil && seg.xmitCount > s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	s.sendData()

	return nil
}
   585  
   586  // pCount returns the number of packets in the segment. Due to GSO, a segment
   587  // can be composed of multiple packets.
   588  func (s *sender) pCount(seg *segment, maxPayloadSize int) int {
   589  	size := seg.payloadSize()
   590  	if size == 0 {
   591  		return 1
   592  	}
   593  
   594  	return (size-1)/maxPayloadSize + 1
   595  }
   596  
   597  // splitSeg splits a given segment at the size specified and inserts the
   598  // remainder as a new segment after the current one in the write list.
   599  func (s *sender) splitSeg(seg *segment, size int) {
   600  	if seg.payloadSize() <= size {
   601  		return
   602  	}
   603  	// Split this segment up.
   604  	nSeg := seg.clone()
   605  	nSeg.pkt.Data().TrimFront(size)
   606  	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
   607  	s.writeList.InsertAfter(seg, nSeg)
   608  
   609  	// The segment being split does not carry PUSH flag because it is
   610  	// followed by the newly split segment.
   611  	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
   612  	// segment (i.e., when there is no more queued data to be sent).
   613  	// Linux removes PSH flag only when the segment is being split over MSS
   614  	// and retains it when we are splitting the segment over lack of sender
   615  	// window space.
   616  	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
   617  	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
   618  	if seg.payloadSize() > s.MaxPayloadSize {
   619  		seg.flags ^= header.TCPFlagPsh
   620  	}
   621  	seg.pkt.Data().CapLength(size)
   622  }
   623  
// NextSeg implements the RFC6675 NextSeg() operation.
//
// NextSeg starts scanning the writeList starting from nextSegHint and returns
// the hint to be passed on the next call to NextSeg. This is required to avoid
// iterating the write list repeatedly when NextSeg is invoked in a loop during
// recovery. The returned hint will be nil if there are no more segments that
// can match rules defined by NextSeg operation in RFC6675.
//
// rescueRtx will be true only if nextSeg is a rescue retransmission as
// described by Step 4) of the NextSeg algorithm. Note that when no rule
// matches at all, the result is (nil, nil, true): nextSeg is nil even though
// rescueRtx is set.
//
// As a side effect, already-transmitted segments larger than the scoreboard's
// SMSS that are visited during the scan are split via splitSeg.
func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
	// s3 and s4 remember the candidates for rules (3) and (4) found while
	// scanning for a rule (1) match.
	var s3 *segment
	var s4 *segment
	// Step 1.
	for seg := nextSegHint; seg != nil; seg = seg.Next() {
		// Stop iteration if we hit a segment that has never been
		// transmitted (i.e. either it has no assigned sequence number
		// or if it does have one, it's >= the next sequence number
		// to be sent [i.e. >= s.sndNxt]).
		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
			hint = nil
			break
		}
		segSeq := seg.sequenceNumber
		if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) {
			s.splitSeg(seg, int(smss))
		}

		// See RFC 6675 Section 4
		//
		//     1. If there exists a smallest unSACKED sequence number
		//     'S2' that meets the following 3 criteria for determining
		//     loss, the sequence range of one segment of up to SMSS
		//     octets starting with S2 MUST be returned.
		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
			// NextSeg():
			//
			//    (1.a) S2 is greater than HighRxt
			//    (1.b) S2 is less than highest octet covered by
			//    any received SACK.
			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
				// NextSeg():
				//     (1.c) IsLost(S2) returns true.
				if s.ep.scoreboard.IsLost(segSeq) {
					return seg, seg.Next(), false
				}

				// NextSeg():
				//
				// (3): If the conditions for rules (1) and (2)
				// fail, but there exists an unSACKed sequence
				// number S3 that meets the criteria for
				// detecting loss given in steps 1.a and 1.b
				// above (specifically excluding (1.c)) then one
				// segment of up to SMSS octets starting with S3
				// SHOULD be returned.
				//
				// Only the first such segment is remembered.
				if s3 == nil {
					s3 = seg
					hint = seg.Next()
				}
			}
			// NextSeg():
			//
			//     (4) If the conditions for (1), (2) and (3) fail,
			//     but there exists outstanding unSACKED data, we
			//     provide the opportunity for a single "rescue"
			//     retransmission per entry into loss recovery. If
			//     HighACK is greater than RescueRxt (or RescueRxt
			//     is undefined), then one segment of up to SMSS
			//     octets that MUST include the highest outstanding
			//     unSACKed sequence number SHOULD be returned, and
			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
			//     be updated.
			//
			// s4 tracks the unSACKed segment with the highest
			// sequence number seen so far.
			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
				if s4 != nil {
					if s4.sequenceNumber.LessThan(segSeq) {
						s4 = seg
					}
				} else {
					s4 = seg
				}
			}
		}
	}

	// If we got here then no segment matched step (1).
	// Step (2): "If no sequence number 'S2' per rule (1)
	// exists but there exists available unsent data and the
	// receiver's advertised window allows, the sequence
	// range of one segment of up to SMSS octets of
	// previously unsent data starting with sequence number
	// HighData+1 MUST be returned."
	for seg := s.writeNext; seg != nil; seg = seg.Next() {
		// Skip anything that has already been transmitted.
		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
			continue
		}
		// We do not split the segment here to <= smss as it has
		// potentially not been assigned a sequence number yet.
		return seg, nil, false
	}

	// Step (3): fall back to the candidate recorded during the scan.
	if s3 != nil {
		return s3, hint, false
	}

	// Step (4): rescue retransmission candidate, which may be nil.
	return s4, nil, true
}
   731  
   732  // maybeSendSegment tries to send the specified segment and either coalesces
   733  // other segments into this one or splits the specified segment based on the
   734  // lower of the specified limit value or the receivers window size specified by
   735  // end.
   736  // +checklocks:s.ep.mu
   737  func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
   738  	// We abuse the flags field to determine if we have already
   739  	// assigned a sequence number to this segment.
   740  	if !s.isAssignedSequenceNumber(seg) {
   741  		// Merge segments if allowed.
   742  		if seg.payloadSize() != 0 {
   743  			available := int(s.SndNxt.Size(end))
   744  			if available > limit {
   745  				available = limit
   746  			}
   747  
   748  			// nextTooBig indicates that the next segment was too
   749  			// large to entirely fit in the current segment. It
   750  			// would be possible to split the next segment and merge
   751  			// the portion that fits, but unexpectedly splitting
   752  			// segments can have user visible side-effects which can
   753  			// break applications. For example, RFC 7766 section 8
   754  			// says that the length and data of a DNS response
   755  			// should be sent in the same TCP segment to avoid
   756  			// triggering bugs in poorly written DNS
   757  			// implementations.
   758  			var nextTooBig bool
   759  			for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() {
   760  				if seg.payloadSize()+nSeg.payloadSize() > available {
   761  					nextTooBig = true
   762  					break
   763  				}
   764  				seg.merge(nSeg)
   765  				s.writeList.Remove(nSeg)
   766  				nSeg.DecRef()
   767  			}
   768  			if !nextTooBig && seg.payloadSize() < available {
   769  				// Segment is not full.
   770  				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
   771  					// Nagle's algorithm. From Wikipedia:
   772  					//   Nagle's algorithm works by
   773  					//   combining a number of small
   774  					//   outgoing messages and sending them
   775  					//   all at once. Specifically, as long
   776  					//   as there is a sent packet for which
   777  					//   the sender has received no
   778  					//   acknowledgment, the sender should
   779  					//   keep buffering its output until it
   780  					//   has a full packet's worth of
   781  					//   output, thus allowing output to be
   782  					//   sent all at once.
   783  					return false
   784  				}
   785  				// With TCP_CORK, hold back until minimum of the available
   786  				// send space and MSS.
   787  				if s.ep.ops.GetCorkOption() {
   788  					if seg.payloadSize() < s.MaxPayloadSize {
   789  						if !s.startCork {
   790  							s.startCork = true
   791  							// Enable the timer for
   792  							// 200ms, after which
   793  							// the segments are drained.
   794  							s.corkTimer.enable(MinRTO)
   795  						}
   796  						return false
   797  					}
   798  					// Disable the TCP_CORK timer.
   799  					s.startCork = false
   800  					s.corkTimer.disable()
   801  				}
   802  			}
   803  		}
   804  
   805  		// Assign flags. We don't do it above so that we can merge
   806  		// additional data if Nagle holds the segment.
   807  		seg.sequenceNumber = s.SndNxt
   808  		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
   809  	}
   810  
   811  	var segEnd seqnum.Value
   812  	if seg.payloadSize() == 0 {
   813  		if s.writeList.Back() != seg {
   814  			panic("FIN segments must be the final segment in the write list.")
   815  		}
   816  		seg.flags = header.TCPFlagAck | header.TCPFlagFin
   817  		segEnd = seg.sequenceNumber.Add(1)
   818  		// Update the state to reflect that we have now
   819  		// queued a FIN.
   820  		s.ep.updateConnDirectionState(connDirectionStateSndClosed)
   821  		switch s.ep.EndpointState() {
   822  		case StateCloseWait:
   823  			s.ep.setEndpointState(StateLastAck)
   824  		default:
   825  			s.ep.setEndpointState(StateFinWait1)
   826  		}
   827  	} else {
   828  		// We're sending a non-FIN segment.
   829  		if seg.flags&header.TCPFlagFin != 0 {
   830  			panic("Netstack queues FIN segments without data.")
   831  		}
   832  
   833  		if !seg.sequenceNumber.LessThan(end) {
   834  			return false
   835  		}
   836  
   837  		available := int(seg.sequenceNumber.Size(end))
   838  		if available == 0 {
   839  			return false
   840  		}
   841  
   842  		// If the whole segment or at least 1MSS sized segment cannot
   843  		// be accommodated in the receiver advertised window, skip
   844  		// splitting and sending of the segment. ref:
   845  		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
   846  		//
   847  		// Linux checks this for all segment transmits not triggered by
   848  		// a probe timer. On this condition, it defers the segment split
   849  		// and transmit to a short probe timer.
   850  		//
   851  		// ref: include/net/tcp.h::tcp_check_probe_timer()
   852  		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
   853  		//
   854  		// Instead of defining a new transmit timer, we attempt to split
   855  		// the segment right here if there are no pending segments. If
   856  		// there are pending segments, segment transmits are deferred to
   857  		// the retransmit timer handler.
   858  		if s.SndUna != s.SndNxt {
   859  			switch {
   860  			case available >= seg.payloadSize():
   861  				// OK to send, the whole segments fits in the
   862  				// receiver's advertised window.
   863  			case available >= s.MaxPayloadSize:
   864  				// OK to send, at least 1 MSS sized segment fits
   865  				// in the receiver's advertised window.
   866  			default:
   867  				return false
   868  			}
   869  		}
   870  
   871  		// The segment size limit is computed as a function of sender
   872  		// congestion window and MSS. When sender congestion window is >
   873  		// 1, this limit can be larger than MSS. Ensure that the
   874  		// currently available send space is not greater than minimum of
   875  		// this limit and MSS.
   876  		if available > limit {
   877  			available = limit
   878  		}
   879  
   880  		// If GSO is not in use then cap available to
   881  		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
   882  		// the host GSO logic will cap the segment to the correct size.
   883  		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
   884  			available = s.MaxPayloadSize
   885  		}
   886  
   887  		if seg.payloadSize() > available {
   888  			// A negative value causes splitSeg to panic anyways, so just panic
   889  			// earlier to get more information about the cause.
   890  			s.splitSeg(seg, available)
   891  		}
   892  
   893  		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
   894  	}
   895  
   896  	s.sendSegment(seg)
   897  
   898  	// Update sndNxt if we actually sent new data (as opposed to
   899  	// retransmitting some previously sent data).
   900  	if s.SndNxt.LessThan(segEnd) {
   901  		s.SndNxt = segEnd
   902  	}
   903  
   904  	return true
   905  }
   906  
   907  // +checklocks:s.ep.mu
   908  func (s *sender) sendZeroWindowProbe() {
   909  	s.unackZeroWindowProbes++
   910  	// Send a zero window probe with sequence number pointing to
   911  	// the last acknowledged byte.
   912  	s.sendEmptySegment(header.TCPFlagAck, s.SndUna-1)
   913  	// Rearm the timer to continue probing.
   914  	s.resendTimer.enable(s.RTO)
   915  }
   916  
   917  func (s *sender) enableZeroWindowProbing() {
   918  	s.zeroWindowProbing = true
   919  	// We piggyback the probing on the retransmit timer with the
   920  	// current retranmission interval, as we may start probing while
   921  	// segment retransmissions.
   922  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
   923  		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
   924  	}
   925  	s.resendTimer.enable(s.RTO)
   926  }
   927  
   928  func (s *sender) disableZeroWindowProbing() {
   929  	s.zeroWindowProbing = false
   930  	s.unackZeroWindowProbes = 0
   931  	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
   932  	s.resendTimer.disable()
   933  }
   934  
   935  func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
   936  	if dataSent {
   937  		// We sent data, so we should stop the keepalive timer to ensure
   938  		// that no keepalives are sent while there is pending data.
   939  		s.ep.disableKeepaliveTimer()
   940  	}
   941  
   942  	// If the sender has advertised zero receive window and we have
   943  	// data to be sent out, start zero window probing to query the
   944  	// the remote for it's receive window size.
   945  	if s.writeNext != nil && s.SndWnd == 0 {
   946  		s.enableZeroWindowProbing()
   947  	}
   948  
   949  	// If we have no more pending data, start the keepalive timer.
   950  	if s.SndUna == s.SndNxt {
   951  		s.ep.resetKeepaliveTimer(false)
   952  	} else {
   953  		// Enable timers if we have pending data.
   954  		if shouldScheduleProbe && s.shouldSchedulePTO() {
   955  			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
   956  			s.schedulePTO()
   957  		} else if !s.resendTimer.enabled() {
   958  			s.probeTimer.disable()
   959  			if s.Outstanding > 0 {
   960  				// Enable the resend timer if it's not enabled yet and there is
   961  				// outstanding data.
   962  				s.resendTimer.enable(s.RTO)
   963  			}
   964  		}
   965  	}
   966  }
   967  
   968  // sendData sends new data segments. It is called when data becomes available or
   969  // when the send window opens up.
   970  // +checklocks:s.ep.mu
   971  func (s *sender) sendData() {
   972  	limit := s.MaxPayloadSize
   973  	if s.gso {
   974  		limit = int(s.ep.gso.MaxSize - header.TCPTotalHeaderMaximumSize - 1)
   975  	}
   976  	end := s.SndUna.Add(s.SndWnd)
   977  
   978  	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
   979  	// "A TCP SHOULD set cwnd to no more than RW before beginning
   980  	// transmission if the TCP has not sent data in the interval exceeding
   981  	// the retrasmission timeout."
   982  	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
   983  		if s.SndCwnd > InitialCwnd {
   984  			s.SndCwnd = InitialCwnd
   985  		}
   986  	}
   987  
   988  	var dataSent bool
   989  	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
   990  		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
   991  		if cwndLimit > 0 && cwndLimit < limit {
   992  			limit = cwndLimit
   993  		}
   994  		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   995  			// Move writeNext along so that we don't try and scan data that
   996  			// has already been SACKED.
   997  			s.updateWriteNext(seg.Next())
   998  			continue
   999  		}
  1000  		if sent := s.maybeSendSegment(seg, limit, end); !sent {
  1001  			break
  1002  		}
  1003  		dataSent = true
  1004  		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
  1005  		s.updateWriteNext(seg.Next())
  1006  	}
  1007  
  1008  	s.postXmit(dataSent, true /* shouldScheduleProbe */)
  1009  }
  1010  
  1011  func (s *sender) enterRecovery() {
  1012  	// Initialize the variables used to detect spurious recovery after
  1013  	// entering recovery.
  1014  	//
  1015  	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
  1016  	s.spuriousRecovery = false
  1017  	s.retransmitTS = 0
  1018  
  1019  	s.FastRecovery.Active = true
  1020  	// Save state to reflect we're now in fast recovery.
  1021  	//
  1022  	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
  1023  	// We inflate the cwnd by 3 to account for the 3 packets which triggered
  1024  	// the 3 duplicate ACKs and are now not in flight.
  1025  	s.SndCwnd = s.Ssthresh + 3
  1026  	s.SackedOut = 0
  1027  	s.DupAckCount = 0
  1028  	s.FastRecovery.First = s.SndUna
  1029  	s.FastRecovery.Last = s.SndNxt - 1
  1030  	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
  1031  	s.FastRecovery.HighRxt = s.SndUna
  1032  	s.FastRecovery.RescueRxt = s.SndUna
  1033  
  1034  	// Record retransmitTS if the sender is not in recovery as per:
  1035  	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1036  	s.recordRetransmitTS()
  1037  
  1038  	if s.ep.SACKPermitted {
  1039  		s.state = tcpip.SACKRecovery
  1040  		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
  1041  		// Set TLPRxtOut to false according to
  1042  		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
  1043  		if s.rc.tlpRxtOut {
  1044  			// The tail loss probe triggered recovery.
  1045  			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
  1046  		}
  1047  		s.rc.tlpRxtOut = false
  1048  		return
  1049  	}
  1050  	s.state = tcpip.FastRecovery
  1051  	s.ep.stack.Stats().TCP.FastRecovery.Increment()
  1052  }
  1053  
  1054  func (s *sender) leaveRecovery() {
  1055  	s.FastRecovery.Active = false
  1056  	s.FastRecovery.MaxCwnd = 0
  1057  	s.DupAckCount = 0
  1058  
  1059  	// Deflate cwnd. It had been artificially inflated when new dups arrived.
  1060  	s.SndCwnd = s.Ssthresh
  1061  	s.cc.PostRecovery()
  1062  }
  1063  
  1064  // isAssignedSequenceNumber relies on the fact that we only set flags once a
  1065  // sequencenumber is assigned and that is only done right before we send the
  1066  // segment. As a result any segment that has a non-zero flag has a valid
  1067  // sequence number assigned to it.
  1068  func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
  1069  	return seg.flags != 0
  1070  }
  1071  
  1072  // SetPipe implements the SetPipe() function described in RFC6675. Netstack
  1073  // maintains the congestion window in number of packets and not bytes, so
  1074  // SetPipe() here measures number of outstanding packets rather than actual
  1075  // outstanding bytes in the network.
  1076  func (s *sender) SetPipe() {
  1077  	// If SACK isn't permitted or it is permitted but recovery is not active
  1078  	// then ignore pipe calculations.
  1079  	if !s.ep.SACKPermitted || !s.FastRecovery.Active {
  1080  		return
  1081  	}
  1082  	pipe := 0
  1083  	smss := seqnum.Size(s.ep.scoreboard.SMSS())
  1084  	for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
  1085  		// With GSO each segment can be much larger than SMSS. So check the segment
  1086  		// in SMSS sized ranges.
  1087  		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize()))
  1088  		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
  1089  			endSeq := startSeq.Add(smss)
  1090  			if segEnd.LessThan(endSeq) {
  1091  				endSeq = segEnd
  1092  			}
  1093  			sb := header.SACKBlock{Start: startSeq, End: endSeq}
  1094  			// SetPipe():
  1095  			//
  1096  			// After initializing pipe to zero, the following steps are
  1097  			// taken for each octet 'S1' in the sequence space between
  1098  			// HighACK and HighData that has not been SACKed:
  1099  			if !s1.sequenceNumber.LessThan(s.SndNxt) {
  1100  				break
  1101  			}
  1102  			if s.ep.scoreboard.IsSACKED(sb) {
  1103  				continue
  1104  			}
  1105  
  1106  			// SetPipe():
  1107  			//
  1108  			//    (a) If IsLost(S1) returns false, Pipe is incremened by 1.
  1109  			//
  1110  			// NOTE: here we mark the whole segment as lost. We do not try
  1111  			// and test every byte in our write buffer as we maintain our
  1112  			// pipe in terms of outstanding packets and not bytes.
  1113  			if !s.ep.scoreboard.IsRangeLost(sb) {
  1114  				pipe++
  1115  			}
  1116  			// SetPipe():
  1117  			//    (b) If S1 <= HighRxt, Pipe is incremented by 1.
  1118  			if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) {
  1119  				pipe++
  1120  			}
  1121  		}
  1122  	}
  1123  	s.Outstanding = pipe
  1124  }
  1125  
  1126  // shouldEnterRecovery returns true if the sender should enter fast recovery
  1127  // based on dupAck count and sack scoreboard.
  1128  // See RFC 6675 section 5.
  1129  func (s *sender) shouldEnterRecovery() bool {
  1130  	return s.DupAckCount >= nDupAckThreshold ||
  1131  		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
  1132  }
  1133  
  1134  // detectLoss is called when an ack is received and returns whether a loss is
  1135  // detected. It manages the state related to duplicate acks and determines if
  1136  // a retransmit is needed according to the rules in RFC 6582 (NewReno).
  1137  func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
  1138  	// We're not in fast recovery yet.
  1139  
  1140  	// If RACK is enabled and there is no reordering we should honor the
  1141  	// three duplicate ACK rule to enter recovery.
  1142  	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
  1143  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1144  		if s.rc.Reord {
  1145  			return false
  1146  		}
  1147  	}
  1148  
  1149  	if !s.isDupAck(seg) {
  1150  		s.DupAckCount = 0
  1151  		return false
  1152  	}
  1153  
  1154  	s.DupAckCount++
  1155  
  1156  	// Do not enter fast recovery until we reach nDupAckThreshold or the
  1157  	// first unacknowledged byte is considered lost as per SACK scoreboard.
  1158  	if !s.shouldEnterRecovery() {
  1159  		// RFC 6675 Step 3.
  1160  		s.FastRecovery.HighRxt = s.SndUna - 1
  1161  		// Do run SetPipe() to calculate the outstanding segments.
  1162  		s.SetPipe()
  1163  		s.state = tcpip.Disorder
  1164  		return false
  1165  	}
  1166  
  1167  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
  1168  	//
  1169  	// We only do the check here, the incrementing of last to the highest
  1170  	// sequence number transmitted till now is done when enterRecovery
  1171  	// is invoked.
  1172  	//
  1173  	// Note that we only enter recovery when at least one more byte of data
  1174  	// beyond s.fr.last (the highest byte that was outstanding when fast
  1175  	// retransmit was last entered) is acked.
  1176  	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
  1177  		s.DupAckCount = 0
  1178  		return false
  1179  	}
  1180  	s.cc.HandleLossDetected()
  1181  	s.enterRecovery()
  1182  	return true
  1183  }
  1184  
  1185  // isDupAck determines if seg is a duplicate ack as defined in
  1186  // https://tools.ietf.org/html/rfc5681#section-2.
  1187  func (s *sender) isDupAck(seg *segment) bool {
  1188  	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
  1189  	// can leverage the SACK information to determine when an incoming ACK is a
  1190  	// "duplicate" (e.g., if the ACK contains previously unknown SACK
  1191  	// information).
  1192  	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
  1193  		return false
  1194  	}
  1195  
  1196  	// (a) The receiver of the ACK has outstanding data.
  1197  	return s.SndUna != s.SndNxt &&
  1198  		// (b) The incoming acknowledgment carries no data.
  1199  		seg.logicalLen() == 0 &&
  1200  		// (c) The SYN and FIN bits are both off.
  1201  		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
  1202  		// (d) the ACK number is equal to the greatest acknowledgment received on
  1203  		// the given connection (TCP.UNA from RFC793).
  1204  		seg.ackNumber == s.SndUna &&
  1205  		// (e) the advertised window in the incoming acknowledgment equals the
  1206  		// advertised window in the last incoming acknowledgment.
  1207  		s.SndWnd == seg.window
  1208  }
  1209  
  1210  // Iterate the writeList and update RACK for each segment which is newly acked
  1211  // either cumulatively or selectively. Loop through the segments which are
  1212  // sacked, and update the RACK related variables and check for reordering.
  1213  // Returns true when the DSACK block has been detected in the received ACK.
  1214  //
  1215  // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1216  // steps 2 and 3.
  1217  func (s *sender) walkSACK(rcvdSeg *segment) bool {
  1218  	s.rc.setDSACKSeen(false)
  1219  
  1220  	// Look for DSACK block.
  1221  	hasDSACK := false
  1222  	idx := 0
  1223  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1224  	if checkDSACK(rcvdSeg) {
  1225  		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
  1226  		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
  1227  		// numDSACK can be zero when DSACK is sent for subsegments.
  1228  		if numDSACK < 1 {
  1229  			numDSACK = 1
  1230  		}
  1231  		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
  1232  		s.rc.setDSACKSeen(true)
  1233  		idx = 1
  1234  		n--
  1235  		hasDSACK = true
  1236  	}
  1237  
  1238  	if n == 0 {
  1239  		return hasDSACK
  1240  	}
  1241  
  1242  	// Sort the SACK blocks. The first block is the most recent unacked
  1243  	// block. The following blocks can be in arbitrary order.
  1244  	sackBlocks := make([]header.SACKBlock, n)
  1245  	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
  1246  	sort.Slice(sackBlocks, func(i, j int) bool {
  1247  		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
  1248  	})
  1249  
  1250  	seg := s.writeList.Front()
  1251  	for _, sb := range sackBlocks {
  1252  		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
  1253  			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
  1254  				s.rc.update(seg, rcvdSeg)
  1255  				s.rc.detectReorder(seg)
  1256  				seg.acked = true
  1257  				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
  1258  			}
  1259  			seg = seg.Next()
  1260  		}
  1261  	}
  1262  	return hasDSACK
  1263  }
  1264  
  1265  // checkDSACK checks if a DSACK is reported.
  1266  func checkDSACK(rcvdSeg *segment) bool {
  1267  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1268  	if n == 0 {
  1269  		return false
  1270  	}
  1271  
  1272  	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
  1273  	// Check if SACK block is invalid.
  1274  	if sb.End.LessThan(sb.Start) {
  1275  		return false
  1276  	}
  1277  
  1278  	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
  1279  	// at most one SACK block. DSACK is detected in the below two cases:
  1280  	//	* If the SACK sequence space is less than this cumulative ACK, it is
  1281  	//		an indication that the segment identified by the SACK block has
  1282  	//		been received more than once by the receiver.
  1283  	//	* If the sequence space in the first SACK block is greater than the
  1284  	//		cumulative ACK, then the sender next compares the sequence space
  1285  	//		in the first SACK block with the sequence space in the second SACK
  1286  	//		block, if there is one. This comparison can determine if the first
  1287  	//		SACK block is reporting duplicate data that lies above the
  1288  	//		cumulative ACK.
  1289  	if sb.Start.LessThan(rcvdSeg.ackNumber) {
  1290  		return true
  1291  	}
  1292  
  1293  	if n > 1 {
  1294  		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
  1295  		if sb1.End.LessThan(sb1.Start) {
  1296  			return false
  1297  		}
  1298  
  1299  		// If the first SACK block is fully covered by second SACK
  1300  		// block, then the first block is a DSACK block.
  1301  		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
  1302  			return true
  1303  		}
  1304  	}
  1305  
  1306  	return false
  1307  }
  1308  
  1309  func (s *sender) recordRetransmitTS() {
  1310  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
  1311  	//
  1312  	// The Eifel detection algorithm is used, only upon initiation of loss
  1313  	// recovery, i.e., when either the timeout-based retransmit or the fast
  1314  	// retransmit is sent. The Eifel detection algorithm MUST NOT be
  1315  	// reinitiated after loss recovery has already started. In particular,
  1316  	// it must not be reinitiated upon subsequent timeouts for the same
  1317  	// segment, and not upon retransmitting segments other than the oldest
  1318  	// outstanding segment, e.g., during selective loss recovery.
  1319  	if s.inRecovery() {
  1320  		return
  1321  	}
  1322  
  1323  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1324  	//
  1325  	// Set a "RetransmitTS" variable to the value of the Timestamp Value
  1326  	// field of the Timestamps option included in the retransmit sent when
  1327  	// loss recovery is initiated. A TCP sender must ensure that
  1328  	// RetransmitTS does not get overwritten as loss recovery progresses,
  1329  	// e.g., in case of a second timeout and subsequent second retransmit of
  1330  	// the same octet.
  1331  	s.retransmitTS = s.ep.tsValNow()
  1332  }
  1333  
  1334  func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
  1335  	// Return if the sender has already detected spurious recovery.
  1336  	if s.spuriousRecovery {
  1337  		return
  1338  	}
  1339  
  1340  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
  1341  	//
  1342  	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
  1343  	// Timestamps option is smaller than the value of RetransmitTS, then
  1344  	// proceed to next step, else return.
  1345  	if tsEchoReply >= s.retransmitTS {
  1346  		return
  1347  	}
  1348  
  1349  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1350  	//
  1351  	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
  1352  	if hasDSACK {
  1353  		return
  1354  	}
  1355  
  1356  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1357  	//
  1358  	// If during the lifetime of the TCP connection the TCP sender has
  1359  	// previously received an ACK with a DSACK option, or the acceptable ACK
  1360  	// does not acknowledge all outstanding data, then proceed to next step,
  1361  	// else return.
  1362  	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
  1363  	if numDSACK == 0 && s.SndUna == s.SndNxt {
  1364  		return
  1365  	}
  1366  
  1367  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
  1368  	//
  1369  	// If the loss recovery has been initiated with a timeout-based
  1370  	// retransmit, then set
  1371  	//    SpuriousRecovery <- SPUR_TO (equal 1),
  1372  	// else set
  1373  	//    SpuriousRecovery <- dupacks+1
  1374  	// Set the spurious recovery variable to true as we do not differentiate
  1375  	// between fast, SACK or RTO recovery.
  1376  	s.spuriousRecovery = true
  1377  	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()
  1378  
  1379  	// RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and
  1380  	// timeout). Increment the metric for RTO only as we want to track the
  1381  	// number of timeout recoveries.
  1382  	if s.state == tcpip.RTORecovery {
  1383  		s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment()
  1384  	}
  1385  }
  1386  
  1387  // Check if the sender is in RTORecovery, FastRecovery or SACKRecovery state.
  1388  func (s *sender) inRecovery() bool {
  1389  	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
  1390  		return true
  1391  	}
  1392  	return false
  1393  }
  1394  
  1395  // handleRcvdSegment is called when a segment is received; it is responsible for
  1396  // updating the send-related state.
  1397  // +checklocks:s.ep.mu
  1398  // +checklocksalias:s.rc.snd.ep.mu=s.ep.mu
  1399  func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
  1400  	// Check if we can extract an RTT measurement from this ack.
  1401  	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
  1402  		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
  1403  		s.RTTMeasureSeqNum = s.SndNxt
  1404  	}
  1405  
  1406  	// Update Timestamp if required. See RFC7323, section-4.3.
  1407  	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
  1408  		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
  1409  	}
  1410  
  1411  	// Insert SACKBlock information into our scoreboard.
  1412  	hasDSACK := false
  1413  	if s.ep.SACKPermitted {
  1414  		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
  1415  			// Only insert the SACK block if the following holds
  1416  			// true:
  1417  			//  * SACK block acks data after the ack number in the
  1418  			//    current segment.
  1419  			//  * SACK block represents a sequence
  1420  			//    between sndUna and sndNxt (i.e. data that is
  1421  			//    currently unacked and in-flight).
  1422  			//  * SACK block that has not been SACKed already.
  1423  			//
  1424  			// NOTE: This check specifically excludes DSACK blocks
  1425  			// which have start/end before sndUna and are used to
  1426  			// indicate spurious retransmissions.
  1427  			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
  1428  				s.ep.scoreboard.Insert(sb)
  1429  				rcvdSeg.hasNewSACKInfo = true
  1430  			}
  1431  		}
  1432  
  1433  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
  1434  		// section-7.2
  1435  		//	* Step 2: Update RACK stats.
  1436  		//		If the ACK is not ignored as invalid, update the RACK.rtt
  1437  		//		to be the RTT sample calculated using this ACK, and
  1438  		//		continue.  If this ACK or SACK was for the most recently
  1439  		//		sent packet, then record the RACK.xmit_ts timestamp and
  1440  		//		RACK.end_seq sequence implied by this ACK.
  1441  		//	* Step 3: Detect packet reordering.
  1442  		//		If the ACK selectively or cumulatively acknowledges an
  1443  		//		unacknowledged and also never retransmitted sequence below
  1444  		//		RACK.fack, then the corresponding packet has been
  1445  		//		reordered and RACK.reord is set to TRUE.
  1446  		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1447  			hasDSACK = s.walkSACK(rcvdSeg)
  1448  		}
  1449  		s.SetPipe()
  1450  	}
  1451  
  1452  	ack := rcvdSeg.ackNumber
  1453  	fastRetransmit := false
  1454  	// Do not leave fast recovery, if the ACK is out of range.
  1455  	if s.FastRecovery.Active {
  1456  		// Leave fast recovery if it acknowledges all the data covered by
  1457  		// this fast recovery session.
  1458  		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
  1459  			s.leaveRecovery()
  1460  		}
  1461  	} else {
  1462  		// Detect loss by counting the duplicates and enter recovery.
  1463  		fastRetransmit = s.detectLoss(rcvdSeg)
  1464  	}
  1465  
  1466  	// See if TLP based recovery was successful.
  1467  	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1468  		s.detectTLPRecovery(ack, rcvdSeg)
  1469  	}
  1470  
  1471  	// Stash away the current window size.
  1472  	s.SndWnd = rcvdSeg.window
  1473  
  1474  	// Disable zero window probing if remote advertises a non-zero receive
  1475  	// window. This can be with an ACK to the zero window probe (where the
  1476  	// acknumber refers to the already acknowledged byte) OR to any previously
  1477  	// unacknowledged segment.
  1478  	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
  1479  		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
  1480  		s.disableZeroWindowProbing()
  1481  	}
  1482  
  1483  	// On receiving the ACK for the zero window probe, account for it and
  1484  	// skip trying to send any segment as we are still probing for
  1485  	// receive window to become non-zero.
  1486  	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
  1487  		s.unackZeroWindowProbes--
  1488  		return
  1489  	}
  1490  
  1491  	// Ignore ack if it doesn't acknowledge any new data.
  1492  	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
  1493  		s.DupAckCount = 0
  1494  
  1495  		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
  1496  		// Specifically we should only update the RTO using TSEcr if the
  1497  		// following condition holds:
  1498  		//
  1499  		//    A TSecr value received in a segment is used to update the
  1500  		//    averaged RTT measurement only if the segment acknowledges
  1501  		//    some new data, i.e., only if it advances the left edge of
  1502  		//    the send window.
  1503  		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
  1504  			s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr))
  1505  		}
  1506  
  1507  		if s.shouldSchedulePTO() {
  1508  			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
  1509  			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
  1510  			s.schedulePTO()
  1511  		} else {
  1512  			// When an ack is received we must rearm the timer.
  1513  			// RFC 6298 5.3
  1514  			s.probeTimer.disable()
  1515  			s.resendTimer.enable(s.RTO)
  1516  		}
  1517  
  1518  		// Remove all acknowledged data from the write list.
  1519  		acked := s.SndUna.Size(ack)
  1520  		s.SndUna = ack
  1521  		ackLeft := acked
  1522  		originalOutstanding := s.Outstanding
  1523  		for ackLeft > 0 {
  1524  			// We use logicalLen here because we can have FIN
  1525  			// segments (which are always at the end of list) that
  1526  			// have no data, but do consume a sequence number.
  1527  			seg := s.writeList.Front()
  1528  			datalen := seg.logicalLen()
  1529  
  1530  			if datalen > ackLeft {
  1531  				prevCount := s.pCount(seg, s.MaxPayloadSize)
  1532  				seg.TrimFront(ackLeft)
  1533  				seg.sequenceNumber.UpdateForward(ackLeft)
  1534  				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
  1535  				break
  1536  			}
  1537  
  1538  			if s.writeNext == seg {
  1539  				s.updateWriteNext(seg.Next())
  1540  			}
  1541  
  1542  			// Update the RACK fields if SACK is enabled.
  1543  			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1544  				s.rc.update(seg, rcvdSeg)
  1545  				s.rc.detectReorder(seg)
  1546  			}
  1547  
  1548  			s.writeList.Remove(seg)
  1549  
  1550  			// If SACK is enabled then only reduce outstanding if
  1551  			// the segment was not previously SACKED as these have
  1552  			// already been accounted for in SetPipe().
  1553  			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
  1554  				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
  1555  			} else {
  1556  				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
  1557  			}
  1558  			seg.DecRef()
  1559  			ackLeft -= datalen
  1560  		}
  1561  
  1562  		// Clear SACK information for all acked data.
  1563  		s.ep.scoreboard.Delete(s.SndUna)
  1564  
  1565  		// Detect if the sender entered recovery spuriously.
  1566  		if s.inRecovery() {
  1567  			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
  1568  		}
  1569  
  1570  		// If we are not in fast recovery then update the congestion
  1571  		// window based on the number of acknowledged packets.
  1572  		if !s.FastRecovery.Active {
  1573  			s.cc.Update(originalOutstanding - s.Outstanding)
  1574  			if s.FastRecovery.Last.LessThan(s.SndUna) {
  1575  				s.state = tcpip.Open
  1576  				// Update RACK when we are exiting fast or RTO
  1577  				// recovery as described in the RFC
  1578  				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
  1579  				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1580  					s.rc.exitRecovery()
  1581  				}
  1582  				s.reorderTimer.disable()
  1583  			}
  1584  		}
  1585  
  1586  		// Update the send buffer usage and notify potential waiters.
  1587  		s.ep.updateSndBufferUsage(int(acked))
  1588  
  1589  		// It is possible for s.outstanding to drop below zero if we get
  1590  		// a retransmit timeout, reset outstanding to zero but later
  1591  		// get an ack that cover previously sent data.
  1592  		if s.Outstanding < 0 {
  1593  			s.Outstanding = 0
  1594  		}
  1595  
  1596  		s.SetPipe()
  1597  
  1598  		// If all outstanding data was acknowledged the disable the timer.
  1599  		// RFC 6298 Rule 5.3
  1600  		if s.SndUna == s.SndNxt {
  1601  			s.Outstanding = 0
  1602  			// Reset firstRetransmittedSegXmitTime to the zero value.
  1603  			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
  1604  			s.resendTimer.disable()
  1605  			s.probeTimer.disable()
  1606  		}
  1607  	}
  1608  
  1609  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1610  		// Update RACK reorder window.
  1611  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1612  		//	* Upon receiving an ACK:
  1613  		//	* Step 4: Update RACK reordering window
  1614  		s.rc.updateRACKReorderWindow()
  1615  
  1616  		// After the reorder window is calculated, detect any loss by checking
  1617  		// if the time elapsed after the segments are sent is greater than the
  1618  		// reorder window.
  1619  		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
  1620  			// If any segment is marked as lost by
  1621  			// RACK, enter recovery and retransmit
  1622  			// the lost segments.
  1623  			s.cc.HandleLossDetected()
  1624  			s.enterRecovery()
  1625  			fastRetransmit = true
  1626  		}
  1627  
  1628  		if s.FastRecovery.Active {
  1629  			s.rc.DoRecovery(nil, fastRetransmit)
  1630  		}
  1631  	}
  1632  
  1633  	// Now that we've popped all acknowledged data from the retransmit
  1634  	// queue, retransmit if needed.
  1635  	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
  1636  		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
  1637  		// When SACK is enabled data sending is governed by steps in
  1638  		// RFC 6675 Section 5 recovery steps  A-C.
  1639  		// See: https://tools.ietf.org/html/rfc6675#section-5.
  1640  		if s.ep.SACKPermitted {
  1641  			return
  1642  		}
  1643  	}
  1644  
  1645  	// Send more data now that some of the pending data has been ack'd, or
  1646  	// that the window opened up, or the congestion window was inflated due
  1647  	// to a duplicate ack during fast recovery. This will also re-enable
  1648  	// the retransmit timer if needed.
  1649  	s.sendData()
  1650  }
  1651  
  1652  // sendSegment sends the specified segment.
  1653  // +checklocks:s.ep.mu
  1654  func (s *sender) sendSegment(seg *segment) tcpip.Error {
  1655  	if seg.xmitCount > 0 {
  1656  		s.ep.stack.Stats().TCP.Retransmits.Increment()
  1657  		s.ep.stats.SendErrors.Retransmits.Increment()
  1658  		if s.SndCwnd < s.Ssthresh {
  1659  			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
  1660  		}
  1661  	}
  1662  	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
  1663  	seg.xmitCount++
  1664  	seg.lost = false
  1665  
  1666  	err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber)
  1667  
  1668  	// Every time a packet containing data is sent (including a
  1669  	// retransmission), if SACK is enabled and we are retransmitting data
  1670  	// then use the conservative timer described in RFC6675 Section 6.0,
  1671  	// otherwise follow the standard time described in RFC6298 Section 5.1.
  1672  	if err != nil && seg.payloadSize() != 0 {
  1673  		if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted {
  1674  			s.resendTimer.enable(s.RTO)
  1675  		} else {
  1676  			if !s.resendTimer.enabled() {
  1677  				s.resendTimer.enable(s.RTO)
  1678  			}
  1679  		}
  1680  	}
  1681  
  1682  	return err
  1683  }
  1684  
  1685  // sendSegmentFromPacketBuffer sends a new segment containing the given payload,
  1686  // flags and sequence number.
  1687  // +checklocks:s.ep.mu
  1688  // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
  1689  func (s *sender) sendSegmentFromPacketBuffer(pkt *stack.PacketBuffer, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1690  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1691  	if seq == s.RTTMeasureSeqNum {
  1692  		s.RTTMeasureTime = s.LastSendTime
  1693  	}
  1694  
  1695  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1696  
  1697  	// Remember the max sent ack.
  1698  	s.MaxSentAck = rcvNxt
  1699  
  1700  	// We need to clone the packet because sendRaw takes ownership of pkt,
  1701  	// and pkt could be reprocessed later on (i.e retrasmission).
  1702  	pkt = pkt.Clone()
  1703  	defer pkt.DecRef()
  1704  
  1705  	return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd)
  1706  }
  1707  
  1708  // sendEmptySegment sends a new empty segment, flags and sequence number.
  1709  // +checklocks:s.ep.mu
  1710  // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
  1711  func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1712  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1713  	if seq == s.RTTMeasureSeqNum {
  1714  		s.RTTMeasureTime = s.LastSendTime
  1715  	}
  1716  
  1717  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1718  
  1719  	// Remember the max sent ack.
  1720  	s.MaxSentAck = rcvNxt
  1721  
  1722  	return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd)
  1723  }
  1724  
  1725  // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
  1726  // currently.
  1727  // +checklocks:s.ep.mu
  1728  func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
  1729  	// Data packets are unlikely to be part of an ACK loop. So always send
  1730  	// an ACK for a packet w/ data.
  1731  	if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
  1732  		s.sendAck()
  1733  	}
  1734  }
  1735  
  1736  func (s *sender) updateWriteNext(seg *segment) {
  1737  	if s.writeNext != nil {
  1738  		s.writeNext.DecRef()
  1739  	}
  1740  	if seg != nil {
  1741  		seg.IncRef()
  1742  	}
  1743  	s.writeNext = seg
  1744  }
  1745  
  1746  // corkTimerExpired drains all the segments when TCP_CORK is enabled.
  1747  // +checklocks:s.ep.mu
  1748  func (s *sender) corkTimerExpired() tcpip.Error {
  1749  	// Check if the timer actually expired or if it's a spurious wake due
  1750  	// to a previously orphaned runtime timer.
  1751  	if s.corkTimer.isUninitialized() || !s.corkTimer.checkExpiration() {
  1752  		return nil
  1753  	}
  1754  
  1755  	// Assign sequence number and flags to the segment.
  1756  	seg := s.writeNext
  1757  	if seg == nil {
  1758  		return nil
  1759  	}
  1760  	seg.sequenceNumber = s.SndNxt
  1761  	seg.flags = header.TCPFlagAck | header.TCPFlagPsh
  1762  	// Drain all the segments.
  1763  	s.sendData()
  1764  	return nil
  1765  }