github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/transport/tcp/snd.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"sort"
    21  	"time"
    22  
    23  	"github.com/sagernet/gvisor/pkg/buffer"
    24  	"github.com/sagernet/gvisor/pkg/sync"
    25  	"github.com/sagernet/gvisor/pkg/tcpip"
    26  	"github.com/sagernet/gvisor/pkg/tcpip/header"
    27  	"github.com/sagernet/gvisor/pkg/tcpip/seqnum"
    28  	"github.com/sagernet/gvisor/pkg/tcpip/stack"
    29  )
    30  
    31  const (
    32  	// MinRTO is the minimum allowed value for the retransmit timeout.
    33  	MinRTO = 200 * time.Millisecond
    34  
    35  	// MaxRTO is the maximum allowed value for the retransmit timeout.
    36  	MaxRTO = 120 * time.Second
    37  
    38  	// MinSRTT is the minimum allowed value for smoothed RTT.
    39  	MinSRTT = 1 * time.Millisecond
    40  
    41  	// InitialCwnd is the initial congestion window.
    42  	InitialCwnd = 10
    43  
    44  	// nDupAckThreshold is the number of duplicate ACK's required
    45  	// before fast-retransmit is entered.
    46  	nDupAckThreshold = 3
    47  
    48  	// MaxRetries is the maximum number of probe retries sender does
    49  	// before timing out the connection.
    50  	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
    51  	MaxRetries = 15
    52  )
    53  
    54  // congestionControl is an interface that must be implemented by any supported
    55  // congestion control algorithm.
    56  type congestionControl interface {
    57  	// HandleLossDetected is invoked when the loss is detected by RACK or
    58  	// sender.dupAckCount >= nDupAckThreshold just before entering fast
    59  	// retransmit.
    60  	HandleLossDetected()
    61  
    62  	// HandleRTOExpired is invoked when the retransmit timer expires.
    63  	HandleRTOExpired()
    64  
    65  	// Update is invoked when processing inbound acks. It's passed the
    66  	// number of packet's that were acked by the most recent cumulative
    67  	// acknowledgement.
    68  	Update(packetsAcked int)
    69  
    70  	// PostRecovery is invoked when the sender is exiting a fast retransmit/
    71  	// recovery phase. This provides congestion control algorithms a way
    72  	// to adjust their state when exiting recovery.
    73  	PostRecovery()
    74  }
    75  
    76  // lossRecovery is an interface that must be implemented by any supported
    77  // loss recovery algorithm.
    78  type lossRecovery interface {
    79  	// DoRecovery is invoked when loss is detected and segments need
    80  	// to be retransmitted. The cumulative or selective ACK is passed along
    81  	// with the flag which identifies whether the connection entered fast
    82  	// retransmit with this ACK and to retransmit the first unacknowledged
    83  	// segment.
    84  	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
    85  }
    86  
    87  // sender holds the state necessary to send TCP segments.
    88  //
    89  // +stateify savable
    90  type sender struct {
    91  	stack.TCPSenderState
    92  	ep *Endpoint
    93  
    94  	// lr is the loss recovery algorithm used by the sender.
    95  	lr lossRecovery
    96  
    97  	// firstRetransmittedSegXmitTime is the original transmit time of
    98  	// the first segment that was retransmitted due to RTO expiration.
    99  	firstRetransmittedSegXmitTime tcpip.MonotonicTime
   100  
   101  	// zeroWindowProbing is set if the sender is currently probing
   102  	// for zero receive window.
   103  	zeroWindowProbing bool `state:"nosave"`
   104  
   105  	// unackZeroWindowProbes is the number of unacknowledged zero
   106  	// window probes.
   107  	unackZeroWindowProbes uint32 `state:"nosave"`
   108  
   109  	writeNext   *segment
   110  	writeList   segmentList
   111  	resendTimer timer `state:"nosave"`
   112  
   113  	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
   114  	// round-trip time", and "round-trip time variation", as defined in
   115  	// section 2 of RFC 6298.
   116  	rtt rtt
   117  
   118  	// minRTO is the minimum permitted value for sender.rto.
   119  	minRTO time.Duration
   120  
   121  	// maxRTO is the maximum permitted value for sender.rto.
   122  	maxRTO time.Duration
   123  
   124  	// maxRetries is the maximum permitted retransmissions.
   125  	maxRetries uint32
   126  
   127  	// gso is set if generic segmentation offload is enabled.
   128  	gso bool
   129  
   130  	// state is the current state of congestion control for this endpoint.
   131  	state tcpip.CongestionControlState
   132  
   133  	// cc is the congestion control algorithm in use for this sender.
   134  	cc congestionControl
   135  
   136  	// rc has the fields needed for implementing RACK loss detection
   137  	// algorithm.
   138  	rc rackControl
   139  
   140  	// reorderTimer is the timer used to retransmit the segments after RACK
   141  	// detects them as lost.
   142  	reorderTimer timer `state:"nosave"`
   143  
   144  	// probeTimer is used to schedule PTO for RACK TLP algorithm.
   145  	probeTimer timer `state:"nosave"`
   146  
   147  	// spuriousRecovery indicates whether the sender entered recovery
   148  	// spuriously as described in RFC3522 Section 3.2.
   149  	spuriousRecovery bool
   150  
   151  	// retransmitTS is the timestamp at which the sender sends retransmitted
   152  	// segment after entering an RTO for the first time as described in
   153  	// RFC3522 Section 3.2.
   154  	retransmitTS uint32
   155  
   156  	// startCork start corking the segments.
   157  	startCork bool
   158  
   159  	// corkTimer is used to drain the segments which are held when TCP_CORK
   160  	// option is enabled.
   161  	corkTimer timer `state:"nosave"`
   162  }
   163  
   164  // rtt is a synchronization wrapper used to appease stateify. See the comment
   165  // in sender, where it is used.
   166  //
   167  // +stateify savable
   168  type rtt struct {
   169  	sync.Mutex `state:"nosave"`
   170  
   171  	stack.TCPRTTState
   172  }
   173  
   174  // +checklocks:ep.mu
   175  func newSender(ep *Endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
   176  	// The sender MUST reduce the TCP data length to account for any IP or
   177  	// TCP options that it is including in the packets that it sends.
   178  	// See: https://tools.ietf.org/html/rfc6691#section-2
   179  	maxPayloadSize := int(mss) - ep.maxOptionSize()
   180  
   181  	s := &sender{
   182  		ep: ep,
   183  		TCPSenderState: stack.TCPSenderState{
   184  			SndWnd:           sndWnd,
   185  			SndUna:           iss + 1,
   186  			SndNxt:           iss + 1,
   187  			RTTMeasureSeqNum: iss + 1,
   188  			LastSendTime:     ep.stack.Clock().NowMonotonic(),
   189  			MaxPayloadSize:   maxPayloadSize,
   190  			MaxSentAck:       irs + 1,
   191  			FastRecovery: stack.TCPFastRecoveryState{
   192  				// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
   193  				Last:      iss,
   194  				HighRxt:   iss,
   195  				RescueRxt: iss,
   196  			},
   197  			RTO: 1 * time.Second,
   198  		},
   199  		gso: ep.gso.Type != stack.GSONone,
   200  	}
   201  
   202  	if s.gso {
   203  		s.ep.gso.MSS = uint16(maxPayloadSize)
   204  	}
   205  
   206  	s.cc = s.initCongestionControl(ep.cc)
   207  	s.lr = s.initLossRecovery()
   208  	s.rc.init(s, iss)
   209  
   210  	// A negative sndWndScale means that no scaling is in use, otherwise we
   211  	// store the scaling value.
   212  	if sndWndScale > 0 {
   213  		s.SndWndScale = uint8(sndWndScale)
   214  	}
   215  
   216  	s.resendTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.retransmitTimerExpired))
   217  	s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired))
   218  	s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired))
   219  	s.corkTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.corkTimerExpired))
   220  
   221  	s.ep.AssertLockHeld(ep)
   222  	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
   223  	// Initialize SACK Scoreboard after updating max payload size as we use
   224  	// the maxPayloadSize as the smss when determining if a segment is lost
   225  	// etc.
   226  	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)
   227  
   228  	// Get Stack wide config.
   229  	var minRTO tcpip.TCPMinRTOOption
   230  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
   231  		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
   232  	}
   233  	s.minRTO = time.Duration(minRTO)
   234  
   235  	var maxRTO tcpip.TCPMaxRTOOption
   236  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
   237  		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
   238  	}
   239  	s.maxRTO = time.Duration(maxRTO)
   240  
   241  	var maxRetries tcpip.TCPMaxRetriesOption
   242  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
   243  		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
   244  	}
   245  	s.maxRetries = uint32(maxRetries)
   246  
   247  	return s
   248  }
   249  
   250  // initCongestionControl initializes the specified congestion control module and
   251  // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to
   252  // their initial values.
   253  func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
   254  	s.SndCwnd = InitialCwnd
   255  	// Set sndSsthresh to the maximum int value, which depends on the
   256  	// platform.
   257  	s.Ssthresh = int(^uint(0) >> 1)
   258  
   259  	switch congestionControlName {
   260  	case ccCubic:
   261  		return newCubicCC(s)
   262  	case ccReno:
   263  		fallthrough
   264  	default:
   265  		return newRenoCC(s)
   266  	}
   267  }
   268  
   269  // initLossRecovery initiates the loss recovery algorithm for the sender.
   270  func (s *sender) initLossRecovery() lossRecovery {
   271  	if s.ep.SACKPermitted {
   272  		return newSACKRecovery(s)
   273  	}
   274  	return newRenoRecovery(s)
   275  }
   276  
   277  // updateMaxPayloadSize updates the maximum payload size based on the given
   278  // MTU. If this is in response to "packet too big" control packets (indicated
   279  // by the count argument), it also reduces the number of outstanding packets and
   280  // attempts to retransmit the first packet above the MTU size.
   281  // +checklocks:s.ep.mu
   282  func (s *sender) updateMaxPayloadSize(mtu, count int) {
   283  	m := mtu - header.TCPMinimumSize
   284  
   285  	m -= s.ep.maxOptionSize()
   286  
   287  	// We don't adjust up for now.
   288  	if m >= s.MaxPayloadSize {
   289  		return
   290  	}
   291  
   292  	// Make sure we can transmit at least one byte.
   293  	if m <= 0 {
   294  		m = 1
   295  	}
   296  
   297  	oldMSS := s.MaxPayloadSize
   298  	s.MaxPayloadSize = m
   299  	if s.gso {
   300  		s.ep.gso.MSS = uint16(m)
   301  	}
   302  
   303  	if count == 0 {
   304  		// updateMaxPayloadSize is also called when the sender is created.
   305  		// and there is no data to send in such cases. Return immediately.
   306  		return
   307  	}
   308  
   309  	// Update the scoreboard's smss to reflect the new lowered
   310  	// maxPayloadSize.
   311  	s.ep.scoreboard.smss = uint16(m)
   312  
   313  	s.Outstanding -= count
   314  	if s.Outstanding < 0 {
   315  		s.Outstanding = 0
   316  	}
   317  
   318  	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
   319  	// if it is already before such a packet.
   320  	nextSeg := s.writeNext
   321  	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
   322  		if seg == s.writeNext {
   323  			// We got to writeNext before we could find a segment
   324  			// exceeding the MTU.
   325  			break
   326  		}
   327  
   328  		if nextSeg == s.writeNext && seg.payloadSize() > m {
   329  			// We found a segment exceeding the MTU. Rewind
   330  			// writeNext and try to retransmit it.
   331  			nextSeg = seg
   332  		}
   333  
   334  		if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   335  			// Update sackedOut for new maximum payload size.
   336  			s.SackedOut -= s.pCount(seg, oldMSS)
   337  			s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
   338  		}
   339  	}
   340  
   341  	// Since we likely reduced the number of outstanding packets, we may be
   342  	// ready to send some more.
   343  	s.updateWriteNext(nextSeg)
   344  	s.sendData()
   345  }
   346  
   347  // sendAck sends an ACK segment.
   348  // +checklocks:s.ep.mu
   349  func (s *sender) sendAck() {
   350  	s.sendEmptySegment(header.TCPFlagAck, s.SndNxt)
   351  }
   352  
   353  // updateRTO updates the retransmit timeout when a new roud-trip time is
   354  // available. This is done in accordance with section 2 of RFC 6298.
   355  func (s *sender) updateRTO(rtt time.Duration) {
   356  	s.rtt.Lock()
   357  	if !s.rtt.TCPRTTState.SRTTInited {
   358  		s.rtt.TCPRTTState.RTTVar = rtt / 2
   359  		s.rtt.TCPRTTState.SRTT = rtt
   360  		s.rtt.TCPRTTState.SRTTInited = true
   361  	} else {
   362  		diff := s.rtt.TCPRTTState.SRTT - rtt
   363  		if diff < 0 {
   364  			diff = -diff
   365  		}
   366  		// Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when
   367  		// no timestamps are available.
   368  		if !s.ep.SendTSOk {
   369  			s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4
   370  			s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8
   371  		} else {
   372  			// When we are taking RTT measurements of every ACK then
   373  			// we need to use a modified method as specified in
   374  			// https://tools.ietf.org/html/rfc7323#appendix-G
   375  			if s.Outstanding == 0 {
   376  				s.rtt.Unlock()
   377  				return
   378  			}
   379  			// Netstack measures congestion window/inflight all in
   380  			// terms of packets and not bytes. This is similar to
   381  			// how linux also does cwnd and inflight. In practice
   382  			// this approximation works as expected.
   383  			expectedSamples := math.Ceil(float64(s.Outstanding) / 2)
   384  
   385  			// alpha & beta values are the original values as recommended in
   386  			// https://tools.ietf.org/html/rfc6298#section-2.3.
   387  			const alpha = 0.125
   388  			const beta = 0.25
   389  
   390  			alphaPrime := alpha / expectedSamples
   391  			betaPrime := beta / expectedSamples
   392  			rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds()
   393  			srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds()
   394  			s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second))
   395  			s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second))
   396  		}
   397  	}
   398  
   399  	if s.rtt.TCPRTTState.SRTT < MinSRTT {
   400  		s.rtt.TCPRTTState.SRTT = MinSRTT
   401  	}
   402  
   403  	s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar
   404  	s.rtt.Unlock()
   405  	if s.RTO < s.minRTO {
   406  		s.RTO = s.minRTO
   407  	}
   408  	if s.RTO > s.maxRTO {
   409  		s.RTO = s.maxRTO
   410  	}
   411  }
   412  
   413  // resendSegment resends the first unacknowledged segment.
   414  // +checklocks:s.ep.mu
   415  func (s *sender) resendSegment() {
   416  	// Don't use any segments we already sent to measure RTT as they may
   417  	// have been affected by packets being lost.
   418  	s.RTTMeasureSeqNum = s.SndNxt
   419  
   420  	// Resend the segment.
   421  	if seg := s.writeList.Front(); seg != nil {
   422  		if seg.payloadSize() > s.MaxPayloadSize {
   423  			s.splitSeg(seg, s.MaxPayloadSize)
   424  		}
   425  
   426  		// See: RFC 6675 section 5 Step 4.3
   427  		//
   428  		// To prevent retransmission, set both the HighRXT and RescueRXT
   429  		// to the highest sequence number in the retransmitted segment.
   430  		s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1
   431  		s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1
   432  		s.sendSegment(seg)
   433  		s.ep.stack.Stats().TCP.FastRetransmit.Increment()
   434  		s.ep.stats.SendErrors.FastRetransmit.Increment()
   435  
   436  		// Run SetPipe() as per RFC 6675 section 5 Step 4.4
   437  		s.SetPipe()
   438  	}
   439  }
   440  
   441  // retransmitTimerExpired is called when the retransmit timer expires, and
   442  // unacknowledged segments are assumed lost, and thus need to be resent.
   443  // Returns true if the connection is still usable, or false if the connection
   444  // is deemed lost.
   445  // +checklocks:s.ep.mu
   446  func (s *sender) retransmitTimerExpired() tcpip.Error {
   447  	// Check if the timer actually expired or if it's a spurious wake due
   448  	// to a previously orphaned runtime timer.
   449  	if s.resendTimer.isUninitialized() || !s.resendTimer.checkExpiration() {
   450  		return nil
   451  	}
   452  
   453  	// Initialize the variables used to detect spurious recovery after
   454  	// entering RTO.
   455  	//
   456  	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
   457  	s.spuriousRecovery = false
   458  	s.retransmitTS = 0
   459  
   460  	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
   461  	// when writeList is empty. Remove this once we have a proper fix for this
   462  	// issue.
   463  	if s.writeList.Front() == nil {
   464  		return nil
   465  	}
   466  
   467  	s.ep.stack.Stats().TCP.Timeouts.Increment()
   468  	s.ep.stats.SendErrors.Timeouts.Increment()
   469  
   470  	// Set TLPRxtOut to false according to
   471  	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
   472  	s.rc.tlpRxtOut = false
   473  
   474  	// Give up if we've waited more than a minute since the last resend or
   475  	// if a user time out is set and we have exceeded the user specified
   476  	// timeout since the first retransmission.
   477  	uto := s.ep.userTimeout
   478  
   479  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
   480  		// We store the original xmitTime of the segment that we are
   481  		// about to retransmit as the retransmission time. This is
   482  		// required as by the time the retransmitTimer has expired the
   483  		// segment has already been sent and unacked for the RTO at the
   484  		// time the segment was sent.
   485  		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
   486  	}
   487  
   488  	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
   489  	remaining := s.maxRTO
   490  	if uto != 0 {
   491  		// Cap to the user specified timeout if one is specified.
   492  		remaining = uto - elapsed
   493  	}
   494  
   495  	// Always honor the user-timeout irrespective of whether the zero
   496  	// window probes were acknowledged.
   497  	// net/ipv4/tcp_timer.c::tcp_probe_timer()
   498  	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
   499  		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
   500  		return &tcpip.ErrTimeout{}
   501  	}
   502  
   503  	// Set new timeout. The timer will be restarted by the call to sendData
   504  	// below.
   505  	s.RTO *= 2
   506  	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
   507  	if s.RTO > s.maxRTO {
   508  		s.RTO = s.maxRTO
   509  	}
   510  
   511  	// Cap RTO to remaining time.
   512  	if s.RTO > remaining {
   513  		s.RTO = remaining
   514  	}
   515  
   516  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
   517  	//
   518  	// Retransmit timeouts:
   519  	//     After a retransmit timeout, record the highest sequence number
   520  	//     transmitted in the variable recover, and exit the fast recovery
   521  	//     procedure if applicable.
   522  	s.FastRecovery.Last = s.SndNxt - 1
   523  
   524  	if s.FastRecovery.Active {
   525  		// We were attempting fast recovery but were not successful.
   526  		// Leave the state. We don't need to update ssthresh because it
   527  		// has already been updated when entered fast-recovery.
   528  		s.leaveRecovery()
   529  	}
   530  
   531  	// Record retransmitTS if the sender is not in recovery as per:
   532  	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
   533  	s.recordRetransmitTS()
   534  
   535  	s.state = tcpip.RTORecovery
   536  	s.cc.HandleRTOExpired()
   537  
   538  	// Mark the next segment to be sent as the first unacknowledged one and
   539  	// start sending again. Set the number of outstanding packets to 0 so
   540  	// that we'll be able to retransmit.
   541  	//
   542  	// We'll keep on transmitting (or retransmitting) as we get acks for
   543  	// the data we transmit.
   544  	s.Outstanding = 0
   545  
   546  	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
   547  	//
   548  	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
   549  	//  discard data that has already been selectively acknowledged. As a
   550  	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
   551  	//  information gathered from a receiver upon a retransmission timeout
   552  	//  (RTO) "since the timeout might indicate that the data receiver has
   553  	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
   554  	//  information in determining which data to retransmit."
   555  	//
   556  	// NOTE: We take the stricter interpretation and just expunge all
   557  	// information as we lack more rigorous checks to validate if the SACK
   558  	// information is usable after an RTO.
   559  	s.ep.scoreboard.Reset()
   560  	s.updateWriteNext(s.writeList.Front())
   561  
   562  	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
   563  	// zero receive window after retransmission interval and we have data to
   564  	// send.
   565  	if s.zeroWindowProbing {
   566  		s.sendZeroWindowProbe()
   567  		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
   568  		// indefinitely.  As long as the receiving TCP continues to send
   569  		// acknowledgments in response to the probe segments, the sending TCP
   570  		// MUST allow the connection to stay open.
   571  		return nil
   572  	}
   573  
   574  	seg := s.writeNext
   575  	// RFC 1122 4.2.3.5: Close the connection when the number of
   576  	// retransmissions for this segment is beyond a limit.
   577  	if seg != nil && seg.xmitCount > s.maxRetries {
   578  		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
   579  		return &tcpip.ErrTimeout{}
   580  	}
   581  
   582  	s.sendData()
   583  
   584  	return nil
   585  }
   586  
   587  // pCount returns the number of packets in the segment. Due to GSO, a segment
   588  // can be composed of multiple packets.
   589  func (s *sender) pCount(seg *segment, maxPayloadSize int) int {
   590  	size := seg.payloadSize()
   591  	if size == 0 {
   592  		return 1
   593  	}
   594  
   595  	return (size-1)/maxPayloadSize + 1
   596  }
   597  
   598  // splitSeg splits a given segment at the size specified and inserts the
   599  // remainder as a new segment after the current one in the write list.
   600  func (s *sender) splitSeg(seg *segment, size int) {
   601  	if seg.payloadSize() <= size {
   602  		return
   603  	}
   604  	// Split this segment up.
   605  	nSeg := seg.clone()
   606  	nSeg.pkt.Data().TrimFront(size)
   607  	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
   608  	s.writeList.InsertAfter(seg, nSeg)
   609  
   610  	// The segment being split does not carry PUSH flag because it is
   611  	// followed by the newly split segment.
   612  	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
   613  	// segment (i.e., when there is no more queued data to be sent).
   614  	// Linux removes PSH flag only when the segment is being split over MSS
   615  	// and retains it when we are splitting the segment over lack of sender
   616  	// window space.
   617  	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
   618  	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
   619  	if seg.payloadSize() > s.MaxPayloadSize {
   620  		seg.flags ^= header.TCPFlagPsh
   621  	}
   622  	seg.pkt.Data().CapLength(size)
   623  }
   624  
   625  // NextSeg implements the RFC6675 NextSeg() operation.
   626  //
   627  // NextSeg starts scanning the writeList starting from nextSegHint and returns
   628  // the hint to be passed on the next call to NextSeg. This is required to avoid
   629  // iterating the write list repeatedly when NextSeg is invoked in a loop during
   630  // recovery. The returned hint will be nil if there are no more segments that
   631  // can match rules defined by NextSeg operation in RFC6675.
   632  //
   633  // rescueRtx will be true only if nextSeg is a rescue retransmission as
   634  // described by Step 4) of the NextSeg algorithm.
   635  func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
   636  	var s3 *segment
   637  	var s4 *segment
   638  	// Step 1.
   639  	for seg := nextSegHint; seg != nil; seg = seg.Next() {
   640  		// Stop iteration if we hit a segment that has never been
   641  		// transmitted (i.e. either it has no assigned sequence number
   642  		// or if it does have one, it's >= the next sequence number
   643  		// to be sent [i.e. >= s.sndNxt]).
   644  		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
   645  			hint = nil
   646  			break
   647  		}
   648  		segSeq := seg.sequenceNumber
   649  		if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) {
   650  			s.splitSeg(seg, int(smss))
   651  		}
   652  
   653  		// See RFC 6675 Section 4
   654  		//
   655  		//     1. If there exists a smallest unSACKED sequence number
   656  		//     'S2' that meets the following 3 criteria for determinig
   657  		//     loss, the sequence range of one segment of up to SMSS
   658  		//     octets starting with S2 MUST be returned.
   659  		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
   660  			// NextSeg():
   661  			//
   662  			//    (1.a) S2 is greater than HighRxt
   663  			//    (1.b) S2 is less than highest octet covered by
   664  			//    any received SACK.
   665  			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
   666  				// NextSeg():
   667  				//     (1.c) IsLost(S2) returns true.
   668  				if s.ep.scoreboard.IsLost(segSeq) {
   669  					return seg, seg.Next(), false
   670  				}
   671  
   672  				// NextSeg():
   673  				//
   674  				// (3): If the conditions for rules (1) and (2)
   675  				// fail, but there exists an unSACKed sequence
   676  				// number S3 that meets the criteria for
   677  				// detecting loss given in steps 1.a and 1.b
   678  				// above (specifically excluding (1.c)) then one
   679  				// segment of upto SMSS octets starting with S3
   680  				// SHOULD be returned.
   681  				if s3 == nil {
   682  					s3 = seg
   683  					hint = seg.Next()
   684  				}
   685  			}
   686  			// NextSeg():
   687  			//
   688  			//     (4) If the conditions for (1), (2) and (3) fail,
   689  			//     but there exists outstanding unSACKED data, we
   690  			//     provide the opportunity for a single "rescue"
   691  			//     retransmission per entry into loss recovery. If
   692  			//     HighACK is greater than RescueRxt (or RescueRxt
   693  			//     is undefined), then one segment of upto SMSS
   694  			//     octets that MUST include the highest outstanding
   695  			//     unSACKed sequence number SHOULD be returned, and
   696  			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
   697  			//     be updated.
   698  			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
   699  				if s4 != nil {
   700  					if s4.sequenceNumber.LessThan(segSeq) {
   701  						s4 = seg
   702  					}
   703  				} else {
   704  					s4 = seg
   705  				}
   706  			}
   707  		}
   708  	}
   709  
   710  	// If we got here then no segment matched step (1).
   711  	// Step (2): "If no sequence number 'S2' per rule (1)
   712  	// exists but there exists available unsent data and the
   713  	// receiver's advertised window allows, the sequence
   714  	// range of one segment of up to SMSS octets of
   715  	// previously unsent data starting with sequence number
   716  	// HighData+1 MUST be returned."
   717  	for seg := s.writeNext; seg != nil; seg = seg.Next() {
   718  		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
   719  			continue
   720  		}
   721  		// We do not split the segment here to <= smss as it has
   722  		// potentially not been assigned a sequence number yet.
   723  		return seg, nil, false
   724  	}
   725  
   726  	if s3 != nil {
   727  		return s3, hint, false
   728  	}
   729  
   730  	return s4, nil, true
   731  }
   732  
   733  // maybeSendSegment tries to send the specified segment and either coalesces
   734  // other segments into this one or splits the specified segment based on the
   735  // lower of the specified limit value or the receivers window size specified by
   736  // end.
   737  // +checklocks:s.ep.mu
   738  func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
   739  	// We abuse the flags field to determine if we have already
   740  	// assigned a sequence number to this segment.
   741  	if !s.isAssignedSequenceNumber(seg) {
   742  		// Merge segments if allowed.
   743  		if seg.payloadSize() != 0 {
   744  			available := int(s.SndNxt.Size(end))
   745  			if available > limit {
   746  				available = limit
   747  			}
   748  
   749  			// nextTooBig indicates that the next segment was too
   750  			// large to entirely fit in the current segment. It
   751  			// would be possible to split the next segment and merge
   752  			// the portion that fits, but unexpectedly splitting
   753  			// segments can have user visible side-effects which can
   754  			// break applications. For example, RFC 7766 section 8
   755  			// says that the length and data of a DNS response
   756  			// should be sent in the same TCP segment to avoid
   757  			// triggering bugs in poorly written DNS
   758  			// implementations.
   759  			var nextTooBig bool
   760  			for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() {
   761  				if seg.payloadSize()+nSeg.payloadSize() > available {
   762  					nextTooBig = true
   763  					break
   764  				}
   765  				seg.merge(nSeg)
   766  				s.writeList.Remove(nSeg)
   767  				nSeg.DecRef()
   768  			}
   769  			if !nextTooBig && seg.payloadSize() < available {
   770  				// Segment is not full.
   771  				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
   772  					// Nagle's algorithm. From Wikipedia:
   773  					//   Nagle's algorithm works by
   774  					//   combining a number of small
   775  					//   outgoing messages and sending them
   776  					//   all at once. Specifically, as long
   777  					//   as there is a sent packet for which
   778  					//   the sender has received no
   779  					//   acknowledgment, the sender should
   780  					//   keep buffering its output until it
   781  					//   has a full packet's worth of
   782  					//   output, thus allowing output to be
   783  					//   sent all at once.
   784  					return false
   785  				}
   786  				// With TCP_CORK, hold back until minimum of the available
   787  				// send space and MSS.
   788  				if s.ep.ops.GetCorkOption() {
   789  					if seg.payloadSize() < s.MaxPayloadSize {
   790  						if !s.startCork {
   791  							s.startCork = true
   792  							// Enable the timer for
   793  							// 200ms, after which
   794  							// the segments are drained.
   795  							s.corkTimer.enable(MinRTO)
   796  						}
   797  						return false
   798  					}
   799  					// Disable the TCP_CORK timer.
   800  					s.startCork = false
   801  					s.corkTimer.disable()
   802  				}
   803  			}
   804  		}
   805  
   806  		// Assign flags. We don't do it above so that we can merge
   807  		// additional data if Nagle holds the segment.
   808  		seg.sequenceNumber = s.SndNxt
   809  		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
   810  	}
   811  
   812  	var segEnd seqnum.Value
   813  	if seg.payloadSize() == 0 {
   814  		if s.writeList.Back() != seg {
   815  			panic("FIN segments must be the final segment in the write list.")
   816  		}
   817  		seg.flags = header.TCPFlagAck | header.TCPFlagFin
   818  		segEnd = seg.sequenceNumber.Add(1)
   819  		// Update the state to reflect that we have now
   820  		// queued a FIN.
   821  		s.ep.updateConnDirectionState(connDirectionStateSndClosed)
   822  		switch s.ep.EndpointState() {
   823  		case StateCloseWait:
   824  			s.ep.setEndpointState(StateLastAck)
   825  		default:
   826  			s.ep.setEndpointState(StateFinWait1)
   827  		}
   828  	} else {
   829  		// We're sending a non-FIN segment.
   830  		if seg.flags&header.TCPFlagFin != 0 {
   831  			panic("Netstack queues FIN segments without data.")
   832  		}
   833  
   834  		if !seg.sequenceNumber.LessThan(end) {
   835  			return false
   836  		}
   837  
   838  		available := int(seg.sequenceNumber.Size(end))
   839  		if available == 0 {
   840  			return false
   841  		}
   842  
   843  		// If the whole segment or at least 1MSS sized segment cannot
   844  		// be accommodated in the receiver advertised window, skip
   845  		// splitting and sending of the segment. ref:
   846  		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
   847  		//
   848  		// Linux checks this for all segment transmits not triggered by
   849  		// a probe timer. On this condition, it defers the segment split
   850  		// and transmit to a short probe timer.
   851  		//
   852  		// ref: include/net/tcp.h::tcp_check_probe_timer()
   853  		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
   854  		//
   855  		// Instead of defining a new transmit timer, we attempt to split
   856  		// the segment right here if there are no pending segments. If
   857  		// there are pending segments, segment transmits are deferred to
   858  		// the retransmit timer handler.
   859  		if s.SndUna != s.SndNxt {
   860  			switch {
   861  			case available >= seg.payloadSize():
   862  				// OK to send, the whole segments fits in the
   863  				// receiver's advertised window.
   864  			case available >= s.MaxPayloadSize:
   865  				// OK to send, at least 1 MSS sized segment fits
   866  				// in the receiver's advertised window.
   867  			default:
   868  				return false
   869  			}
   870  		}
   871  
   872  		// The segment size limit is computed as a function of sender
   873  		// congestion window and MSS. When sender congestion window is >
   874  		// 1, this limit can be larger than MSS. Ensure that the
   875  		// currently available send space is not greater than minimum of
   876  		// this limit and MSS.
   877  		if available > limit {
   878  			available = limit
   879  		}
   880  
   881  		// If GSO is not in use then cap available to
   882  		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
   883  		// the host GSO logic will cap the segment to the correct size.
   884  		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
   885  			available = s.MaxPayloadSize
   886  		}
   887  
   888  		if seg.payloadSize() > available {
   889  			// A negative value causes splitSeg to panic anyways, so just panic
   890  			// earlier to get more information about the cause.
   891  			s.splitSeg(seg, available)
   892  		}
   893  
   894  		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
   895  	}
   896  
   897  	s.sendSegment(seg)
   898  
   899  	// Update sndNxt if we actually sent new data (as opposed to
   900  	// retransmitting some previously sent data).
   901  	if s.SndNxt.LessThan(segEnd) {
   902  		s.SndNxt = segEnd
   903  	}
   904  
   905  	return true
   906  }
   907  
   908  // zeroProbeJunk is data sent during zero window probes. Its value is
   909  // irrelevant; since the sequence number has already been acknowledged it will
   910  // be discarded. It's only here to avoid allocating.
   911  var zeroProbeJunk = []byte{0}
   912  
   913  // +checklocks:s.ep.mu
   914  func (s *sender) sendZeroWindowProbe() {
   915  	s.unackZeroWindowProbes++
   916  
   917  	// Send a zero window probe with sequence number pointing to the last
   918  	// acknowledged byte. Note that, like Linux, this isn't quite what RFC
   919  	// 9293 3.8.6.1 describes: we don't send the next byte in the stream,
   920  	// we re-send an ACKed byte to goad the receiver into responding.
   921  	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   922  		Payload: buffer.MakeWithData(zeroProbeJunk),
   923  	})
   924  	defer pkt.DecRef()
   925  	s.sendSegmentFromPacketBuffer(pkt, header.TCPFlagAck, s.SndUna-1)
   926  
   927  	// Rearm the timer to continue probing.
   928  	s.resendTimer.enable(s.RTO)
   929  }
   930  
   931  func (s *sender) enableZeroWindowProbing() {
   932  	s.zeroWindowProbing = true
   933  	// We piggyback the probing on the retransmit timer with the
   934  	// current retranmission interval, as we may start probing while
   935  	// segment retransmissions.
   936  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
   937  		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
   938  	}
   939  	s.resendTimer.enable(s.RTO)
   940  }
   941  
   942  func (s *sender) disableZeroWindowProbing() {
   943  	s.zeroWindowProbing = false
   944  	s.unackZeroWindowProbes = 0
   945  	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
   946  	s.resendTimer.disable()
   947  }
   948  
   949  func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
   950  	if dataSent {
   951  		// We sent data, so we should stop the keepalive timer to ensure
   952  		// that no keepalives are sent while there is pending data.
   953  		s.ep.disableKeepaliveTimer()
   954  	}
   955  
   956  	// If the sender has advertised zero receive window and we have
   957  	// data to be sent out, start zero window probing to query the
   958  	// the remote for it's receive window size.
   959  	if s.writeNext != nil && s.SndWnd == 0 {
   960  		s.enableZeroWindowProbing()
   961  	}
   962  
   963  	// If we have no more pending data, start the keepalive timer.
   964  	if s.SndUna == s.SndNxt {
   965  		s.ep.resetKeepaliveTimer(false)
   966  	} else {
   967  		// Enable timers if we have pending data.
   968  		if shouldScheduleProbe && s.shouldSchedulePTO() {
   969  			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
   970  			s.schedulePTO()
   971  		} else if !s.resendTimer.enabled() {
   972  			s.probeTimer.disable()
   973  			if s.Outstanding > 0 {
   974  				// Enable the resend timer if it's not enabled yet and there is
   975  				// outstanding data.
   976  				s.resendTimer.enable(s.RTO)
   977  			}
   978  		}
   979  	}
   980  }
   981  
   982  // sendData sends new data segments. It is called when data becomes available or
   983  // when the send window opens up.
   984  // +checklocks:s.ep.mu
   985  func (s *sender) sendData() {
   986  	limit := s.MaxPayloadSize
   987  	if s.gso {
   988  		limit = int(s.ep.gso.MaxSize - header.TCPTotalHeaderMaximumSize - 1)
   989  	}
   990  	end := s.SndUna.Add(s.SndWnd)
   991  
   992  	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
   993  	// "A TCP SHOULD set cwnd to no more than RW before beginning
   994  	// transmission if the TCP has not sent data in the interval exceeding
   995  	// the retrasmission timeout."
   996  	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
   997  		if s.SndCwnd > InitialCwnd {
   998  			s.SndCwnd = InitialCwnd
   999  		}
  1000  	}
  1001  
  1002  	var dataSent bool
  1003  	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
  1004  		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
  1005  		if cwndLimit > 0 && cwndLimit < limit {
  1006  			limit = cwndLimit
  1007  		}
  1008  		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
  1009  			// Move writeNext along so that we don't try and scan data that
  1010  			// has already been SACKED.
  1011  			s.updateWriteNext(seg.Next())
  1012  			continue
  1013  		}
  1014  		if sent := s.maybeSendSegment(seg, limit, end); !sent {
  1015  			break
  1016  		}
  1017  		dataSent = true
  1018  		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
  1019  		s.updateWriteNext(seg.Next())
  1020  	}
  1021  
  1022  	s.postXmit(dataSent, true /* shouldScheduleProbe */)
  1023  }
  1024  
  1025  func (s *sender) enterRecovery() {
  1026  	// Initialize the variables used to detect spurious recovery after
  1027  	// entering recovery.
  1028  	//
  1029  	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
  1030  	s.spuriousRecovery = false
  1031  	s.retransmitTS = 0
  1032  
  1033  	s.FastRecovery.Active = true
  1034  	// Save state to reflect we're now in fast recovery.
  1035  	//
  1036  	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
  1037  	// We inflate the cwnd by 3 to account for the 3 packets which triggered
  1038  	// the 3 duplicate ACKs and are now not in flight.
  1039  	s.SndCwnd = s.Ssthresh + 3
  1040  	s.SackedOut = 0
  1041  	s.DupAckCount = 0
  1042  	s.FastRecovery.First = s.SndUna
  1043  	s.FastRecovery.Last = s.SndNxt - 1
  1044  	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
  1045  	s.FastRecovery.HighRxt = s.SndUna
  1046  	s.FastRecovery.RescueRxt = s.SndUna
  1047  
  1048  	// Record retransmitTS if the sender is not in recovery as per:
  1049  	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1050  	s.recordRetransmitTS()
  1051  
  1052  	if s.ep.SACKPermitted {
  1053  		s.state = tcpip.SACKRecovery
  1054  		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
  1055  		// Set TLPRxtOut to false according to
  1056  		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
  1057  		if s.rc.tlpRxtOut {
  1058  			// The tail loss probe triggered recovery.
  1059  			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
  1060  		}
  1061  		s.rc.tlpRxtOut = false
  1062  		return
  1063  	}
  1064  	s.state = tcpip.FastRecovery
  1065  	s.ep.stack.Stats().TCP.FastRecovery.Increment()
  1066  }
  1067  
  1068  func (s *sender) leaveRecovery() {
  1069  	s.FastRecovery.Active = false
  1070  	s.FastRecovery.MaxCwnd = 0
  1071  	s.DupAckCount = 0
  1072  
  1073  	// Deflate cwnd. It had been artificially inflated when new dups arrived.
  1074  	s.SndCwnd = s.Ssthresh
  1075  	s.cc.PostRecovery()
  1076  }
  1077  
  1078  // isAssignedSequenceNumber relies on the fact that we only set flags once a
  1079  // sequencenumber is assigned and that is only done right before we send the
  1080  // segment. As a result any segment that has a non-zero flag has a valid
  1081  // sequence number assigned to it.
  1082  func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
  1083  	return seg.flags != 0
  1084  }
  1085  
  1086  // SetPipe implements the SetPipe() function described in RFC6675. Netstack
  1087  // maintains the congestion window in number of packets and not bytes, so
  1088  // SetPipe() here measures number of outstanding packets rather than actual
  1089  // outstanding bytes in the network.
  1090  func (s *sender) SetPipe() {
  1091  	// If SACK isn't permitted or it is permitted but recovery is not active
  1092  	// then ignore pipe calculations.
  1093  	if !s.ep.SACKPermitted || !s.FastRecovery.Active {
  1094  		return
  1095  	}
  1096  	pipe := 0
  1097  	smss := seqnum.Size(s.ep.scoreboard.SMSS())
  1098  	for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
  1099  		// With GSO each segment can be much larger than SMSS. So check the segment
  1100  		// in SMSS sized ranges.
  1101  		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize()))
  1102  		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
  1103  			endSeq := startSeq.Add(smss)
  1104  			if segEnd.LessThan(endSeq) {
  1105  				endSeq = segEnd
  1106  			}
  1107  			sb := header.SACKBlock{Start: startSeq, End: endSeq}
  1108  			// SetPipe():
  1109  			//
  1110  			// After initializing pipe to zero, the following steps are
  1111  			// taken for each octet 'S1' in the sequence space between
  1112  			// HighACK and HighData that has not been SACKed:
  1113  			if !s1.sequenceNumber.LessThan(s.SndNxt) {
  1114  				break
  1115  			}
  1116  			if s.ep.scoreboard.IsSACKED(sb) {
  1117  				continue
  1118  			}
  1119  
  1120  			// SetPipe():
  1121  			//
  1122  			//    (a) If IsLost(S1) returns false, Pipe is incremened by 1.
  1123  			//
  1124  			// NOTE: here we mark the whole segment as lost. We do not try
  1125  			// and test every byte in our write buffer as we maintain our
  1126  			// pipe in terms of outstanding packets and not bytes.
  1127  			if !s.ep.scoreboard.IsRangeLost(sb) {
  1128  				pipe++
  1129  			}
  1130  			// SetPipe():
  1131  			//    (b) If S1 <= HighRxt, Pipe is incremented by 1.
  1132  			if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) {
  1133  				pipe++
  1134  			}
  1135  		}
  1136  	}
  1137  	s.Outstanding = pipe
  1138  }
  1139  
  1140  // shouldEnterRecovery returns true if the sender should enter fast recovery
  1141  // based on dupAck count and sack scoreboard.
  1142  // See RFC 6675 section 5.
  1143  func (s *sender) shouldEnterRecovery() bool {
  1144  	return s.DupAckCount >= nDupAckThreshold ||
  1145  		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
  1146  }
  1147  
  1148  // detectLoss is called when an ack is received and returns whether a loss is
  1149  // detected. It manages the state related to duplicate acks and determines if
  1150  // a retransmit is needed according to the rules in RFC 6582 (NewReno).
  1151  func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
  1152  	// We're not in fast recovery yet.
  1153  
  1154  	// If RACK is enabled and there is no reordering we should honor the
  1155  	// three duplicate ACK rule to enter recovery.
  1156  	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
  1157  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1158  		if s.rc.Reord {
  1159  			return false
  1160  		}
  1161  	}
  1162  
  1163  	if !s.isDupAck(seg) {
  1164  		s.DupAckCount = 0
  1165  		return false
  1166  	}
  1167  
  1168  	s.DupAckCount++
  1169  
  1170  	// Do not enter fast recovery until we reach nDupAckThreshold or the
  1171  	// first unacknowledged byte is considered lost as per SACK scoreboard.
  1172  	if !s.shouldEnterRecovery() {
  1173  		// RFC 6675 Step 3.
  1174  		s.FastRecovery.HighRxt = s.SndUna - 1
  1175  		// Do run SetPipe() to calculate the outstanding segments.
  1176  		s.SetPipe()
  1177  		s.state = tcpip.Disorder
  1178  		return false
  1179  	}
  1180  
  1181  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
  1182  	//
  1183  	// We only do the check here, the incrementing of last to the highest
  1184  	// sequence number transmitted till now is done when enterRecovery
  1185  	// is invoked.
  1186  	//
  1187  	// Note that we only enter recovery when at least one more byte of data
  1188  	// beyond s.fr.last (the highest byte that was outstanding when fast
  1189  	// retransmit was last entered) is acked.
  1190  	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
  1191  		s.DupAckCount = 0
  1192  		return false
  1193  	}
  1194  	s.cc.HandleLossDetected()
  1195  	s.enterRecovery()
  1196  	return true
  1197  }
  1198  
  1199  // isDupAck determines if seg is a duplicate ack as defined in
  1200  // https://tools.ietf.org/html/rfc5681#section-2.
  1201  func (s *sender) isDupAck(seg *segment) bool {
  1202  	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
  1203  	// can leverage the SACK information to determine when an incoming ACK is a
  1204  	// "duplicate" (e.g., if the ACK contains previously unknown SACK
  1205  	// information).
  1206  	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
  1207  		return false
  1208  	}
  1209  
  1210  	// (a) The receiver of the ACK has outstanding data.
  1211  	return s.SndUna != s.SndNxt &&
  1212  		// (b) The incoming acknowledgment carries no data.
  1213  		seg.logicalLen() == 0 &&
  1214  		// (c) The SYN and FIN bits are both off.
  1215  		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
  1216  		// (d) the ACK number is equal to the greatest acknowledgment received on
  1217  		// the given connection (TCP.UNA from RFC793).
  1218  		seg.ackNumber == s.SndUna &&
  1219  		// (e) the advertised window in the incoming acknowledgment equals the
  1220  		// advertised window in the last incoming acknowledgment.
  1221  		s.SndWnd == seg.window
  1222  }
  1223  
  1224  // Iterate the writeList and update RACK for each segment which is newly acked
  1225  // either cumulatively or selectively. Loop through the segments which are
  1226  // sacked, and update the RACK related variables and check for reordering.
  1227  // Returns true when the DSACK block has been detected in the received ACK.
  1228  //
  1229  // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1230  // steps 2 and 3.
  1231  func (s *sender) walkSACK(rcvdSeg *segment) bool {
  1232  	s.rc.setDSACKSeen(false)
  1233  
  1234  	// Look for DSACK block.
  1235  	hasDSACK := false
  1236  	idx := 0
  1237  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1238  	if checkDSACK(rcvdSeg) {
  1239  		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
  1240  		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
  1241  		// numDSACK can be zero when DSACK is sent for subsegments.
  1242  		if numDSACK < 1 {
  1243  			numDSACK = 1
  1244  		}
  1245  		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
  1246  		s.rc.setDSACKSeen(true)
  1247  		idx = 1
  1248  		n--
  1249  		hasDSACK = true
  1250  	}
  1251  
  1252  	if n == 0 {
  1253  		return hasDSACK
  1254  	}
  1255  
  1256  	// Sort the SACK blocks. The first block is the most recent unacked
  1257  	// block. The following blocks can be in arbitrary order.
  1258  	sackBlocks := make([]header.SACKBlock, n)
  1259  	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
  1260  	sort.Slice(sackBlocks, func(i, j int) bool {
  1261  		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
  1262  	})
  1263  
  1264  	seg := s.writeList.Front()
  1265  	for _, sb := range sackBlocks {
  1266  		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
  1267  			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
  1268  				s.rc.update(seg, rcvdSeg)
  1269  				s.rc.detectReorder(seg)
  1270  				seg.acked = true
  1271  				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
  1272  			}
  1273  			seg = seg.Next()
  1274  		}
  1275  	}
  1276  	return hasDSACK
  1277  }
  1278  
  1279  // checkDSACK checks if a DSACK is reported.
  1280  func checkDSACK(rcvdSeg *segment) bool {
  1281  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1282  	if n == 0 {
  1283  		return false
  1284  	}
  1285  
  1286  	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
  1287  	// Check if SACK block is invalid.
  1288  	if sb.End.LessThan(sb.Start) {
  1289  		return false
  1290  	}
  1291  
  1292  	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
  1293  	// at most one SACK block. DSACK is detected in the below two cases:
  1294  	//	* If the SACK sequence space is less than this cumulative ACK, it is
  1295  	//		an indication that the segment identified by the SACK block has
  1296  	//		been received more than once by the receiver.
  1297  	//	* If the sequence space in the first SACK block is greater than the
  1298  	//		cumulative ACK, then the sender next compares the sequence space
  1299  	//		in the first SACK block with the sequence space in the second SACK
  1300  	//		block, if there is one. This comparison can determine if the first
  1301  	//		SACK block is reporting duplicate data that lies above the
  1302  	//		cumulative ACK.
  1303  	if sb.Start.LessThan(rcvdSeg.ackNumber) {
  1304  		return true
  1305  	}
  1306  
  1307  	if n > 1 {
  1308  		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
  1309  		if sb1.End.LessThan(sb1.Start) {
  1310  			return false
  1311  		}
  1312  
  1313  		// If the first SACK block is fully covered by second SACK
  1314  		// block, then the first block is a DSACK block.
  1315  		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
  1316  			return true
  1317  		}
  1318  	}
  1319  
  1320  	return false
  1321  }
  1322  
  1323  func (s *sender) recordRetransmitTS() {
  1324  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
  1325  	//
  1326  	// The Eifel detection algorithm is used, only upon initiation of loss
  1327  	// recovery, i.e., when either the timeout-based retransmit or the fast
  1328  	// retransmit is sent. The Eifel detection algorithm MUST NOT be
  1329  	// reinitiated after loss recovery has already started. In particular,
  1330  	// it must not be reinitiated upon subsequent timeouts for the same
  1331  	// segment, and not upon retransmitting segments other than the oldest
  1332  	// outstanding segment, e.g., during selective loss recovery.
  1333  	if s.inRecovery() {
  1334  		return
  1335  	}
  1336  
  1337  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1338  	//
  1339  	// Set a "RetransmitTS" variable to the value of the Timestamp Value
  1340  	// field of the Timestamps option included in the retransmit sent when
  1341  	// loss recovery is initiated. A TCP sender must ensure that
  1342  	// RetransmitTS does not get overwritten as loss recovery progresses,
  1343  	// e.g., in case of a second timeout and subsequent second retransmit of
  1344  	// the same octet.
  1345  	s.retransmitTS = s.ep.tsValNow()
  1346  }
  1347  
  1348  func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
  1349  	// Return if the sender has already detected spurious recovery.
  1350  	if s.spuriousRecovery {
  1351  		return
  1352  	}
  1353  
  1354  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
  1355  	//
  1356  	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
  1357  	// Timestamps option is smaller than the value of RetransmitTS, then
  1358  	// proceed to next step, else return.
  1359  	if tsEchoReply >= s.retransmitTS {
  1360  		return
  1361  	}
  1362  
  1363  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1364  	//
  1365  	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
  1366  	if hasDSACK {
  1367  		return
  1368  	}
  1369  
  1370  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1371  	//
  1372  	// If during the lifetime of the TCP connection the TCP sender has
  1373  	// previously received an ACK with a DSACK option, or the acceptable ACK
  1374  	// does not acknowledge all outstanding data, then proceed to next step,
  1375  	// else return.
  1376  	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
  1377  	if numDSACK == 0 && s.SndUna == s.SndNxt {
  1378  		return
  1379  	}
  1380  
  1381  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
  1382  	//
  1383  	// If the loss recovery has been initiated with a timeout-based
  1384  	// retransmit, then set
  1385  	//    SpuriousRecovery <- SPUR_TO (equal 1),
  1386  	// else set
  1387  	//    SpuriousRecovery <- dupacks+1
  1388  	// Set the spurious recovery variable to true as we do not differentiate
  1389  	// between fast, SACK or RTO recovery.
  1390  	s.spuriousRecovery = true
  1391  	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()
  1392  
  1393  	// RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and
  1394  	// timeout). Increment the metric for RTO only as we want to track the
  1395  	// number of timeout recoveries.
  1396  	if s.state == tcpip.RTORecovery {
  1397  		s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment()
  1398  	}
  1399  }
  1400  
  1401  // Check if the sender is in RTORecovery, FastRecovery or SACKRecovery state.
  1402  func (s *sender) inRecovery() bool {
  1403  	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
  1404  		return true
  1405  	}
  1406  	return false
  1407  }
  1408  
  1409  // handleRcvdSegment is called when a segment is received; it is responsible for
  1410  // updating the send-related state.
  1411  // +checklocks:s.ep.mu
  1412  // +checklocksalias:s.rc.snd.ep.mu=s.ep.mu
  1413  func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
  1414  	// Check if we can extract an RTT measurement from this ack.
  1415  	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
  1416  		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
  1417  		s.RTTMeasureSeqNum = s.SndNxt
  1418  	}
  1419  
  1420  	// Update Timestamp if required. See RFC7323, section-4.3.
  1421  	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
  1422  		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
  1423  	}
  1424  
  1425  	// Insert SACKBlock information into our scoreboard.
  1426  	hasDSACK := false
  1427  	if s.ep.SACKPermitted {
  1428  		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
  1429  			// Only insert the SACK block if the following holds
  1430  			// true:
  1431  			//  * SACK block acks data after the ack number in the
  1432  			//    current segment.
  1433  			//  * SACK block represents a sequence
  1434  			//    between sndUna and sndNxt (i.e. data that is
  1435  			//    currently unacked and in-flight).
  1436  			//  * SACK block that has not been SACKed already.
  1437  			//
  1438  			// NOTE: This check specifically excludes DSACK blocks
  1439  			// which have start/end before sndUna and are used to
  1440  			// indicate spurious retransmissions.
  1441  			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
  1442  				s.ep.scoreboard.Insert(sb)
  1443  				rcvdSeg.hasNewSACKInfo = true
  1444  			}
  1445  		}
  1446  
  1447  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
  1448  		// section-7.2
  1449  		//	* Step 2: Update RACK stats.
  1450  		//		If the ACK is not ignored as invalid, update the RACK.rtt
  1451  		//		to be the RTT sample calculated using this ACK, and
  1452  		//		continue.  If this ACK or SACK was for the most recently
  1453  		//		sent packet, then record the RACK.xmit_ts timestamp and
  1454  		//		RACK.end_seq sequence implied by this ACK.
  1455  		//	* Step 3: Detect packet reordering.
  1456  		//		If the ACK selectively or cumulatively acknowledges an
  1457  		//		unacknowledged and also never retransmitted sequence below
  1458  		//		RACK.fack, then the corresponding packet has been
  1459  		//		reordered and RACK.reord is set to TRUE.
  1460  		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1461  			hasDSACK = s.walkSACK(rcvdSeg)
  1462  		}
  1463  		s.SetPipe()
  1464  	}
  1465  
  1466  	ack := rcvdSeg.ackNumber
  1467  	fastRetransmit := false
  1468  	// Do not leave fast recovery, if the ACK is out of range.
  1469  	if s.FastRecovery.Active {
  1470  		// Leave fast recovery if it acknowledges all the data covered by
  1471  		// this fast recovery session.
  1472  		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
  1473  			s.leaveRecovery()
  1474  		}
  1475  	} else {
  1476  		// Detect loss by counting the duplicates and enter recovery.
  1477  		fastRetransmit = s.detectLoss(rcvdSeg)
  1478  	}
  1479  
  1480  	// See if TLP based recovery was successful.
  1481  	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1482  		s.detectTLPRecovery(ack, rcvdSeg)
  1483  	}
  1484  
  1485  	// Stash away the current window size.
  1486  	s.SndWnd = rcvdSeg.window
  1487  
  1488  	// Disable zero window probing if remote advertises a non-zero receive
  1489  	// window. This can be with an ACK to the zero window probe (where the
  1490  	// acknumber refers to the already acknowledged byte) OR to any previously
  1491  	// unacknowledged segment.
  1492  	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
  1493  		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
  1494  		s.disableZeroWindowProbing()
  1495  	}
  1496  
  1497  	// On receiving the ACK for the zero window probe, account for it and
  1498  	// skip trying to send any segment as we are still probing for
  1499  	// receive window to become non-zero.
  1500  	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
  1501  		s.unackZeroWindowProbes--
  1502  		return
  1503  	}
  1504  
  1505  	// Ignore ack if it doesn't acknowledge any new data.
  1506  	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
  1507  		s.DupAckCount = 0
  1508  
  1509  		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
  1510  		// Specifically we should only update the RTO using TSEcr if the
  1511  		// following condition holds:
  1512  		//
  1513  		//    A TSecr value received in a segment is used to update the
  1514  		//    averaged RTT measurement only if the segment acknowledges
  1515  		//    some new data, i.e., only if it advances the left edge of
  1516  		//    the send window.
  1517  		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
  1518  			s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr))
  1519  		}
  1520  
  1521  		if s.shouldSchedulePTO() {
  1522  			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
  1523  			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
  1524  			s.schedulePTO()
  1525  		} else {
  1526  			// When an ack is received we must rearm the timer.
  1527  			// RFC 6298 5.3
  1528  			s.probeTimer.disable()
  1529  			s.resendTimer.enable(s.RTO)
  1530  		}
  1531  
  1532  		// Remove all acknowledged data from the write list.
  1533  		acked := s.SndUna.Size(ack)
  1534  		s.SndUna = ack
  1535  		ackLeft := acked
  1536  		originalOutstanding := s.Outstanding
  1537  		for ackLeft > 0 {
  1538  			// We use logicalLen here because we can have FIN
  1539  			// segments (which are always at the end of list) that
  1540  			// have no data, but do consume a sequence number.
  1541  			seg := s.writeList.Front()
  1542  			datalen := seg.logicalLen()
  1543  
  1544  			if datalen > ackLeft {
  1545  				prevCount := s.pCount(seg, s.MaxPayloadSize)
  1546  				seg.TrimFront(ackLeft)
  1547  				seg.sequenceNumber.UpdateForward(ackLeft)
  1548  				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
  1549  				break
  1550  			}
  1551  
  1552  			if s.writeNext == seg {
  1553  				s.updateWriteNext(seg.Next())
  1554  			}
  1555  
  1556  			// Update the RACK fields if SACK is enabled.
  1557  			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1558  				s.rc.update(seg, rcvdSeg)
  1559  				s.rc.detectReorder(seg)
  1560  			}
  1561  
  1562  			s.writeList.Remove(seg)
  1563  
  1564  			// If SACK is enabled then only reduce outstanding if
  1565  			// the segment was not previously SACKED as these have
  1566  			// already been accounted for in SetPipe().
  1567  			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
  1568  				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
  1569  			} else {
  1570  				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
  1571  			}
  1572  			seg.DecRef()
  1573  			ackLeft -= datalen
  1574  		}
  1575  
  1576  		// Clear SACK information for all acked data.
  1577  		s.ep.scoreboard.Delete(s.SndUna)
  1578  
  1579  		// Detect if the sender entered recovery spuriously.
  1580  		if s.inRecovery() {
  1581  			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
  1582  		}
  1583  
  1584  		// If we are not in fast recovery then update the congestion
  1585  		// window based on the number of acknowledged packets.
  1586  		if !s.FastRecovery.Active {
  1587  			s.cc.Update(originalOutstanding - s.Outstanding)
  1588  			if s.FastRecovery.Last.LessThan(s.SndUna) {
  1589  				s.state = tcpip.Open
  1590  				// Update RACK when we are exiting fast or RTO
  1591  				// recovery as described in the RFC
  1592  				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
  1593  				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1594  					s.rc.exitRecovery()
  1595  				}
  1596  				s.reorderTimer.disable()
  1597  			}
  1598  		}
  1599  
  1600  		// Update the send buffer usage and notify potential waiters.
  1601  		s.ep.updateSndBufferUsage(int(acked))
  1602  
  1603  		// It is possible for s.Outstanding to drop below zero if we get
  1604  		// a retransmit timeout, reset Outstanding to zero, but later
  1605  		// get an ack that covers previously sent data.
  1606  		if s.Outstanding < 0 {
  1607  			s.Outstanding = 0
  1608  		}
  1609  
  1610  		s.SetPipe()
  1611  
  1612  		// If all outstanding data was acknowledged then disable the timer.
  1613  		// RFC 6298 Rule 5.3
  1614  		if s.SndUna == s.SndNxt {
  1615  			s.Outstanding = 0
  1616  			// Reset firstRetransmittedSegXmitTime to the zero value.
  1617  			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
  1618  			s.resendTimer.disable()
  1619  			s.probeTimer.disable()
  1620  		}
  1621  	}
  1622  
  1623  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1624  		// Update RACK reorder window.
  1625  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1626  		//	* Upon receiving an ACK:
  1627  		//	* Step 4: Update RACK reordering window
  1628  		s.rc.updateRACKReorderWindow()
  1629  
  1630  		// After the reorder window is calculated, detect any loss by checking
  1631  		// if the time elapsed after the segments are sent is greater than the
  1632  		// reorder window.
  1633  		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
  1634  			// If any segment is marked as lost by
  1635  			// RACK, enter recovery and retransmit
  1636  			// the lost segments.
  1637  			s.cc.HandleLossDetected()
  1638  			s.enterRecovery()
  1639  			fastRetransmit = true
  1640  		}
  1641  
  1642  		if s.FastRecovery.Active {
  1643  			s.rc.DoRecovery(nil, fastRetransmit)
  1644  		}
  1645  	}
  1646  
  1647  	// Now that we've popped all acknowledged data from the retransmit
  1648  	// queue, retransmit if needed.
  1649  	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
  1650  		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
  1651  		// When SACK is enabled data sending is governed by steps in
  1652  		// RFC 6675 Section 5 recovery steps  A-C.
  1653  		// See: https://tools.ietf.org/html/rfc6675#section-5.
  1654  		if s.ep.SACKPermitted {
  1655  			return
  1656  		}
  1657  	}
  1658  
  1659  	// Send more data now that some of the pending data has been ack'd, or
  1660  	// that the window opened up, or the congestion window was inflated due
  1661  	// to a duplicate ack during fast recovery. This will also re-enable
  1662  	// the retransmit timer if needed.
  1663  	s.sendData()
  1664  }
  1665  
  1666  // sendSegment sends the specified segment.
  1667  // +checklocks:s.ep.mu
  1668  func (s *sender) sendSegment(seg *segment) tcpip.Error {
  1669  	if seg.xmitCount > 0 {
  1670  		s.ep.stack.Stats().TCP.Retransmits.Increment()
  1671  		s.ep.stats.SendErrors.Retransmits.Increment()
  1672  		if s.SndCwnd < s.Ssthresh {
  1673  			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
  1674  		}
  1675  	}
  1676  	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
  1677  	seg.xmitCount++
  1678  	seg.lost = false
  1679  
  1680  	err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber)
  1681  
  1682  	// Every time a packet containing data is sent (including a
  1683  	// retransmission), if SACK is enabled and we are retransmitting data
  1684  	// then use the conservative timer described in RFC6675 Section 6.0,
  1685  	// otherwise follow the standard time described in RFC6298 Section 5.1.
  1686  	if err != nil && seg.payloadSize() != 0 {
  1687  		if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted {
  1688  			s.resendTimer.enable(s.RTO)
  1689  		} else {
  1690  			if !s.resendTimer.enabled() {
  1691  				s.resendTimer.enable(s.RTO)
  1692  			}
  1693  		}
  1694  	}
  1695  
  1696  	return err
  1697  }
  1698  
  1699  // sendSegmentFromPacketBuffer sends a new segment containing the given payload,
  1700  // flags and sequence number.
  1701  // +checklocks:s.ep.mu
  1702  // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
  1703  func (s *sender) sendSegmentFromPacketBuffer(pkt *stack.PacketBuffer, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1704  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1705  	if seq == s.RTTMeasureSeqNum {
  1706  		s.RTTMeasureTime = s.LastSendTime
  1707  	}
  1708  
  1709  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1710  
  1711  	// Remember the max sent ack.
  1712  	s.MaxSentAck = rcvNxt
  1713  
  1714  	// We need to clone the packet because sendRaw takes ownership of pkt,
  1715  	// and pkt could be reprocessed later on (i.e retrasmission).
  1716  	pkt = pkt.Clone()
  1717  	defer pkt.DecRef()
  1718  
  1719  	return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd)
  1720  }
  1721  
  1722  // sendEmptySegment sends a new empty segment, flags and sequence number.
  1723  // +checklocks:s.ep.mu
  1724  // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
  1725  func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1726  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1727  	if seq == s.RTTMeasureSeqNum {
  1728  		s.RTTMeasureTime = s.LastSendTime
  1729  	}
  1730  
  1731  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1732  
  1733  	// Remember the max sent ack.
  1734  	s.MaxSentAck = rcvNxt
  1735  
  1736  	return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd)
  1737  }
  1738  
  1739  // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
  1740  // currently.
  1741  // +checklocks:s.ep.mu
  1742  func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
  1743  	// Data packets are unlikely to be part of an ACK loop. So always send
  1744  	// an ACK for a packet w/ data.
  1745  	if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
  1746  		s.sendAck()
  1747  	}
  1748  }
  1749  
  1750  func (s *sender) updateWriteNext(seg *segment) {
  1751  	if s.writeNext != nil {
  1752  		s.writeNext.DecRef()
  1753  	}
  1754  	if seg != nil {
  1755  		seg.IncRef()
  1756  	}
  1757  	s.writeNext = seg
  1758  }
  1759  
  1760  // corkTimerExpired drains all the segments when TCP_CORK is enabled.
  1761  // +checklocks:s.ep.mu
  1762  func (s *sender) corkTimerExpired() tcpip.Error {
  1763  	// Check if the timer actually expired or if it's a spurious wake due
  1764  	// to a previously orphaned runtime timer.
  1765  	if s.corkTimer.isUninitialized() || !s.corkTimer.checkExpiration() {
  1766  		return nil
  1767  	}
  1768  
  1769  	// Assign sequence number and flags to the segment.
  1770  	seg := s.writeNext
  1771  	if seg == nil {
  1772  		return nil
  1773  	}
  1774  	seg.sequenceNumber = s.SndNxt
  1775  	seg.flags = header.TCPFlagAck | header.TCPFlagPsh
  1776  	// Drain all the segments.
  1777  	s.sendData()
  1778  	return nil
  1779  }