inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/snd.go

inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/snd.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"sort"
    21  	"time"
    22  
    23  	"inet.af/netstack/sleep"
    24  	"inet.af/netstack/sync"
    25  	"inet.af/netstack/tcpip"
    26  	"inet.af/netstack/tcpip/buffer"
    27  	"inet.af/netstack/tcpip/header"
    28  	"inet.af/netstack/tcpip/seqnum"
    29  	"inet.af/netstack/tcpip/stack"
    30  )
    31  
    32  const (
    33  	// MinRTO is the minimum allowed value for the retransmit timeout.
    34  	MinRTO = 200 * time.Millisecond
    35  
    36  	// MaxRTO is the maximum allowed value for the retransmit timeout.
    37  	MaxRTO = 120 * time.Second
    38  
    39  	// InitialCwnd is the initial congestion window.
    40  	InitialCwnd = 10
    41  
    42  	// nDupAckThreshold is the number of duplicate ACK's required
    43  	// before fast-retransmit is entered.
    44  	nDupAckThreshold = 3
    45  
    46  	// MaxRetries is the maximum number of probe retries sender does
    47  	// before timing out the connection.
    48  	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
    49  	MaxRetries = 15
    50  )
    51  
    52  // congestionControl is an interface that must be implemented by any supported
    53  // congestion control algorithm.
    54  type congestionControl interface {
    55  	// HandleLossDetected is invoked when the loss is detected by RACK or
    56  	// sender.dupAckCount >= nDupAckThreshold just before entering fast
    57  	// retransmit.
    58  	HandleLossDetected()
    59  
    60  	// HandleRTOExpired is invoked when the retransmit timer expires.
    61  	HandleRTOExpired()
    62  
    63  	// Update is invoked when processing inbound acks. It's passed the
    64  	// number of packet's that were acked by the most recent cumulative
    65  	// acknowledgement.
    66  	Update(packetsAcked int)
    67  
    68  	// PostRecovery is invoked when the sender is exiting a fast retransmit/
    69  	// recovery phase. This provides congestion control algorithms a way
    70  	// to adjust their state when exiting recovery.
    71  	PostRecovery()
    72  }
    73  
// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted.
	//
	// rcvdSeg is the segment carrying the cumulative or selective ACK
	// being processed. fastRetransmit identifies whether the connection
	// entered fast retransmit with this ACK, in which case the first
	// unacknowledged segment is to be retransmitted.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}
    84  
// sender holds the state necessary to send TCP segments.
//
// The embedded stack.TCPSenderState carries the sender variables that are
// initialized in newSender (SndWnd, SndUna, SndNxt, RTO, FastRecovery, ...);
// the remaining fields are additional bookkeeping private to this package.
//
// +stateify savable
type sender struct {
	stack.TCPSenderState

	// ep is the endpoint this sender was created for.
	ep *endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeList is the list of segments queued on this sender; its front
	// is treated as the first unacknowledged segment. writeNext is the
	// next segment of writeList to be sent and is rewound to an earlier
	// segment when data has to be retransmitted. resendTimer is the
	// retransmit timer and is bound to resendWaker in newSender.
	writeNext   *segment
	writeList   segmentList
	resendTimer timer       `state:"nosave"`
	resendWaker sleep.Waker `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer       `state:"nosave"`
	reorderWaker sleep.Waker `state:"nosave"`

	// probeTimer and probeWaker are used to schedule PTO for RACK TLP algorithm.
	probeTimer timer       `state:"nosave"`
	probeWaker sleep.Waker `state:"nosave"`

	// spuriousRecovery indicates whether the sender entered recovery
	// spuriously as described in RFC3522 Section 3.2.
	spuriousRecovery bool

	// retransmitTS is the timestamp at which the sender sends retransmitted
	// segment after entering an RTO for the first time as described in
	// RFC3522 Section 3.2.
	retransmitTS uint32
}
   157  
// rtt is a synchronization wrapper used to appease stateify. See the comment
// in sender, where it is used.
//
// +stateify savable
type rtt struct {
	// The mutex is held by sender.updateRTO while it reads and updates
	// the embedded TCPRTTState.
	sync.Mutex `state:"nosave"`

	// TCPRTTState holds the SRTT and RTTVar estimators together with the
	// SRTTInited flag recording whether a first sample has been taken.
	stack.TCPRTTState
}
   167  
   168  func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
   169  	// The sender MUST reduce the TCP data length to account for any IP or
   170  	// TCP options that it is including in the packets that it sends.
   171  	// See: https://tools.ietf.org/html/rfc6691#section-2
   172  	maxPayloadSize := int(mss) - ep.maxOptionSize()
   173  
   174  	s := &sender{
   175  		ep: ep,
   176  		TCPSenderState: stack.TCPSenderState{
   177  			SndWnd:           sndWnd,
   178  			SndUna:           iss + 1,
   179  			SndNxt:           iss + 1,
   180  			RTTMeasureSeqNum: iss + 1,
   181  			LastSendTime:     ep.stack.Clock().NowMonotonic(),
   182  			MaxPayloadSize:   maxPayloadSize,
   183  			MaxSentAck:       irs + 1,
   184  			FastRecovery: stack.TCPFastRecoveryState{
   185  				// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
   186  				Last:      iss,
   187  				HighRxt:   iss,
   188  				RescueRxt: iss,
   189  			},
   190  			RTO: 1 * time.Second,
   191  		},
   192  		gso: ep.gso.Type != stack.GSONone,
   193  	}
   194  
   195  	if s.gso {
   196  		s.ep.gso.MSS = uint16(maxPayloadSize)
   197  	}
   198  
   199  	s.cc = s.initCongestionControl(ep.cc)
   200  	s.lr = s.initLossRecovery()
   201  	s.rc.init(s, iss)
   202  
   203  	// A negative sndWndScale means that no scaling is in use, otherwise we
   204  	// store the scaling value.
   205  	if sndWndScale > 0 {
   206  		s.SndWndScale = uint8(sndWndScale)
   207  	}
   208  
   209  	s.resendTimer.init(s.ep.stack.Clock(), &s.resendWaker)
   210  	s.reorderTimer.init(s.ep.stack.Clock(), &s.reorderWaker)
   211  	s.probeTimer.init(s.ep.stack.Clock(), &s.probeWaker)
   212  
   213  	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
   214  
   215  	// Initialize SACK Scoreboard after updating max payload size as we use
   216  	// the maxPayloadSize as the smss when determining if a segment is lost
   217  	// etc.
   218  	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)
   219  
   220  	// Get Stack wide config.
   221  	var minRTO tcpip.TCPMinRTOOption
   222  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
   223  		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
   224  	}
   225  	s.minRTO = time.Duration(minRTO)
   226  
   227  	var maxRTO tcpip.TCPMaxRTOOption
   228  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
   229  		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
   230  	}
   231  	s.maxRTO = time.Duration(maxRTO)
   232  
   233  	var maxRetries tcpip.TCPMaxRetriesOption
   234  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
   235  		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
   236  	}
   237  	s.maxRetries = uint32(maxRetries)
   238  
   239  	return s
   240  }
   241  
   242  // initCongestionControl initializes the specified congestion control module and
   243  // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to
   244  // their initial values.
   245  func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
   246  	s.SndCwnd = InitialCwnd
   247  	// Set sndSsthresh to the maximum int value, which depends on the
   248  	// platform.
   249  	s.Ssthresh = int(^uint(0) >> 1)
   250  
   251  	switch congestionControlName {
   252  	case ccCubic:
   253  		return newCubicCC(s)
   254  	case ccReno:
   255  		fallthrough
   256  	default:
   257  		return newRenoCC(s)
   258  	}
   259  }
   260  
   261  // initLossRecovery initiates the loss recovery algorithm for the sender.
   262  func (s *sender) initLossRecovery() lossRecovery {
   263  	if s.ep.SACKPermitted {
   264  		return newSACKRecovery(s)
   265  	}
   266  	return newRenoRecovery(s)
   267  }
   268  
   269  // updateMaxPayloadSize updates the maximum payload size based on the given
   270  // MTU. If this is in response to "packet too big" control packets (indicated
   271  // by the count argument), it also reduces the number of outstanding packets and
   272  // attempts to retransmit the first packet above the MTU size.
   273  func (s *sender) updateMaxPayloadSize(mtu, count int) {
   274  	m := mtu - header.TCPMinimumSize
   275  
   276  	m -= s.ep.maxOptionSize()
   277  
   278  	// We don't adjust up for now.
   279  	if m >= s.MaxPayloadSize {
   280  		return
   281  	}
   282  
   283  	// Make sure we can transmit at least one byte.
   284  	if m <= 0 {
   285  		m = 1
   286  	}
   287  
   288  	oldMSS := s.MaxPayloadSize
   289  	s.MaxPayloadSize = m
   290  	if s.gso {
   291  		s.ep.gso.MSS = uint16(m)
   292  	}
   293  
   294  	if count == 0 {
   295  		// updateMaxPayloadSize is also called when the sender is created.
   296  		// and there is no data to send in such cases. Return immediately.
   297  		return
   298  	}
   299  
   300  	// Update the scoreboard's smss to reflect the new lowered
   301  	// maxPayloadSize.
   302  	s.ep.scoreboard.smss = uint16(m)
   303  
   304  	s.Outstanding -= count
   305  	if s.Outstanding < 0 {
   306  		s.Outstanding = 0
   307  	}
   308  
   309  	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
   310  	// if it is already before such a packet.
   311  	nextSeg := s.writeNext
   312  	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
   313  		if seg == s.writeNext {
   314  			// We got to writeNext before we could find a segment
   315  			// exceeding the MTU.
   316  			break
   317  		}
   318  
   319  		if nextSeg == s.writeNext && seg.data.Size() > m {
   320  			// We found a segment exceeding the MTU. Rewind
   321  			// writeNext and try to retransmit it.
   322  			nextSeg = seg
   323  		}
   324  
   325  		if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   326  			// Update sackedOut for new maximum payload size.
   327  			s.SackedOut -= s.pCount(seg, oldMSS)
   328  			s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
   329  		}
   330  	}
   331  
   332  	// Since we likely reduced the number of outstanding packets, we may be
   333  	// ready to send some more.
   334  	s.writeNext = nextSeg
   335  	s.sendData()
   336  }
   337  
   338  // sendAck sends an ACK segment.
   339  func (s *sender) sendAck() {
   340  	s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.SndNxt)
   341  }
   342  
   343  // updateRTO updates the retransmit timeout when a new roud-trip time is
   344  // available. This is done in accordance with section 2 of RFC 6298.
   345  func (s *sender) updateRTO(rtt time.Duration) {
   346  	s.rtt.Lock()
   347  	if !s.rtt.TCPRTTState.SRTTInited {
   348  		s.rtt.TCPRTTState.RTTVar = rtt / 2
   349  		s.rtt.TCPRTTState.SRTT = rtt
   350  		s.rtt.TCPRTTState.SRTTInited = true
   351  	} else {
   352  		diff := s.rtt.TCPRTTState.SRTT - rtt
   353  		if diff < 0 {
   354  			diff = -diff
   355  		}
   356  		// Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when
   357  		// no timestamps are available.
   358  		if !s.ep.SendTSOk {
   359  			s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4
   360  			s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8
   361  		} else {
   362  			// When we are taking RTT measurements of every ACK then
   363  			// we need to use a modified method as specified in
   364  			// https://tools.ietf.org/html/rfc7323#appendix-G
   365  			if s.Outstanding == 0 {
   366  				s.rtt.Unlock()
   367  				return
   368  			}
   369  			// Netstack measures congestion window/inflight all in
   370  			// terms of packets and not bytes. This is similar to
   371  			// how linux also does cwnd and inflight. In practice
   372  			// this approximation works as expected.
   373  			expectedSamples := math.Ceil(float64(s.Outstanding) / 2)
   374  
   375  			// alpha & beta values are the original values as recommended in
   376  			// https://tools.ietf.org/html/rfc6298#section-2.3.
   377  			const alpha = 0.125
   378  			const beta = 0.25
   379  
   380  			alphaPrime := alpha / expectedSamples
   381  			betaPrime := beta / expectedSamples
   382  			rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds()
   383  			srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds()
   384  			s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second))
   385  			s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second))
   386  		}
   387  	}
   388  
   389  	s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar
   390  	s.rtt.Unlock()
   391  	if s.RTO < s.minRTO {
   392  		s.RTO = s.minRTO
   393  	}
   394  	if s.RTO > s.maxRTO {
   395  		s.RTO = s.maxRTO
   396  	}
   397  }
   398  
   399  // resendSegment resends the first unacknowledged segment.
   400  func (s *sender) resendSegment() {
   401  	// Don't use any segments we already sent to measure RTT as they may
   402  	// have been affected by packets being lost.
   403  	s.RTTMeasureSeqNum = s.SndNxt
   404  
   405  	// Resend the segment.
   406  	if seg := s.writeList.Front(); seg != nil {
   407  		if seg.data.Size() > s.MaxPayloadSize {
   408  			s.splitSeg(seg, s.MaxPayloadSize)
   409  		}
   410  
   411  		// See: RFC 6675 section 5 Step 4.3
   412  		//
   413  		// To prevent retransmission, set both the HighRXT and RescueRXT
   414  		// to the highest sequence number in the retransmitted segment.
   415  		s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1
   416  		s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1
   417  		s.sendSegment(seg)
   418  		s.ep.stack.Stats().TCP.FastRetransmit.Increment()
   419  		s.ep.stats.SendErrors.FastRetransmit.Increment()
   420  
   421  		// Run SetPipe() as per RFC 6675 section 5 Step 4.4
   422  		s.SetPipe()
   423  	}
   424  }
   425  
// retransmitTimerExpired is called when the retransmit timer expires, and
// unacknowledged segments are assumed lost, and thus need to be resent.
//
// It doubles the RTO (bounded by maxRTO and by the time left under the user
// timeout), leaves fast recovery if it was active, notifies congestion
// control, discards all SACK information, rewinds writeNext to the front of
// the write list and then either sends a zero window probe or resumes sending
// data.
//
// Returns true if the connection is still usable, or false if the connection
// is deemed lost. A spurious wake-up or an empty write list returns true
// without retransmitting anything.
func (s *sender) retransmitTimerExpired() bool {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if !s.resendTimer.checkExpiration() {
		return true
	}

	// Initialize the variables used to detect spurious recovery after
	// entering RTO.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
	// when writeList is empty. Remove this once we have a proper fix for this
	// issue.
	if s.writeList.Front() == nil {
		return true
	}

	s.ep.stack.Stats().TCP.Timeouts.Increment()
	s.ep.stats.SendErrors.Timeouts.Increment()

	// Set TLPRxtOut to false according to
	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
	s.rc.tlpRxtOut = false

	// Work out how much time is left before the connection is given up.
	// When a user timeout is set, the time left is that timeout minus the
	// time elapsed since the original transmission of the first
	// retransmitted segment. Without a user timeout the budget is simply
	// maxRTO, so only the retry limits below can end the connection.
	uto := s.ep.userTimeout

	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		// We store the original xmitTime of the segment that we are
		// about to retransmit as the retransmission time. This is
		// required as by the time the retransmitTimer has expired the
		// segment has already been sent and unacked for the RTO at the
		// time the segment was sent.
		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
	}

	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
	remaining := s.maxRTO
	if uto != 0 {
		// Cap to the user specified timeout if one is specified.
		remaining = uto - elapsed
	}

	// Always honor the user-timeout irrespective of whether the zero
	// window probes were acknowledged.
	// net/ipv4/tcp_timer.c::tcp_probe_timer()
	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
		return false
	}

	// Set new timeout. The timer will be restarted by the call to sendData
	// below.
	s.RTO *= 2
	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
	if s.RTO > s.maxRTO {
		s.RTO = s.maxRTO
	}

	// Cap RTO to remaining time.
	if s.RTO > remaining {
		s.RTO = remaining
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
	//
	// Retransmit timeouts:
	//     After a retransmit timeout, record the highest sequence number
	//     transmitted in the variable recover, and exit the fast recovery
	//     procedure if applicable.
	s.FastRecovery.Last = s.SndNxt - 1

	if s.FastRecovery.Active {
		// We were attempting fast recovery but were not successful.
		// Leave the state. We don't need to update ssthresh because it
		// has already been updated when entered fast-recovery.
		s.leaveRecovery()
	}

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	s.recordRetransmitTS()

	s.state = tcpip.RTORecovery
	s.cc.HandleRTOExpired()

	// Mark the next segment to be sent as the first unacknowledged one and
	// start sending again. Set the number of outstanding packets to 0 so
	// that we'll be able to retransmit.
	//
	// We'll keep on transmitting (or retransmitting) as we get acks for
	// the data we transmit.
	s.Outstanding = 0

	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
	//
	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
	//  discard data that has already been selectively acknowledged. As a
	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
	//  information gathered from a receiver upon a retransmission timeout
	//  (RTO) "since the timeout might indicate that the data receiver has
	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
	//  information in determining which data to retransmit."
	//
	// NOTE: We take the stricter interpretation and just expunge all
	// information as we lack more rigorous checks to validate if the SACK
	// information is usable after an RTO.
	s.ep.scoreboard.Reset()
	s.writeNext = s.writeList.Front()

	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
	// zero receive window after retransmission interval and we have data to
	// send.
	if s.zeroWindowProbing {
		s.sendZeroWindowProbe()
		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
		// indefinitely.  As long as the receiving TCP continues to send
		// acknowledgments in response to the probe segments, the sending TCP
		// MUST allow the connection to stay open.
		return true
	}

	seg := s.writeNext
	// RFC 1122 4.2.3.5: Close the connection when the number of
	// retransmissions for this segment is beyond a limit.
	if seg != nil && seg.xmitCount > s.maxRetries {
		return false
	}

	s.sendData()

	return true
}
   568  
   569  // pCount returns the number of packets in the segment. Due to GSO, a segment
   570  // can be composed of multiple packets.
   571  func (s *sender) pCount(seg *segment, maxPayloadSize int) int {
   572  	size := seg.data.Size()
   573  	if size == 0 {
   574  		return 1
   575  	}
   576  
   577  	return (size-1)/maxPayloadSize + 1
   578  }
   579  
   580  // splitSeg splits a given segment at the size specified and inserts the
   581  // remainder as a new segment after the current one in the write list.
   582  func (s *sender) splitSeg(seg *segment, size int) {
   583  	if seg.data.Size() <= size {
   584  		return
   585  	}
   586  	// Split this segment up.
   587  	nSeg := seg.clone()
   588  	nSeg.data.TrimFront(size)
   589  	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
   590  	s.writeList.InsertAfter(seg, nSeg)
   591  
   592  	// The segment being split does not carry PUSH flag because it is
   593  	// followed by the newly split segment.
   594  	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
   595  	// segment (i.e., when there is no more queued data to be sent).
   596  	// Linux removes PSH flag only when the segment is being split over MSS
   597  	// and retains it when we are splitting the segment over lack of sender
   598  	// window space.
   599  	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
   600  	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
   601  	if seg.data.Size() > s.MaxPayloadSize {
   602  		seg.flags ^= header.TCPFlagPsh
   603  	}
   604  
   605  	seg.data.CapLength(size)
   606  }
   607  
// NextSeg implements the RFC6675 NextSeg() operation.
//
// NextSeg starts scanning the writeList starting from nextSegHint and returns
// the hint to be passed on the next call to NextSeg. This is required to avoid
// iterating the write list repeatedly when NextSeg is invoked in a loop during
// recovery. The returned hint will be nil if there are no more segments that
// can match rules defined by NextSeg operation in RFC6675.
//
// rescueRtx will be true only if nextSeg is a rescue retransmission as
// described by Step 4) of the NextSeg algorithm. Note that when no rule
// yields a segment at all the function returns (nil, nil, true), so callers
// have to check nextSeg for nil.
//
// As a side effect, any already-transmitted segment visited by the scan that
// is larger than the scoreboard's SMSS is split in the write list.
func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
	// s3 and s4 remember the candidates for rules (3) and (4) while the
	// scan keeps looking for a rule (1) match.
	var s3 *segment
	var s4 *segment
	// Step 1.
	for seg := nextSegHint; seg != nil; seg = seg.Next() {
		// Stop iteration if we hit a segment that has never been
		// transmitted (i.e. either it has no assigned sequence number
		// or if it does have one, it's >= the next sequence number
		// to be sent [i.e. >= s.sndNxt]).
		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
			hint = nil
			break
		}
		segSeq := seg.sequenceNumber
		if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) {
			s.splitSeg(seg, int(smss))
		}

		// See RFC 6675 Section 4
		//
		//     1. If there exists a smallest unSACKED sequence number
		//     'S2' that meets the following 3 criteria for determining
		//     loss, the sequence range of one segment of up to SMSS
		//     octets starting with S2 MUST be returned.
		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
			// NextSeg():
			//
			//    (1.a) S2 is greater than HighRxt
			//    (1.b) S2 is less than highest octet covered by
			//    any received SACK.
			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
				// NextSeg():
				//     (1.c) IsLost(S2) returns true.
				if s.ep.scoreboard.IsLost(segSeq) {
					return seg, seg.Next(), false
				}

				// NextSeg():
				//
				// (3): If the conditions for rules (1) and (2)
				// fail, but there exists an unSACKed sequence
				// number S3 that meets the criteria for
				// detecting loss given in steps 1.a and 1.b
				// above (specifically excluding (1.c)) then one
				// segment of up to SMSS octets starting with S3
				// SHOULD be returned.
				//
				// Only the first such segment is remembered.
				if s3 == nil {
					s3 = seg
					hint = seg.Next()
				}
			}
			// NextSeg():
			//
			//     (4) If the conditions for (1), (2) and (3) fail,
			//     but there exists outstanding unSACKED data, we
			//     provide the opportunity for a single "rescue"
			//     retransmission per entry into loss recovery. If
			//     HighACK is greater than RescueRxt (or RescueRxt
			//     is undefined), then one segment of up to SMSS
			//     octets that MUST include the highest outstanding
			//     unSACKed sequence number SHOULD be returned, and
			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
			//     be updated.
			//
			// s4 tracks the unSACKed segment with the highest
			// sequence number seen so far.
			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
				if s4 != nil {
					if s4.sequenceNumber.LessThan(segSeq) {
						s4 = seg
					}
				} else {
					s4 = seg
				}
			}
		}
	}

	// If we got here then no segment matched step (1).
	// Step (2): "If no sequence number 'S2' per rule (1)
	// exists but there exists available unsent data and the
	// receiver's advertised window allows, the sequence
	// range of one segment of up to SMSS octets of
	// previously unsent data starting with sequence number
	// HighData+1 MUST be returned."
	for seg := s.writeNext; seg != nil; seg = seg.Next() {
		// Skip segments that have already been transmitted.
		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
			continue
		}
		// We do not split the segment here to <= smss as it has
		// potentially not been assigned a sequence number yet.
		return seg, nil, false
	}

	// Rule (3) takes precedence over the rule (4) rescue retransmission.
	if s3 != nil {
		return s3, hint, false
	}

	return s4, nil, true
}
   715  
   716  // maybeSendSegment tries to send the specified segment and either coalesces
   717  // other segments into this one or splits the specified segment based on the
   718  // lower of the specified limit value or the receivers window size specified by
   719  // end.
   720  func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
   721  	// We abuse the flags field to determine if we have already
   722  	// assigned a sequence number to this segment.
   723  	if !s.isAssignedSequenceNumber(seg) {
   724  		// Merge segments if allowed.
   725  		if seg.data.Size() != 0 {
   726  			available := int(s.SndNxt.Size(end))
   727  			if available > limit {
   728  				available = limit
   729  			}
   730  
   731  			// nextTooBig indicates that the next segment was too
   732  			// large to entirely fit in the current segment. It
   733  			// would be possible to split the next segment and merge
   734  			// the portion that fits, but unexpectedly splitting
   735  			// segments can have user visible side-effects which can
   736  			// break applications. For example, RFC 7766 section 8
   737  			// says that the length and data of a DNS response
   738  			// should be sent in the same TCP segment to avoid
   739  			// triggering bugs in poorly written DNS
   740  			// implementations.
   741  			var nextTooBig bool
   742  			for nSeg := seg.Next(); nSeg != nil && nSeg.data.Size() != 0; nSeg = seg.Next() {
   743  				if seg.data.Size()+nSeg.data.Size() > available {
   744  					nextTooBig = true
   745  					break
   746  				}
   747  				seg.merge(nSeg)
   748  				s.writeList.Remove(nSeg)
   749  				nSeg.decRef()
   750  			}
   751  			if !nextTooBig && seg.data.Size() < available {
   752  				// Segment is not full.
   753  				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
   754  					// Nagle's algorithm. From Wikipedia:
   755  					//   Nagle's algorithm works by
   756  					//   combining a number of small
   757  					//   outgoing messages and sending them
   758  					//   all at once. Specifically, as long
   759  					//   as there is a sent packet for which
   760  					//   the sender has received no
   761  					//   acknowledgment, the sender should
   762  					//   keep buffering its output until it
   763  					//   has a full packet's worth of
   764  					//   output, thus allowing output to be
   765  					//   sent all at once.
   766  					return false
   767  				}
   768  				// With TCP_CORK, hold back until minimum of the available
   769  				// send space and MSS.
   770  				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
   771  				// timeout.
   772  				if seg.data.Size() < s.MaxPayloadSize && s.ep.ops.GetCorkOption() {
   773  					return false
   774  				}
   775  			}
   776  		}
   777  
   778  		// Assign flags. We don't do it above so that we can merge
   779  		// additional data if Nagle holds the segment.
   780  		seg.sequenceNumber = s.SndNxt
   781  		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
   782  	}
   783  
   784  	var segEnd seqnum.Value
   785  	if seg.data.Size() == 0 {
   786  		if s.writeList.Back() != seg {
   787  			panic("FIN segments must be the final segment in the write list.")
   788  		}
   789  		seg.flags = header.TCPFlagAck | header.TCPFlagFin
   790  		segEnd = seg.sequenceNumber.Add(1)
   791  		// Update the state to reflect that we have now
   792  		// queued a FIN.
   793  		switch s.ep.EndpointState() {
   794  		case StateCloseWait:
   795  			s.ep.setEndpointState(StateLastAck)
   796  		default:
   797  			s.ep.setEndpointState(StateFinWait1)
   798  		}
   799  	} else {
   800  		// We're sending a non-FIN segment.
   801  		if seg.flags&header.TCPFlagFin != 0 {
   802  			panic("Netstack queues FIN segments without data.")
   803  		}
   804  
   805  		if !seg.sequenceNumber.LessThan(end) {
   806  			return false
   807  		}
   808  
   809  		available := int(seg.sequenceNumber.Size(end))
   810  		if available == 0 {
   811  			return false
   812  		}
   813  
   814  		// If the whole segment or at least 1MSS sized segment cannot
   815  		// be accomodated in the receiver advertized window, skip
   816  		// splitting and sending of the segment. ref:
   817  		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
   818  		//
   819  		// Linux checks this for all segment transmits not triggered by
   820  		// a probe timer. On this condition, it defers the segment split
   821  		// and transmit to a short probe timer.
   822  		//
   823  		// ref: include/net/tcp.h::tcp_check_probe_timer()
   824  		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
   825  		//
   826  		// Instead of defining a new transmit timer, we attempt to split
   827  		// the segment right here if there are no pending segments. If
   828  		// there are pending segments, segment transmits are deferred to
   829  		// the retransmit timer handler.
   830  		if s.SndUna != s.SndNxt {
   831  			switch {
   832  			case available >= seg.data.Size():
   833  				// OK to send, the whole segments fits in the
   834  				// receiver's advertised window.
   835  			case available >= s.MaxPayloadSize:
   836  				// OK to send, at least 1 MSS sized segment fits
   837  				// in the receiver's advertised window.
   838  			default:
   839  				return false
   840  			}
   841  		}
   842  
   843  		// The segment size limit is computed as a function of sender
   844  		// congestion window and MSS. When sender congestion window is >
   845  		// 1, this limit can be larger than MSS. Ensure that the
   846  		// currently available send space is not greater than minimum of
   847  		// this limit and MSS.
   848  		if available > limit {
   849  			available = limit
   850  		}
   851  
   852  		// If GSO is not in use then cap available to
   853  		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
   854  		// the host GSO logic will cap the segment to the correct size.
   855  		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
   856  			available = s.MaxPayloadSize
   857  		}
   858  
   859  		if seg.data.Size() > available {
   860  			s.splitSeg(seg, available)
   861  		}
   862  
   863  		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
   864  	}
   865  
   866  	s.sendSegment(seg)
   867  
   868  	// Update sndNxt if we actually sent new data (as opposed to
   869  	// retransmitting some previously sent data).
   870  	if s.SndNxt.LessThan(segEnd) {
   871  		s.SndNxt = segEnd
   872  	}
   873  
   874  	return true
   875  }
   876  
   877  func (s *sender) sendZeroWindowProbe() {
   878  	ack, win := s.ep.rcv.getSendParams()
   879  	s.unackZeroWindowProbes++
   880  	// Send a zero window probe with sequence number pointing to
   881  	// the last acknowledged byte.
   882  	s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.SndUna-1, ack, win)
   883  	// Rearm the timer to continue probing.
   884  	s.resendTimer.enable(s.RTO)
   885  }
   886  
   887  func (s *sender) enableZeroWindowProbing() {
   888  	s.zeroWindowProbing = true
   889  	// We piggyback the probing on the retransmit timer with the
   890  	// current retranmission interval, as we may start probing while
   891  	// segment retransmissions.
   892  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
   893  		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
   894  	}
   895  	s.resendTimer.enable(s.RTO)
   896  }
   897  
   898  func (s *sender) disableZeroWindowProbing() {
   899  	s.zeroWindowProbing = false
   900  	s.unackZeroWindowProbes = 0
   901  	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
   902  	s.resendTimer.disable()
   903  }
   904  
   905  func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
   906  	if dataSent {
   907  		// We sent data, so we should stop the keepalive timer to ensure
   908  		// that no keepalives are sent while there is pending data.
   909  		s.ep.disableKeepaliveTimer()
   910  	}
   911  
   912  	// If the sender has advertized zero receive window and we have
   913  	// data to be sent out, start zero window probing to query the
   914  	// the remote for it's receive window size.
   915  	if s.writeNext != nil && s.SndWnd == 0 {
   916  		s.enableZeroWindowProbing()
   917  	}
   918  
   919  	// If we have no more pending data, start the keepalive timer.
   920  	if s.SndUna == s.SndNxt {
   921  		s.ep.resetKeepaliveTimer(false)
   922  	} else {
   923  		// Enable timers if we have pending data.
   924  		if shouldScheduleProbe && s.shouldSchedulePTO() {
   925  			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
   926  			s.schedulePTO()
   927  		} else if !s.resendTimer.enabled() {
   928  			s.probeTimer.disable()
   929  			if s.Outstanding > 0 {
   930  				// Enable the resend timer if it's not enabled yet and there is
   931  				// outstanding data.
   932  				s.resendTimer.enable(s.RTO)
   933  			}
   934  		}
   935  	}
   936  }
   937  
   938  // sendData sends new data segments. It is called when data becomes available or
   939  // when the send window opens up.
   940  func (s *sender) sendData() {
   941  	limit := s.MaxPayloadSize
   942  	if s.gso {
   943  		limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize)
   944  	}
   945  	end := s.SndUna.Add(s.SndWnd)
   946  
   947  	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
   948  	// "A TCP SHOULD set cwnd to no more than RW before beginning
   949  	// transmission if the TCP has not sent data in the interval exceeding
   950  	// the retrasmission timeout."
   951  	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
   952  		if s.SndCwnd > InitialCwnd {
   953  			s.SndCwnd = InitialCwnd
   954  		}
   955  	}
   956  
   957  	var dataSent bool
   958  	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
   959  		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
   960  		if cwndLimit < limit {
   961  			limit = cwndLimit
   962  		}
   963  		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   964  			// Move writeNext along so that we don't try and scan data that
   965  			// has already been SACKED.
   966  			s.writeNext = seg.Next()
   967  			continue
   968  		}
   969  		if sent := s.maybeSendSegment(seg, limit, end); !sent {
   970  			break
   971  		}
   972  		dataSent = true
   973  		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
   974  		s.writeNext = seg.Next()
   975  	}
   976  
   977  	s.postXmit(dataSent, true /* shouldScheduleProbe */)
   978  }
   979  
   980  func (s *sender) enterRecovery() {
   981  	// Initialize the variables used to detect spurious recovery after
   982  	// entering recovery.
   983  	//
   984  	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
   985  	s.spuriousRecovery = false
   986  	s.retransmitTS = 0
   987  
   988  	s.FastRecovery.Active = true
   989  	// Save state to reflect we're now in fast recovery.
   990  	//
   991  	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
   992  	// We inflate the cwnd by 3 to account for the 3 packets which triggered
   993  	// the 3 duplicate ACKs and are now not in flight.
   994  	s.SndCwnd = s.Ssthresh + 3
   995  	s.SackedOut = 0
   996  	s.DupAckCount = 0
   997  	s.FastRecovery.First = s.SndUna
   998  	s.FastRecovery.Last = s.SndNxt - 1
   999  	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
  1000  	s.FastRecovery.HighRxt = s.SndUna
  1001  	s.FastRecovery.RescueRxt = s.SndUna
  1002  
  1003  	// Record retransmitTS if the sender is not in recovery as per:
  1004  	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1005  	s.recordRetransmitTS()
  1006  
  1007  	if s.ep.SACKPermitted {
  1008  		s.state = tcpip.SACKRecovery
  1009  		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
  1010  		// Set TLPRxtOut to false according to
  1011  		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
  1012  		if s.rc.tlpRxtOut {
  1013  			// The tail loss probe triggered recovery.
  1014  			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
  1015  		}
  1016  		s.rc.tlpRxtOut = false
  1017  		return
  1018  	}
  1019  	s.state = tcpip.FastRecovery
  1020  	s.ep.stack.Stats().TCP.FastRecovery.Increment()
  1021  }
  1022  
  1023  func (s *sender) leaveRecovery() {
  1024  	s.FastRecovery.Active = false
  1025  	s.FastRecovery.MaxCwnd = 0
  1026  	s.DupAckCount = 0
  1027  
  1028  	// Deflate cwnd. It had been artificially inflated when new dups arrived.
  1029  	s.SndCwnd = s.Ssthresh
  1030  	s.cc.PostRecovery()
  1031  }
  1032  
// isAssignedSequenceNumber reports whether seg has been assigned a sequence
// number.
//
// It relies on the fact that flags are only set once a sequence number is
// assigned, which is only done right before the segment is sent. As a result,
// any segment with a non-zero flags field has a valid sequence number assigned
// to it.
func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
	return seg.flags != 0
}
  1040  
  1041  // SetPipe implements the SetPipe() function described in RFC6675. Netstack
  1042  // maintains the congestion window in number of packets and not bytes, so
  1043  // SetPipe() here measures number of outstanding packets rather than actual
  1044  // outstanding bytes in the network.
  1045  func (s *sender) SetPipe() {
  1046  	// If SACK isn't permitted or it is permitted but recovery is not active
  1047  	// then ignore pipe calculations.
  1048  	if !s.ep.SACKPermitted || !s.FastRecovery.Active {
  1049  		return
  1050  	}
  1051  	pipe := 0
  1052  	smss := seqnum.Size(s.ep.scoreboard.SMSS())
  1053  	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
  1054  		// With GSO each segment can be much larger than SMSS. So check the segment
  1055  		// in SMSS sized ranges.
  1056  		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size()))
  1057  		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
  1058  			endSeq := startSeq.Add(smss)
  1059  			if segEnd.LessThan(endSeq) {
  1060  				endSeq = segEnd
  1061  			}
  1062  			sb := header.SACKBlock{Start: startSeq, End: endSeq}
  1063  			// SetPipe():
  1064  			//
  1065  			// After initializing pipe to zero, the following steps are
  1066  			// taken for each octet 'S1' in the sequence space between
  1067  			// HighACK and HighData that has not been SACKed:
  1068  			if !s1.sequenceNumber.LessThan(s.SndNxt) {
  1069  				break
  1070  			}
  1071  			if s.ep.scoreboard.IsSACKED(sb) {
  1072  				continue
  1073  			}
  1074  
  1075  			// SetPipe():
  1076  			//
  1077  			//    (a) If IsLost(S1) returns false, Pipe is incremened by 1.
  1078  			//
  1079  			// NOTE: here we mark the whole segment as lost. We do not try
  1080  			// and test every byte in our write buffer as we maintain our
  1081  			// pipe in terms of oustanding packets and not bytes.
  1082  			if !s.ep.scoreboard.IsRangeLost(sb) {
  1083  				pipe++
  1084  			}
  1085  			// SetPipe():
  1086  			//    (b) If S1 <= HighRxt, Pipe is incremented by 1.
  1087  			if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) {
  1088  				pipe++
  1089  			}
  1090  		}
  1091  	}
  1092  	s.Outstanding = pipe
  1093  }
  1094  
  1095  // shouldEnterRecovery returns true if the sender should enter fast recovery
  1096  // based on dupAck count and sack scoreboard.
  1097  // See RFC 6675 section 5.
  1098  func (s *sender) shouldEnterRecovery() bool {
  1099  	return s.DupAckCount >= nDupAckThreshold ||
  1100  		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
  1101  }
  1102  
  1103  // detectLoss is called when an ack is received and returns whether a loss is
  1104  // detected. It manages the state related to duplicate acks and determines if
  1105  // a retransmit is needed according to the rules in RFC 6582 (NewReno).
  1106  func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
  1107  	// We're not in fast recovery yet.
  1108  
  1109  	// If RACK is enabled and there is no reordering we should honor the
  1110  	// three duplicate ACK rule to enter recovery.
  1111  	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
  1112  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1113  		if s.rc.Reord {
  1114  			return false
  1115  		}
  1116  	}
  1117  
  1118  	if !s.isDupAck(seg) {
  1119  		s.DupAckCount = 0
  1120  		return false
  1121  	}
  1122  
  1123  	s.DupAckCount++
  1124  
  1125  	// Do not enter fast recovery until we reach nDupAckThreshold or the
  1126  	// first unacknowledged byte is considered lost as per SACK scoreboard.
  1127  	if !s.shouldEnterRecovery() {
  1128  		// RFC 6675 Step 3.
  1129  		s.FastRecovery.HighRxt = s.SndUna - 1
  1130  		// Do run SetPipe() to calculate the outstanding segments.
  1131  		s.SetPipe()
  1132  		s.state = tcpip.Disorder
  1133  		return false
  1134  	}
  1135  
  1136  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
  1137  	//
  1138  	// We only do the check here, the incrementing of last to the highest
  1139  	// sequence number transmitted till now is done when enterRecovery
  1140  	// is invoked.
  1141  	//
  1142  	// Note that we only enter recovery when at least one more byte of data
  1143  	// beyond s.fr.last (the highest byte that was outstanding when fast
  1144  	// retransmit was last entered) is acked.
  1145  	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
  1146  		s.DupAckCount = 0
  1147  		return false
  1148  	}
  1149  	s.cc.HandleLossDetected()
  1150  	s.enterRecovery()
  1151  	return true
  1152  }
  1153  
  1154  // isDupAck determines if seg is a duplicate ack as defined in
  1155  // https://tools.ietf.org/html/rfc5681#section-2.
  1156  func (s *sender) isDupAck(seg *segment) bool {
  1157  	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
  1158  	// can leverage the SACK information to determine when an incoming ACK is a
  1159  	// "duplicate" (e.g., if the ACK contains previously unknown SACK
  1160  	// information).
  1161  	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
  1162  		return false
  1163  	}
  1164  
  1165  	// (a) The receiver of the ACK has outstanding data.
  1166  	return s.SndUna != s.SndNxt &&
  1167  		// (b) The incoming acknowledgment carries no data.
  1168  		seg.logicalLen() == 0 &&
  1169  		// (c) The SYN and FIN bits are both off.
  1170  		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
  1171  		// (d) the ACK number is equal to the greatest acknowledgment received on
  1172  		// the given connection (TCP.UNA from RFC793).
  1173  		seg.ackNumber == s.SndUna &&
  1174  		// (e) the advertised window in the incoming acknowledgment equals the
  1175  		// advertised window in the last incoming acknowledgment.
  1176  		s.SndWnd == seg.window
  1177  }
  1178  
  1179  // Iterate the writeList and update RACK for each segment which is newly acked
  1180  // either cumulatively or selectively. Loop through the segments which are
  1181  // sacked, and update the RACK related variables and check for reordering.
  1182  // Returns true when the DSACK block has been detected in the received ACK.
  1183  //
  1184  // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1185  // steps 2 and 3.
  1186  func (s *sender) walkSACK(rcvdSeg *segment) bool {
  1187  	s.rc.setDSACKSeen(false)
  1188  
  1189  	// Look for DSACK block.
  1190  	hasDSACK := false
  1191  	idx := 0
  1192  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1193  	if checkDSACK(rcvdSeg) {
  1194  		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
  1195  		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
  1196  		// numDSACK can be zero when DSACK is sent for subsegments.
  1197  		if numDSACK < 1 {
  1198  			numDSACK = 1
  1199  		}
  1200  		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
  1201  		s.rc.setDSACKSeen(true)
  1202  		idx = 1
  1203  		n--
  1204  		hasDSACK = true
  1205  	}
  1206  
  1207  	if n == 0 {
  1208  		return hasDSACK
  1209  	}
  1210  
  1211  	// Sort the SACK blocks. The first block is the most recent unacked
  1212  	// block. The following blocks can be in arbitrary order.
  1213  	sackBlocks := make([]header.SACKBlock, n)
  1214  	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
  1215  	sort.Slice(sackBlocks, func(i, j int) bool {
  1216  		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
  1217  	})
  1218  
  1219  	seg := s.writeList.Front()
  1220  	for _, sb := range sackBlocks {
  1221  		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
  1222  			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
  1223  				s.rc.update(seg, rcvdSeg)
  1224  				s.rc.detectReorder(seg)
  1225  				seg.acked = true
  1226  				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
  1227  			}
  1228  			seg = seg.Next()
  1229  		}
  1230  	}
  1231  	return hasDSACK
  1232  }
  1233  
  1234  // checkDSACK checks if a DSACK is reported.
  1235  func checkDSACK(rcvdSeg *segment) bool {
  1236  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1237  	if n == 0 {
  1238  		return false
  1239  	}
  1240  
  1241  	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
  1242  	// Check if SACK block is invalid.
  1243  	if sb.End.LessThan(sb.Start) {
  1244  		return false
  1245  	}
  1246  
  1247  	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
  1248  	// at most one SACK block. DSACK is detected in the below two cases:
  1249  	// * If the SACK sequence space is less than this cumulative ACK, it is
  1250  	//   an indication that the segment identified by the SACK block has
  1251  	//   been received more than once by the receiver.
  1252  	// * If the sequence space in the first SACK block is greater than the
  1253  	//   cumulative ACK, then the sender next compares the sequence space
  1254  	//   in the first SACK block with the sequence space in the second SACK
  1255  	//   block, if there is one. This comparison can determine if the first
  1256  	//   SACK block is reporting duplicate data that lies above the
  1257  	//   cumulative ACK.
  1258  	if sb.Start.LessThan(rcvdSeg.ackNumber) {
  1259  		return true
  1260  	}
  1261  
  1262  	if n > 1 {
  1263  		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
  1264  		if sb1.End.LessThan(sb1.Start) {
  1265  			return false
  1266  		}
  1267  
  1268  		// If the first SACK block is fully covered by second SACK
  1269  		// block, then the first block is a DSACK block.
  1270  		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
  1271  			return true
  1272  		}
  1273  	}
  1274  
  1275  	return false
  1276  }
  1277  
  1278  func (s *sender) recordRetransmitTS() {
  1279  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
  1280  	//
  1281  	// The Eifel detection algorithm is used, only upon initiation of loss
  1282  	// recovery, i.e., when either the timeout-based retransmit or the fast
  1283  	// retransmit is sent. The Eifel detection algorithm MUST NOT be
  1284  	// reinitiated after loss recovery has already started. In particular,
  1285  	// it must not be reinitiated upon subsequent timeouts for the same
  1286  	// segment, and not upon retransmitting segments other than the oldest
  1287  	// outstanding segment, e.g., during selective loss recovery.
  1288  	if s.inRecovery() {
  1289  		return
  1290  	}
  1291  
  1292  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
  1293  	//
  1294  	// Set a "RetransmitTS" variable to the value of the Timestamp Value
  1295  	// field of the Timestamps option included in the retransmit sent when
  1296  	// loss recovery is initiated. A TCP sender must ensure that
  1297  	// RetransmitTS does not get overwritten as loss recovery progresses,
  1298  	// e.g., in case of a second timeout and subsequent second retransmit of
  1299  	// the same octet.
  1300  	s.retransmitTS = s.ep.tsValNow()
  1301  }
  1302  
  1303  func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
  1304  	// Return if the sender has already detected spurious recovery.
  1305  	if s.spuriousRecovery {
  1306  		return
  1307  	}
  1308  
  1309  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
  1310  	//
  1311  	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
  1312  	// Timestamps option is smaller than the value of RetransmitTS, then
  1313  	// proceed to next step, else return.
  1314  	if tsEchoReply >= s.retransmitTS {
  1315  		return
  1316  	}
  1317  
  1318  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1319  	//
  1320  	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
  1321  	if hasDSACK {
  1322  		return
  1323  	}
  1324  
  1325  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
  1326  	//
  1327  	// If during the lifetime of the TCP connection the TCP sender has
  1328  	// previously received an ACK with a DSACK option, or the acceptable ACK
  1329  	// does not acknowledge all outstanding data, then proceed to next step,
  1330  	// else return.
  1331  	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
  1332  	if numDSACK == 0 && s.SndUna == s.SndNxt {
  1333  		return
  1334  	}
  1335  
  1336  	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
  1337  	//
  1338  	// If the loss recovery has been initiated with a timeout-based
  1339  	// retransmit, then set
  1340  	//    SpuriousRecovery <- SPUR_TO (equal 1),
  1341  	// else set
  1342  	//    SpuriousRecovery <- dupacks+1
  1343  	// Set the spurious recovery variable to true as we do not differentiate
  1344  	// between fast, SACK or RTO recovery.
  1345  	s.spuriousRecovery = true
  1346  	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()
  1347  }
  1348  
  1349  // Check if the sender is in RTORecovery, FastRecovery or SACKRecovery state.
  1350  func (s *sender) inRecovery() bool {
  1351  	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
  1352  		return true
  1353  	}
  1354  	return false
  1355  }
  1356  
  1357  // handleRcvdSegment is called when a segment is received; it is responsible for
  1358  // updating the send-related state.
  1359  func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
  1360  	// Check if we can extract an RTT measurement from this ack.
  1361  	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
  1362  		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
  1363  		s.RTTMeasureSeqNum = s.SndNxt
  1364  	}
  1365  
  1366  	// Update Timestamp if required. See RFC7323, section-4.3.
  1367  	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
  1368  		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
  1369  	}
  1370  
  1371  	// Insert SACKBlock information into our scoreboard.
  1372  	hasDSACK := false
  1373  	if s.ep.SACKPermitted {
  1374  		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
  1375  			// Only insert the SACK block if the following holds
  1376  			// true:
  1377  			//  * SACK block acks data after the ack number in the
  1378  			//    current segment.
  1379  			//  * SACK block represents a sequence
  1380  			//    between sndUna and sndNxt (i.e. data that is
  1381  			//    currently unacked and in-flight).
  1382  			//  * SACK block that has not been SACKed already.
  1383  			//
  1384  			// NOTE: This check specifically excludes DSACK blocks
  1385  			// which have start/end before sndUna and are used to
  1386  			// indicate spurious retransmissions.
  1387  			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
  1388  				s.ep.scoreboard.Insert(sb)
  1389  				rcvdSeg.hasNewSACKInfo = true
  1390  			}
  1391  		}
  1392  
  1393  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
  1394  		// section-7.2
  1395  		// * Step 2: Update RACK stats.
  1396  		//   If the ACK is not ignored as invalid, update the RACK.rtt
  1397  		//   to be the RTT sample calculated using this ACK, and
  1398  		//   continue.  If this ACK or SACK was for the most recently
  1399  		//   sent packet, then record the RACK.xmit_ts timestamp and
  1400  		//   RACK.end_seq sequence implied by this ACK.
  1401  		// * Step 3: Detect packet reordering.
  1402  		//   If the ACK selectively or cumulatively acknowledges an
  1403  		//   unacknowledged and also never retransmitted sequence below
  1404  		//   RACK.fack, then the corresponding packet has been
  1405  		//   reordered and RACK.reord is set to TRUE.
  1406  		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1407  			hasDSACK = s.walkSACK(rcvdSeg)
  1408  		}
  1409  		s.SetPipe()
  1410  	}
  1411  
  1412  	ack := rcvdSeg.ackNumber
  1413  	fastRetransmit := false
  1414  	// Do not leave fast recovery, if the ACK is out of range.
  1415  	if s.FastRecovery.Active {
  1416  		// Leave fast recovery if it acknowledges all the data covered by
  1417  		// this fast recovery session.
  1418  		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
  1419  			s.leaveRecovery()
  1420  		}
  1421  	} else {
  1422  		// Detect loss by counting the duplicates and enter recovery.
  1423  		fastRetransmit = s.detectLoss(rcvdSeg)
  1424  	}
  1425  
  1426  	// See if TLP based recovery was successful.
  1427  	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1428  		s.detectTLPRecovery(ack, rcvdSeg)
  1429  	}
  1430  
  1431  	// Stash away the current window size.
  1432  	s.SndWnd = rcvdSeg.window
  1433  
  1434  	// Disable zero window probing if remote advertizes a non-zero receive
  1435  	// window. This can be with an ACK to the zero window probe (where the
  1436  	// acknumber refers to the already acknowledged byte) OR to any previously
  1437  	// unacknowledged segment.
  1438  	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
  1439  		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
  1440  		s.disableZeroWindowProbing()
  1441  	}
  1442  
  1443  	// On receiving the ACK for the zero window probe, account for it and
  1444  	// skip trying to send any segment as we are still probing for
  1445  	// receive window to become non-zero.
  1446  	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
  1447  		s.unackZeroWindowProbes--
  1448  		return
  1449  	}
  1450  
  1451  	// Ignore ack if it doesn't acknowledge any new data.
  1452  	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
  1453  		s.DupAckCount = 0
  1454  
  1455  		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
  1456  		// Specifically we should only update the RTO using TSEcr if the
  1457  		// following condition holds:
  1458  		//
  1459  		//    A TSecr value received in a segment is used to update the
  1460  		//    averaged RTT measurement only if the segment acknowledges
  1461  		//    some new data, i.e., only if it advances the left edge of
  1462  		//    the send window.
  1463  		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
  1464  			s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr))
  1465  		}
  1466  
  1467  		if s.shouldSchedulePTO() {
  1468  			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
  1469  			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
  1470  			s.schedulePTO()
  1471  		} else {
  1472  			// When an ack is received we must rearm the timer.
  1473  			// RFC 6298 5.3
  1474  			s.probeTimer.disable()
  1475  			s.resendTimer.enable(s.RTO)
  1476  		}
  1477  
  1478  		// Remove all acknowledged data from the write list.
  1479  		acked := s.SndUna.Size(ack)
  1480  		s.SndUna = ack
  1481  
  1482  		// The remote ACK-ing at least 1 byte is an indication that we have a
  1483  		// full-duplex connection to the remote as the only way we will receive an
  1484  		// ACK is if the remote received data that we previously sent.
  1485  		//
  1486  		// As of writing, linux seems to only confirm a route as reachable when
  1487  		// forward progress is made which is indicated by an ACK that removes data
  1488  		// from the retransmit queue.
  1489  		if acked > 0 {
  1490  			s.ep.route.ConfirmReachable()
  1491  		}
  1492  
  1493  		ackLeft := acked
  1494  		originalOutstanding := s.Outstanding
  1495  		for ackLeft > 0 {
  1496  			// We use logicalLen here because we can have FIN
  1497  			// segments (which are always at the end of list) that
  1498  			// have no data, but do consume a sequence number.
  1499  			seg := s.writeList.Front()
  1500  			datalen := seg.logicalLen()
  1501  
  1502  			if datalen > ackLeft {
  1503  				prevCount := s.pCount(seg, s.MaxPayloadSize)
  1504  				seg.data.TrimFront(int(ackLeft))
  1505  				seg.sequenceNumber.UpdateForward(ackLeft)
  1506  				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
  1507  				break
  1508  			}
  1509  
  1510  			if s.writeNext == seg {
  1511  				s.writeNext = seg.Next()
  1512  			}
  1513  
  1514  			// Update the RACK fields if SACK is enabled.
  1515  			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1516  				s.rc.update(seg, rcvdSeg)
  1517  				s.rc.detectReorder(seg)
  1518  			}
  1519  
  1520  			s.writeList.Remove(seg)
  1521  
  1522  			// If SACK is enabled then only reduce outstanding if
  1523  			// the segment was not previously SACKED as these have
  1524  			// already been accounted for in SetPipe().
  1525  			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
  1526  				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
  1527  			} else {
  1528  				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
  1529  			}
  1530  			seg.decRef()
  1531  			ackLeft -= datalen
  1532  		}
  1533  
  1534  		// Clear SACK information for all acked data.
  1535  		s.ep.scoreboard.Delete(s.SndUna)
  1536  
  1537  		// Detect if the sender entered recovery spuriously.
  1538  		if s.inRecovery() {
  1539  			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
  1540  		}
  1541  
  1542  		// If we are not in fast recovery then update the congestion
  1543  		// window based on the number of acknowledged packets.
  1544  		if !s.FastRecovery.Active {
  1545  			s.cc.Update(originalOutstanding - s.Outstanding)
  1546  			if s.FastRecovery.Last.LessThan(s.SndUna) {
  1547  				s.state = tcpip.Open
  1548  				// Update RACK when we are exiting fast or RTO
  1549  				// recovery as described in the RFC
  1550  				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
  1551  				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1552  					s.rc.exitRecovery()
  1553  				}
  1554  				s.reorderTimer.disable()
  1555  			}
  1556  		}
  1557  
  1558  		// Update the send buffer usage and notify potential waiters.
  1559  		s.ep.updateSndBufferUsage(int(acked))
  1560  
  1561  		// It is possible for s.outstanding to drop below zero if we get
  1562  		// a retransmit timeout, reset outstanding to zero but later
  1563  		// get an ack that covers previously sent data.
  1564  		if s.Outstanding < 0 {
  1565  			s.Outstanding = 0
  1566  		}
  1567  
  1568  		s.SetPipe()
  1569  
  1570  		// If all outstanding data was acknowledged then disable the timer.
  1571  		// RFC 6298 Rule 5.3
  1572  		if s.SndUna == s.SndNxt {
  1573  			s.Outstanding = 0
  1574  			// Reset firstRetransmittedSegXmitTime to the zero value.
  1575  			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
  1576  			s.resendTimer.disable()
  1577  			s.probeTimer.disable()
  1578  		}
  1579  	}
  1580  
  1581  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1582  		// Update RACK reorder window.
  1583  		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1584  		// * Upon receiving an ACK:
  1585  		// * Step 4: Update RACK reordering window
  1586  		s.rc.updateRACKReorderWindow()
  1587  
  1588  		// After the reorder window is calculated, detect any loss by checking
  1589  		// if the time elapsed after the segments are sent is greater than the
  1590  		// reorder window.
  1591  		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
  1592  			// If any segment is marked as lost by
  1593  			// RACK, enter recovery and retransmit
  1594  			// the lost segments.
  1595  			s.cc.HandleLossDetected()
  1596  			s.enterRecovery()
  1597  			fastRetransmit = true
  1598  		}
  1599  
  1600  		if s.FastRecovery.Active {
  1601  			s.rc.DoRecovery(nil, fastRetransmit)
  1602  		}
  1603  	}
  1604  
  1605  	// Now that we've popped all acknowledged data from the retransmit
  1606  	// queue, retransmit if needed.
  1607  	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
  1608  		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
  1609  		// When SACK is enabled data sending is governed by steps in
  1610  		// RFC 6675 Section 5 recovery steps  A-C.
  1611  		// See: https://tools.ietf.org/html/rfc6675#section-5.
  1612  		if s.ep.SACKPermitted {
  1613  			return
  1614  		}
  1615  	}
  1616  
  1617  	// Send more data now that some of the pending data has been ack'd, or
  1618  	// that the window opened up, or the congestion window was inflated due
  1619  	// to a duplicate ack during fast recovery. This will also re-enable
  1620  	// the retransmit timer if needed.
  1621  	s.sendData()
  1622  }
  1623  
  1624  // sendSegment sends the specified segment.
  1625  func (s *sender) sendSegment(seg *segment) tcpip.Error {
  1626  	if seg.xmitCount > 0 {
  1627  		s.ep.stack.Stats().TCP.Retransmits.Increment()
  1628  		s.ep.stats.SendErrors.Retransmits.Increment()
  1629  		if s.SndCwnd < s.Ssthresh {
  1630  			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
  1631  		}
  1632  	}
  1633  	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
  1634  	seg.xmitCount++
  1635  	seg.lost = false
  1636  	err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
  1637  
  1638  	// Every time a packet containing data is sent (including a
  1639  	// retransmission), if SACK is enabled and we are retransmitting data
  1640  	// then use the conservative timer described in RFC6675 Section 6.0,
  1641  	// otherwise follow the standard time described in RFC6298 Section 5.1.
  1642  	if err != nil && seg.data.Size() != 0 {
  1643  		if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted {
  1644  			s.resendTimer.enable(s.RTO)
  1645  		} else {
  1646  			if !s.resendTimer.enabled() {
  1647  				s.resendTimer.enable(s.RTO)
  1648  			}
  1649  		}
  1650  	}
  1651  
  1652  	return err
  1653  }
  1654  
  1655  // sendSegmentFromView sends a new segment containing the given payload, flags
  1656  // and sequence number.
  1657  func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1658  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1659  	if seq == s.RTTMeasureSeqNum {
  1660  		s.RTTMeasureTime = s.LastSendTime
  1661  	}
  1662  
  1663  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1664  
  1665  	// Remember the max sent ack.
  1666  	s.MaxSentAck = rcvNxt
  1667  
  1668  	return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd)
  1669  }
  1670  
  1671  // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
  1672  // currently.
  1673  func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
  1674  	// Data packets are unlikely to be part of an ACK loop. So always send
  1675  	// an ACK for a packet w/ data.
  1676  	if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
  1677  		s.sendAck()
  1678  	}
  1679  }