github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/snd.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"sort"
    21  	"time"
    22  
    23  	"github.com/SagerNet/gvisor/pkg/sleep"
    24  	"github.com/SagerNet/gvisor/pkg/sync"
    25  	"github.com/SagerNet/gvisor/pkg/tcpip"
    26  	"github.com/SagerNet/gvisor/pkg/tcpip/buffer"
    27  	"github.com/SagerNet/gvisor/pkg/tcpip/header"
    28  	"github.com/SagerNet/gvisor/pkg/tcpip/seqnum"
    29  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    30  )
    31  
const (
	// MinRTO is the minimum allowed value for the retransmit timeout.
	MinRTO = 200 * time.Millisecond

	// MaxRTO is the maximum allowed value for the retransmit timeout.
	MaxRTO = 120 * time.Second

	// InitialCwnd is the initial congestion window. It is assigned to
	// SndCwnd by initCongestionControl; netstack tracks the congestion
	// window in packets rather than bytes.
	InitialCwnd = 10

	// nDupAckThreshold is the number of duplicate ACKs required
	// before fast-retransmit is entered.
	nDupAckThreshold = 3

	// MaxRetries is the maximum number of probe retries the sender does
	// before timing out the connection.
	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
	MaxRetries = 15
)
    51  
// congestionControl is an interface that must be implemented by any supported
// congestion control algorithm. The implementation in use is selected by
// sender.initCongestionControl.
type congestionControl interface {
	// HandleLossDetected is invoked when loss is detected by RACK or when
	// sender.dupAckCount >= nDupAckThreshold, just before entering fast
	// retransmit.
	HandleLossDetected()

	// HandleRTOExpired is invoked when the retransmit timer expires.
	HandleRTOExpired()

	// Update is invoked when processing inbound ACKs. It is passed the
	// number of packets that were acked by the most recent cumulative
	// acknowledgement.
	Update(packetsAcked int)

	// PostRecovery is invoked when the sender is exiting a fast retransmit/
	// recovery phase. This provides congestion control algorithms a way
	// to adjust their state when exiting recovery.
	PostRecovery()
}
    73  
// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm. The implementation in use is selected by
// sender.initLossRecovery.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted. rcvdSeg is the cumulative or selective ACK that
	// was received. fastRetransmit reports whether the connection entered
	// fast retransmit with this ACK, in which case the first
	// unacknowledged segment is to be retransmitted.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}
    84  
// sender holds the state necessary to send TCP segments.
//
// The sequence-number, window, RTO and recovery bookkeeping lives in the
// embedded stack.TCPSenderState; the remaining fields are the queues, timers
// and algorithm handles that operate on that state.
//
// +stateify savable
type sender struct {
	// TCPSenderState is embedded so that its fields (SndUna, SndNxt, RTO,
	// Outstanding, ...) are accessed directly on the sender. ep is the
	// endpoint that owns this sender.
	stack.TCPSenderState
	ep *endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeList is the queue of segments to transmit; its front is treated
	// as the first unacknowledged segment when retransmitting. writeNext
	// is the next segment in writeList to be sent. resendTimer is the
	// retransmit timer and is initialized with resendWaker.
	writeNext   *segment
	writeList   segmentList
	resendTimer timer       `state:"nosave"`
	resendWaker sleep.Waker `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer       `state:"nosave"`
	reorderWaker sleep.Waker `state:"nosave"`

	// probeTimer and probeWaker are used to schedule PTO for RACK TLP algorithm.
	probeTimer timer       `state:"nosave"`
	probeWaker sleep.Waker `state:"nosave"`
}
   148  
// rtt is a synchronization wrapper used to appease stateify. See the comment
// in sender, where it is used.
//
// The embedded mutex is held by sender.updateRTO while it reads and updates
// the embedded TCPRTTState.
//
// +stateify savable
type rtt struct {
	sync.Mutex `state:"nosave"`

	stack.TCPRTTState
}
   158  
   159  func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
   160  	// The sender MUST reduce the TCP data length to account for any IP or
   161  	// TCP options that it is including in the packets that it sends.
   162  	// See: https://tools.ietf.org/html/rfc6691#section-2
   163  	maxPayloadSize := int(mss) - ep.maxOptionSize()
   164  
   165  	s := &sender{
   166  		ep: ep,
   167  		TCPSenderState: stack.TCPSenderState{
   168  			SndWnd:           sndWnd,
   169  			SndUna:           iss + 1,
   170  			SndNxt:           iss + 1,
   171  			RTTMeasureSeqNum: iss + 1,
   172  			LastSendTime:     ep.stack.Clock().NowMonotonic(),
   173  			MaxPayloadSize:   maxPayloadSize,
   174  			MaxSentAck:       irs + 1,
   175  			FastRecovery: stack.TCPFastRecoveryState{
   176  				// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
   177  				Last:      iss,
   178  				HighRxt:   iss,
   179  				RescueRxt: iss,
   180  			},
   181  			RTO: 1 * time.Second,
   182  		},
   183  		gso: ep.gso.Type != stack.GSONone,
   184  	}
   185  
   186  	if s.gso {
   187  		s.ep.gso.MSS = uint16(maxPayloadSize)
   188  	}
   189  
   190  	s.cc = s.initCongestionControl(ep.cc)
   191  	s.lr = s.initLossRecovery()
   192  	s.rc.init(s, iss)
   193  
   194  	// A negative sndWndScale means that no scaling is in use, otherwise we
   195  	// store the scaling value.
   196  	if sndWndScale > 0 {
   197  		s.SndWndScale = uint8(sndWndScale)
   198  	}
   199  
   200  	s.resendTimer.init(s.ep.stack.Clock(), &s.resendWaker)
   201  	s.reorderTimer.init(s.ep.stack.Clock(), &s.reorderWaker)
   202  	s.probeTimer.init(s.ep.stack.Clock(), &s.probeWaker)
   203  
   204  	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
   205  
   206  	// Initialize SACK Scoreboard after updating max payload size as we use
   207  	// the maxPayloadSize as the smss when determining if a segment is lost
   208  	// etc.
   209  	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)
   210  
   211  	// Get Stack wide config.
   212  	var minRTO tcpip.TCPMinRTOOption
   213  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
   214  		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
   215  	}
   216  	s.minRTO = time.Duration(minRTO)
   217  
   218  	var maxRTO tcpip.TCPMaxRTOOption
   219  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
   220  		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
   221  	}
   222  	s.maxRTO = time.Duration(maxRTO)
   223  
   224  	var maxRetries tcpip.TCPMaxRetriesOption
   225  	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
   226  		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
   227  	}
   228  	s.maxRetries = uint32(maxRetries)
   229  
   230  	return s
   231  }
   232  
   233  // initCongestionControl initializes the specified congestion control module and
   234  // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to
   235  // their initial values.
   236  func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
   237  	s.SndCwnd = InitialCwnd
   238  	// Set sndSsthresh to the maximum int value, which depends on the
   239  	// platform.
   240  	s.Ssthresh = int(^uint(0) >> 1)
   241  
   242  	switch congestionControlName {
   243  	case ccCubic:
   244  		return newCubicCC(s)
   245  	case ccReno:
   246  		fallthrough
   247  	default:
   248  		return newRenoCC(s)
   249  	}
   250  }
   251  
   252  // initLossRecovery initiates the loss recovery algorithm for the sender.
   253  func (s *sender) initLossRecovery() lossRecovery {
   254  	if s.ep.SACKPermitted {
   255  		return newSACKRecovery(s)
   256  	}
   257  	return newRenoRecovery(s)
   258  }
   259  
   260  // updateMaxPayloadSize updates the maximum payload size based on the given
   261  // MTU. If this is in response to "packet too big" control packets (indicated
   262  // by the count argument), it also reduces the number of outstanding packets and
   263  // attempts to retransmit the first packet above the MTU size.
   264  func (s *sender) updateMaxPayloadSize(mtu, count int) {
   265  	m := mtu - header.TCPMinimumSize
   266  
   267  	m -= s.ep.maxOptionSize()
   268  
   269  	// We don't adjust up for now.
   270  	if m >= s.MaxPayloadSize {
   271  		return
   272  	}
   273  
   274  	// Make sure we can transmit at least one byte.
   275  	if m <= 0 {
   276  		m = 1
   277  	}
   278  
   279  	oldMSS := s.MaxPayloadSize
   280  	s.MaxPayloadSize = m
   281  	if s.gso {
   282  		s.ep.gso.MSS = uint16(m)
   283  	}
   284  
   285  	if count == 0 {
   286  		// updateMaxPayloadSize is also called when the sender is created.
   287  		// and there is no data to send in such cases. Return immediately.
   288  		return
   289  	}
   290  
   291  	// Update the scoreboard's smss to reflect the new lowered
   292  	// maxPayloadSize.
   293  	s.ep.scoreboard.smss = uint16(m)
   294  
   295  	s.Outstanding -= count
   296  	if s.Outstanding < 0 {
   297  		s.Outstanding = 0
   298  	}
   299  
   300  	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
   301  	// if it is already before such a packet.
   302  	nextSeg := s.writeNext
   303  	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
   304  		if seg == s.writeNext {
   305  			// We got to writeNext before we could find a segment
   306  			// exceeding the MTU.
   307  			break
   308  		}
   309  
   310  		if nextSeg == s.writeNext && seg.data.Size() > m {
   311  			// We found a segment exceeding the MTU. Rewind
   312  			// writeNext and try to retransmit it.
   313  			nextSeg = seg
   314  		}
   315  
   316  		if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   317  			// Update sackedOut for new maximum payload size.
   318  			s.SackedOut -= s.pCount(seg, oldMSS)
   319  			s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
   320  		}
   321  	}
   322  
   323  	// Since we likely reduced the number of outstanding packets, we may be
   324  	// ready to send some more.
   325  	s.writeNext = nextSeg
   326  	s.sendData()
   327  }
   328  
   329  // sendAck sends an ACK segment.
   330  func (s *sender) sendAck() {
   331  	s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.SndNxt)
   332  }
   333  
   334  // updateRTO updates the retransmit timeout when a new roud-trip time is
   335  // available. This is done in accordance with section 2 of RFC 6298.
   336  func (s *sender) updateRTO(rtt time.Duration) {
   337  	s.rtt.Lock()
   338  	if !s.rtt.TCPRTTState.SRTTInited {
   339  		s.rtt.TCPRTTState.RTTVar = rtt / 2
   340  		s.rtt.TCPRTTState.SRTT = rtt
   341  		s.rtt.TCPRTTState.SRTTInited = true
   342  	} else {
   343  		diff := s.rtt.TCPRTTState.SRTT - rtt
   344  		if diff < 0 {
   345  			diff = -diff
   346  		}
   347  		// Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when
   348  		// no timestamps are available.
   349  		if !s.ep.SendTSOk {
   350  			s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4
   351  			s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8
   352  		} else {
   353  			// When we are taking RTT measurements of every ACK then
   354  			// we need to use a modified method as specified in
   355  			// https://tools.ietf.org/html/rfc7323#appendix-G
   356  			if s.Outstanding == 0 {
   357  				s.rtt.Unlock()
   358  				return
   359  			}
   360  			// Netstack measures congestion window/inflight all in
   361  			// terms of packets and not bytes. This is similar to
   362  			// how linux also does cwnd and inflight. In practice
   363  			// this approximation works as expected.
   364  			expectedSamples := math.Ceil(float64(s.Outstanding) / 2)
   365  
   366  			// alpha & beta values are the original values as recommended in
   367  			// https://tools.ietf.org/html/rfc6298#section-2.3.
   368  			const alpha = 0.125
   369  			const beta = 0.25
   370  
   371  			alphaPrime := alpha / expectedSamples
   372  			betaPrime := beta / expectedSamples
   373  			rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds()
   374  			srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds()
   375  			s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second))
   376  			s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second))
   377  		}
   378  	}
   379  
   380  	s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar
   381  	s.rtt.Unlock()
   382  	if s.RTO < s.minRTO {
   383  		s.RTO = s.minRTO
   384  	}
   385  }
   386  
   387  // resendSegment resends the first unacknowledged segment.
   388  func (s *sender) resendSegment() {
   389  	// Don't use any segments we already sent to measure RTT as they may
   390  	// have been affected by packets being lost.
   391  	s.RTTMeasureSeqNum = s.SndNxt
   392  
   393  	// Resend the segment.
   394  	if seg := s.writeList.Front(); seg != nil {
   395  		if seg.data.Size() > s.MaxPayloadSize {
   396  			s.splitSeg(seg, s.MaxPayloadSize)
   397  		}
   398  
   399  		// See: RFC 6675 section 5 Step 4.3
   400  		//
   401  		// To prevent retransmission, set both the HighRXT and RescueRXT
   402  		// to the highest sequence number in the retransmitted segment.
   403  		s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1
   404  		s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1
   405  		s.sendSegment(seg)
   406  		s.ep.stack.Stats().TCP.FastRetransmit.Increment()
   407  		s.ep.stats.SendErrors.FastRetransmit.Increment()
   408  
   409  		// Run SetPipe() as per RFC 6675 section 5 Step 4.4
   410  		s.SetPipe()
   411  	}
   412  }
   413  
// retransmitTimerExpired is called when the retransmit timer expires, and
// unacknowledged segments are assumed lost, and thus need to be resent.
//
// On a genuine expiry it doubles the RTO (capped to maxRTO and to the time
// remaining before the connection must be given up on), leaves fast recovery
// if it was active, enters RTO recovery, discards all SACK information and
// restarts transmission from the front of the write list.
//
// Returns true if the connection is still usable, or false if the connection
// is deemed lost.
func (s *sender) retransmitTimerExpired() bool {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if !s.resendTimer.checkExpiration() {
		return true
	}

	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
	// when writeList is empty. Remove this once we have a proper fix for this
	// issue.
	if s.writeList.Front() == nil {
		return true
	}

	s.ep.stack.Stats().TCP.Timeouts.Increment()
	s.ep.stats.SendErrors.Timeouts.Increment()

	// Set TLPRxtOut to false according to
	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
	s.rc.tlpRxtOut = false

	// Work out how much time is left before the connection has to be given
	// up on. If a user timeout is set, that is the time left of it counted
	// from the original transmission of the first retransmitted segment;
	// otherwise maxRTO is used.
	uto := s.ep.userTimeout

	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		// We store the original xmitTime of the segment that we are
		// about to retransmit as the retransmission time. This is
		// required as by the time the retransmitTimer has expired the
		// segment has already been sent and unacked for the RTO at the
		// time the segment was sent.
		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
	}

	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
	remaining := s.maxRTO
	if uto != 0 {
		// Cap to the user specified timeout if one is specified.
		remaining = uto - elapsed
	}

	// Always honor the user-timeout irrespective of whether the zero
	// window probes were acknowledged. The connection is also given up on
	// once maxRetries zero window probes are unacknowledged.
	// net/ipv4/tcp_timer.c::tcp_probe_timer()
	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
		return false
	}

	// Set new timeout. The timer will be restarted by the call to sendData
	// below.
	s.RTO *= 2
	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
	if s.RTO > s.maxRTO {
		s.RTO = s.maxRTO
	}

	// Cap RTO to remaining time.
	if s.RTO > remaining {
		s.RTO = remaining
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
	//
	// Retransmit timeouts:
	//     After a retransmit timeout, record the highest sequence number
	//     transmitted in the variable recover, and exit the fast recovery
	//     procedure if applicable.
	s.FastRecovery.Last = s.SndNxt - 1

	if s.FastRecovery.Active {
		// We were attempting fast recovery but were not successful.
		// Leave the state. We don't need to update ssthresh because it
		// has already been updated when entered fast-recovery.
		s.leaveRecovery()
	}

	s.state = tcpip.RTORecovery
	s.cc.HandleRTOExpired()

	// Mark the next segment to be sent as the first unacknowledged one and
	// start sending again. Set the number of outstanding packets to 0 so
	// that we'll be able to retransmit.
	//
	// We'll keep on transmitting (or retransmitting) as we get acks for
	// the data we transmit.
	s.Outstanding = 0

	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
	//
	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
	//  discard data that has already been selectively acknowledged. As a
	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
	//  information gathered from a receiver upon a retransmission timeout
	//  (RTO) "since the timeout might indicate that the data receiver has
	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
	//  information in determining which data to retransmit."
	//
	// NOTE: We take the stricter interpretation and just expunge all
	// information as we lack more rigorous checks to validate if the SACK
	// information is usable after an RTO.
	s.ep.scoreboard.Reset()
	s.writeNext = s.writeList.Front()

	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
	// zero receive window after retransmission interval and we have data to
	// send.
	if s.zeroWindowProbing {
		s.sendZeroWindowProbe()
		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
		// indefinitely.  As long as the receiving TCP continues to send
		// acknowledgments in response to the probe segments, the sending TCP
		// MUST allow the connection to stay open.
		return true
	}

	seg := s.writeNext
	// RFC 1122 4.2.3.5: Close the connection when the number of
	// retransmissions for this segment is beyond a limit.
	if seg != nil && seg.xmitCount > s.maxRetries {
		return false
	}

	s.sendData()

	return true
}
   545  
   546  // pCount returns the number of packets in the segment. Due to GSO, a segment
   547  // can be composed of multiple packets.
   548  func (s *sender) pCount(seg *segment, maxPayloadSize int) int {
   549  	size := seg.data.Size()
   550  	if size == 0 {
   551  		return 1
   552  	}
   553  
   554  	return (size-1)/maxPayloadSize + 1
   555  }
   556  
   557  // splitSeg splits a given segment at the size specified and inserts the
   558  // remainder as a new segment after the current one in the write list.
   559  func (s *sender) splitSeg(seg *segment, size int) {
   560  	if seg.data.Size() <= size {
   561  		return
   562  	}
   563  	// Split this segment up.
   564  	nSeg := seg.clone()
   565  	nSeg.data.TrimFront(size)
   566  	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
   567  	s.writeList.InsertAfter(seg, nSeg)
   568  
   569  	// The segment being split does not carry PUSH flag because it is
   570  	// followed by the newly split segment.
   571  	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
   572  	// segment (i.e., when there is no more queued data to be sent).
   573  	// Linux removes PSH flag only when the segment is being split over MSS
   574  	// and retains it when we are splitting the segment over lack of sender
   575  	// window space.
   576  	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
   577  	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
   578  	if seg.data.Size() > s.MaxPayloadSize {
   579  		seg.flags ^= header.TCPFlagPsh
   580  	}
   581  
   582  	seg.data.CapLength(size)
   583  }
   584  
// NextSeg implements the RFC6675 NextSeg() operation.
//
// NextSeg starts scanning the writeList starting from nextSegHint and returns
// the hint to be passed on the next call to NextSeg. This is required to avoid
// iterating the write list repeatedly when NextSeg is invoked in a loop during
// recovery. The returned hint will be nil if there are no more segments that
// can match rules defined by NextSeg operation in RFC6675.
//
// The rules are tried in the order (1), (2), (3), (4): a lost segment found
// while scanning is returned immediately, then previously unsent data
// starting at writeNext, then the first candidate for rule (3), and finally
// the rescue candidate for rule (4).
//
// rescueRtx will be true only if nextSeg is a rescue retransmission as
// described by Step 4) of the NextSeg algorithm. Note that the final return
// also reports rescueRtx as true when no candidate was found at all, in which
// case nextSeg is nil.
//
// Segments larger than the scoreboard's SMSS that are visited during the scan
// are split in place.
func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
	// s3 and s4 remember the candidates for rules (3) and (4) found while
	// scanning for rule (1).
	var s3 *segment
	var s4 *segment
	// Step 1.
	for seg := nextSegHint; seg != nil; seg = seg.Next() {
		// Stop iteration if we hit a segment that has never been
		// transmitted (i.e. either it has no assigned sequence number
		// or if it does have one, it's >= the next sequence number
		// to be sent [i.e. >= s.SndNxt]).
		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
			hint = nil
			break
		}
		segSeq := seg.sequenceNumber
		if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) {
			s.splitSeg(seg, int(smss))
		}

		// See RFC 6675 Section 4
		//
		//     1. If there exists a smallest unSACKED sequence number
		//     'S2' that meets the following 3 criteria for determining
		//     loss, the sequence range of one segment of up to SMSS
		//     octets starting with S2 MUST be returned.
		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
			// NextSeg():
			//
			//    (1.a) S2 is greater than HighRxt
			//    (1.b) S2 is less than highest octet covered by
			//    any received SACK.
			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
				// NextSeg():
				//     (1.c) IsLost(S2) returns true.
				if s.ep.scoreboard.IsLost(segSeq) {
					return seg, seg.Next(), false
				}

				// NextSeg():
				//
				// (3): If the conditions for rules (1) and (2)
				// fail, but there exists an unSACKed sequence
				// number S3 that meets the criteria for
				// detecting loss given in steps 1.a and 1.b
				// above (specifically excluding (1.c)) then one
				// segment of upto SMSS octets starting with S3
				// SHOULD be returned.
				if s3 == nil {
					s3 = seg
					hint = seg.Next()
				}
			}
			// NextSeg():
			//
			//     (4) If the conditions for (1), (2) and (3) fail,
			//     but there exists outstanding unSACKED data, we
			//     provide the opportunity for a single "rescue"
			//     retransmission per entry into loss recovery. If
			//     HighACK is greater than RescueRxt (or RescueRxt
			//     is undefined), then one segment of upto SMSS
			//     octets that MUST include the highest outstanding
			//     unSACKed sequence number SHOULD be returned, and
			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
			//     be updated.
			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
				// Keep the candidate with the highest sequence
				// number seen so far.
				if s4 != nil {
					if s4.sequenceNumber.LessThan(segSeq) {
						s4 = seg
					}
				} else {
					s4 = seg
				}
			}
		}
	}

	// If we got here then no segment matched step (1).
	// Step (2): "If no sequence number 'S2' per rule (1)
	// exists but there exists available unsent data and the
	// receiver's advertised window allows, the sequence
	// range of one segment of up to SMSS octets of
	// previously unsent data starting with sequence number
	// HighData+1 MUST be returned."
	for seg := s.writeNext; seg != nil; seg = seg.Next() {
		// Skip segments that have already been transmitted.
		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
			continue
		}
		// We do not split the segment here to <= smss as it has
		// potentially not been assigned a sequence number yet.
		return seg, nil, false
	}

	if s3 != nil {
		return s3, hint, false
	}

	return s4, nil, true
}
   692  
   693  // maybeSendSegment tries to send the specified segment and either coalesces
   694  // other segments into this one or splits the specified segment based on the
   695  // lower of the specified limit value or the receivers window size specified by
   696  // end.
   697  func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
   698  	// We abuse the flags field to determine if we have already
   699  	// assigned a sequence number to this segment.
   700  	if !s.isAssignedSequenceNumber(seg) {
   701  		// Merge segments if allowed.
   702  		if seg.data.Size() != 0 {
   703  			available := int(s.SndNxt.Size(end))
   704  			if available > limit {
   705  				available = limit
   706  			}
   707  
   708  			// nextTooBig indicates that the next segment was too
   709  			// large to entirely fit in the current segment. It
   710  			// would be possible to split the next segment and merge
   711  			// the portion that fits, but unexpectedly splitting
   712  			// segments can have user visible side-effects which can
   713  			// break applications. For example, RFC 7766 section 8
   714  			// says that the length and data of a DNS response
   715  			// should be sent in the same TCP segment to avoid
   716  			// triggering bugs in poorly written DNS
   717  			// implementations.
   718  			var nextTooBig bool
   719  			for nSeg := seg.Next(); nSeg != nil && nSeg.data.Size() != 0; nSeg = seg.Next() {
   720  				if seg.data.Size()+nSeg.data.Size() > available {
   721  					nextTooBig = true
   722  					break
   723  				}
   724  				seg.merge(nSeg)
   725  				s.writeList.Remove(nSeg)
   726  				nSeg.decRef()
   727  			}
   728  			if !nextTooBig && seg.data.Size() < available {
   729  				// Segment is not full.
   730  				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
   731  					// Nagle's algorithm. From Wikipedia:
   732  					//   Nagle's algorithm works by
   733  					//   combining a number of small
   734  					//   outgoing messages and sending them
   735  					//   all at once. Specifically, as long
   736  					//   as there is a sent packet for which
   737  					//   the sender has received no
   738  					//   acknowledgment, the sender should
   739  					//   keep buffering its output until it
   740  					//   has a full packet's worth of
   741  					//   output, thus allowing output to be
   742  					//   sent all at once.
   743  					return false
   744  				}
   745  				// With TCP_CORK, hold back until minimum of the available
   746  				// send space and MSS.
   747  				// TODO(github.com/SagerNet/issue/2833): Drain the held segments after a
   748  				// timeout.
   749  				if seg.data.Size() < s.MaxPayloadSize && s.ep.ops.GetCorkOption() {
   750  					return false
   751  				}
   752  			}
   753  		}
   754  
   755  		// Assign flags. We don't do it above so that we can merge
   756  		// additional data if Nagle holds the segment.
   757  		seg.sequenceNumber = s.SndNxt
   758  		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
   759  	}
   760  
   761  	var segEnd seqnum.Value
   762  	if seg.data.Size() == 0 {
   763  		if s.writeList.Back() != seg {
   764  			panic("FIN segments must be the final segment in the write list.")
   765  		}
   766  		seg.flags = header.TCPFlagAck | header.TCPFlagFin
   767  		segEnd = seg.sequenceNumber.Add(1)
   768  		// Update the state to reflect that we have now
   769  		// queued a FIN.
   770  		switch s.ep.EndpointState() {
   771  		case StateCloseWait:
   772  			s.ep.setEndpointState(StateLastAck)
   773  		default:
   774  			s.ep.setEndpointState(StateFinWait1)
   775  		}
   776  	} else {
   777  		// We're sending a non-FIN segment.
   778  		if seg.flags&header.TCPFlagFin != 0 {
   779  			panic("Netstack queues FIN segments without data.")
   780  		}
   781  
   782  		if !seg.sequenceNumber.LessThan(end) {
   783  			return false
   784  		}
   785  
   786  		available := int(seg.sequenceNumber.Size(end))
   787  		if available == 0 {
   788  			return false
   789  		}
   790  
   791  		// If the whole segment or at least 1MSS sized segment cannot
   792  		// be accomodated in the receiver advertized window, skip
   793  		// splitting and sending of the segment. ref:
   794  		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
   795  		//
   796  		// Linux checks this for all segment transmits not triggered by
   797  		// a probe timer. On this condition, it defers the segment split
   798  		// and transmit to a short probe timer.
   799  		//
   800  		// ref: include/net/tcp.h::tcp_check_probe_timer()
   801  		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
   802  		//
   803  		// Instead of defining a new transmit timer, we attempt to split
   804  		// the segment right here if there are no pending segments. If
   805  		// there are pending segments, segment transmits are deferred to
   806  		// the retransmit timer handler.
   807  		if s.SndUna != s.SndNxt {
   808  			switch {
   809  			case available >= seg.data.Size():
   810  				// OK to send, the whole segments fits in the
   811  				// receiver's advertised window.
   812  			case available >= s.MaxPayloadSize:
   813  				// OK to send, at least 1 MSS sized segment fits
   814  				// in the receiver's advertised window.
   815  			default:
   816  				return false
   817  			}
   818  		}
   819  
   820  		// The segment size limit is computed as a function of sender
   821  		// congestion window and MSS. When sender congestion window is >
   822  		// 1, this limit can be larger than MSS. Ensure that the
   823  		// currently available send space is not greater than minimum of
   824  		// this limit and MSS.
   825  		if available > limit {
   826  			available = limit
   827  		}
   828  
   829  		// If GSO is not in use then cap available to
   830  		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
   831  		// the host GSO logic will cap the segment to the correct size.
   832  		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
   833  			available = s.MaxPayloadSize
   834  		}
   835  
   836  		if seg.data.Size() > available {
   837  			s.splitSeg(seg, available)
   838  		}
   839  
   840  		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
   841  	}
   842  
   843  	s.sendSegment(seg)
   844  
   845  	// Update sndNxt if we actually sent new data (as opposed to
   846  	// retransmitting some previously sent data).
   847  	if s.SndNxt.LessThan(segEnd) {
   848  		s.SndNxt = segEnd
   849  	}
   850  
   851  	return true
   852  }
   853  
   854  func (s *sender) sendZeroWindowProbe() {
   855  	ack, win := s.ep.rcv.getSendParams()
   856  	s.unackZeroWindowProbes++
   857  	// Send a zero window probe with sequence number pointing to
   858  	// the last acknowledged byte.
   859  	s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.SndUna-1, ack, win)
   860  	// Rearm the timer to continue probing.
   861  	s.resendTimer.enable(s.RTO)
   862  }
   863  
   864  func (s *sender) enableZeroWindowProbing() {
   865  	s.zeroWindowProbing = true
   866  	// We piggyback the probing on the retransmit timer with the
   867  	// current retranmission interval, as we may start probing while
   868  	// segment retransmissions.
   869  	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
   870  		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
   871  	}
   872  	s.resendTimer.enable(s.RTO)
   873  }
   874  
   875  func (s *sender) disableZeroWindowProbing() {
   876  	s.zeroWindowProbing = false
   877  	s.unackZeroWindowProbes = 0
   878  	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
   879  	s.resendTimer.disable()
   880  }
   881  
   882  func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
   883  	if dataSent {
   884  		// We sent data, so we should stop the keepalive timer to ensure
   885  		// that no keepalives are sent while there is pending data.
   886  		s.ep.disableKeepaliveTimer()
   887  	}
   888  
   889  	// If the sender has advertized zero receive window and we have
   890  	// data to be sent out, start zero window probing to query the
   891  	// the remote for it's receive window size.
   892  	if s.writeNext != nil && s.SndWnd == 0 {
   893  		s.enableZeroWindowProbing()
   894  	}
   895  
   896  	// If we have no more pending data, start the keepalive timer.
   897  	if s.SndUna == s.SndNxt {
   898  		s.ep.resetKeepaliveTimer(false)
   899  	} else {
   900  		// Enable timers if we have pending data.
   901  		if shouldScheduleProbe && s.shouldSchedulePTO() {
   902  			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
   903  			s.schedulePTO()
   904  		} else if !s.resendTimer.enabled() {
   905  			s.probeTimer.disable()
   906  			if s.Outstanding > 0 {
   907  				// Enable the resend timer if it's not enabled yet and there is
   908  				// outstanding data.
   909  				s.resendTimer.enable(s.RTO)
   910  			}
   911  		}
   912  	}
   913  }
   914  
   915  // sendData sends new data segments. It is called when data becomes available or
   916  // when the send window opens up.
   917  func (s *sender) sendData() {
   918  	limit := s.MaxPayloadSize
   919  	if s.gso {
   920  		limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize)
   921  	}
   922  	end := s.SndUna.Add(s.SndWnd)
   923  
   924  	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
   925  	// "A TCP SHOULD set cwnd to no more than RW before beginning
   926  	// transmission if the TCP has not sent data in the interval exceeding
   927  	// the retrasmission timeout."
   928  	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
   929  		if s.SndCwnd > InitialCwnd {
   930  			s.SndCwnd = InitialCwnd
   931  		}
   932  	}
   933  
   934  	var dataSent bool
   935  	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
   936  		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
   937  		if cwndLimit < limit {
   938  			limit = cwndLimit
   939  		}
   940  		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
   941  			// Move writeNext along so that we don't try and scan data that
   942  			// has already been SACKED.
   943  			s.writeNext = seg.Next()
   944  			continue
   945  		}
   946  		if sent := s.maybeSendSegment(seg, limit, end); !sent {
   947  			break
   948  		}
   949  		dataSent = true
   950  		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
   951  		s.writeNext = seg.Next()
   952  	}
   953  
   954  	s.postXmit(dataSent, true /* shouldScheduleProbe */)
   955  }
   956  
   957  func (s *sender) enterRecovery() {
   958  	s.FastRecovery.Active = true
   959  	// Save state to reflect we're now in fast recovery.
   960  	//
   961  	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
   962  	// We inflate the cwnd by 3 to account for the 3 packets which triggered
   963  	// the 3 duplicate ACKs and are now not in flight.
   964  	s.SndCwnd = s.Ssthresh + 3
   965  	s.SackedOut = 0
   966  	s.DupAckCount = 0
   967  	s.FastRecovery.First = s.SndUna
   968  	s.FastRecovery.Last = s.SndNxt - 1
   969  	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
   970  	s.FastRecovery.HighRxt = s.SndUna
   971  	s.FastRecovery.RescueRxt = s.SndUna
   972  	if s.ep.SACKPermitted {
   973  		s.state = tcpip.SACKRecovery
   974  		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
   975  		// Set TLPRxtOut to false according to
   976  		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
   977  		if s.rc.tlpRxtOut {
   978  			// The tail loss probe triggered recovery.
   979  			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
   980  		}
   981  		s.rc.tlpRxtOut = false
   982  		return
   983  	}
   984  	s.state = tcpip.FastRecovery
   985  	s.ep.stack.Stats().TCP.FastRecovery.Increment()
   986  }
   987  
   988  func (s *sender) leaveRecovery() {
   989  	s.FastRecovery.Active = false
   990  	s.FastRecovery.MaxCwnd = 0
   991  	s.DupAckCount = 0
   992  
   993  	// Deflate cwnd. It had been artificially inflated when new dups arrived.
   994  	s.SndCwnd = s.Ssthresh
   995  	s.cc.PostRecovery()
   996  }
   997  
// isAssignedSequenceNumber reports whether seg has been assigned a sequence
// number.
//
// It relies on the fact that flags are only set once a sequence number is
// assigned, and that this is only done right before the segment is sent. As a
// result, any segment with a non-zero flags field has a valid sequence number
// assigned to it.
func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
	return seg.flags != 0
}
  1005  
  1006  // SetPipe implements the SetPipe() function described in RFC6675. Netstack
  1007  // maintains the congestion window in number of packets and not bytes, so
  1008  // SetPipe() here measures number of outstanding packets rather than actual
  1009  // outstanding bytes in the network.
  1010  func (s *sender) SetPipe() {
  1011  	// If SACK isn't permitted or it is permitted but recovery is not active
  1012  	// then ignore pipe calculations.
  1013  	if !s.ep.SACKPermitted || !s.FastRecovery.Active {
  1014  		return
  1015  	}
  1016  	pipe := 0
  1017  	smss := seqnum.Size(s.ep.scoreboard.SMSS())
  1018  	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
  1019  		// With GSO each segment can be much larger than SMSS. So check the segment
  1020  		// in SMSS sized ranges.
  1021  		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size()))
  1022  		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
  1023  			endSeq := startSeq.Add(smss)
  1024  			if segEnd.LessThan(endSeq) {
  1025  				endSeq = segEnd
  1026  			}
  1027  			sb := header.SACKBlock{Start: startSeq, End: endSeq}
  1028  			// SetPipe():
  1029  			//
  1030  			// After initializing pipe to zero, the following steps are
  1031  			// taken for each octet 'S1' in the sequence space between
  1032  			// HighACK and HighData that has not been SACKed:
  1033  			if !s1.sequenceNumber.LessThan(s.SndNxt) {
  1034  				break
  1035  			}
  1036  			if s.ep.scoreboard.IsSACKED(sb) {
  1037  				continue
  1038  			}
  1039  
  1040  			// SetPipe():
  1041  			//
  1042  			//    (a) If IsLost(S1) returns false, Pipe is incremened by 1.
  1043  			//
  1044  			// NOTE: here we mark the whole segment as lost. We do not try
  1045  			// and test every byte in our write buffer as we maintain our
  1046  			// pipe in terms of oustanding packets and not bytes.
  1047  			if !s.ep.scoreboard.IsRangeLost(sb) {
  1048  				pipe++
  1049  			}
  1050  			// SetPipe():
  1051  			//    (b) If S1 <= HighRxt, Pipe is incremented by 1.
  1052  			if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) {
  1053  				pipe++
  1054  			}
  1055  		}
  1056  	}
  1057  	s.Outstanding = pipe
  1058  }
  1059  
  1060  // shouldEnterRecovery returns true if the sender should enter fast recovery
  1061  // based on dupAck count and sack scoreboard.
  1062  // See RFC 6675 section 5.
  1063  func (s *sender) shouldEnterRecovery() bool {
  1064  	return s.DupAckCount >= nDupAckThreshold ||
  1065  		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
  1066  }
  1067  
  1068  // detectLoss is called when an ack is received and returns whether a loss is
  1069  // detected. It manages the state related to duplicate acks and determines if
  1070  // a retransmit is needed according to the rules in RFC 6582 (NewReno).
  1071  func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
  1072  	// We're not in fast recovery yet.
  1073  
  1074  	// If RACK is enabled and there is no reordering we should honor the
  1075  	// three duplicate ACK rule to enter recovery.
  1076  	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
  1077  	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
  1078  		if s.rc.Reord {
  1079  			return false
  1080  		}
  1081  	}
  1082  
  1083  	if !s.isDupAck(seg) {
  1084  		s.DupAckCount = 0
  1085  		return false
  1086  	}
  1087  
  1088  	s.DupAckCount++
  1089  
  1090  	// Do not enter fast recovery until we reach nDupAckThreshold or the
  1091  	// first unacknowledged byte is considered lost as per SACK scoreboard.
  1092  	if !s.shouldEnterRecovery() {
  1093  		// RFC 6675 Step 3.
  1094  		s.FastRecovery.HighRxt = s.SndUna - 1
  1095  		// Do run SetPipe() to calculate the outstanding segments.
  1096  		s.SetPipe()
  1097  		s.state = tcpip.Disorder
  1098  		return false
  1099  	}
  1100  
  1101  	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
  1102  	//
  1103  	// We only do the check here, the incrementing of last to the highest
  1104  	// sequence number transmitted till now is done when enterRecovery
  1105  	// is invoked.
  1106  	//
  1107  	// Note that we only enter recovery when at least one more byte of data
  1108  	// beyond s.fr.last (the highest byte that was outstanding when fast
  1109  	// retransmit was last entered) is acked.
  1110  	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
  1111  		s.DupAckCount = 0
  1112  		return false
  1113  	}
  1114  	s.cc.HandleLossDetected()
  1115  	s.enterRecovery()
  1116  	return true
  1117  }
  1118  
  1119  // isDupAck determines if seg is a duplicate ack as defined in
  1120  // https://tools.ietf.org/html/rfc5681#section-2.
  1121  func (s *sender) isDupAck(seg *segment) bool {
  1122  	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
  1123  	// can leverage the SACK information to determine when an incoming ACK is a
  1124  	// "duplicate" (e.g., if the ACK contains previously unknown SACK
  1125  	// information).
  1126  	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
  1127  		return false
  1128  	}
  1129  
  1130  	// (a) The receiver of the ACK has outstanding data.
  1131  	return s.SndUna != s.SndNxt &&
  1132  		// (b) The incoming acknowledgment carries no data.
  1133  		seg.logicalLen() == 0 &&
  1134  		// (c) The SYN and FIN bits are both off.
  1135  		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
  1136  		// (d) the ACK number is equal to the greatest acknowledgment received on
  1137  		// the given connection (TCP.UNA from RFC793).
  1138  		seg.ackNumber == s.SndUna &&
  1139  		// (e) the advertised window in the incoming acknowledgment equals the
  1140  		// advertised window in the last incoming acknowledgment.
  1141  		s.SndWnd == seg.window
  1142  }
  1143  
  1144  // Iterate the writeList and update RACK for each segment which is newly acked
  1145  // either cumulatively or selectively. Loop through the segments which are
  1146  // sacked, and update the RACK related variables and check for reordering.
  1147  //
  1148  // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
  1149  // steps 2 and 3.
  1150  func (s *sender) walkSACK(rcvdSeg *segment) {
  1151  	s.rc.setDSACKSeen(false)
  1152  
  1153  	// Look for DSACK block.
  1154  	idx := 0
  1155  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1156  	if checkDSACK(rcvdSeg) {
  1157  		s.rc.setDSACKSeen(true)
  1158  		idx = 1
  1159  		n--
  1160  	}
  1161  
  1162  	if n == 0 {
  1163  		return
  1164  	}
  1165  
  1166  	// Sort the SACK blocks. The first block is the most recent unacked
  1167  	// block. The following blocks can be in arbitrary order.
  1168  	sackBlocks := make([]header.SACKBlock, n)
  1169  	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
  1170  	sort.Slice(sackBlocks, func(i, j int) bool {
  1171  		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
  1172  	})
  1173  
  1174  	seg := s.writeList.Front()
  1175  	for _, sb := range sackBlocks {
  1176  		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
  1177  			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
  1178  				s.rc.update(seg, rcvdSeg)
  1179  				s.rc.detectReorder(seg)
  1180  				seg.acked = true
  1181  				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
  1182  			}
  1183  			seg = seg.Next()
  1184  		}
  1185  	}
  1186  }
  1187  
  1188  // checkDSACK checks if a DSACK is reported.
  1189  func checkDSACK(rcvdSeg *segment) bool {
  1190  	n := len(rcvdSeg.parsedOptions.SACKBlocks)
  1191  	if n == 0 {
  1192  		return false
  1193  	}
  1194  
  1195  	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
  1196  	// Check if SACK block is invalid.
  1197  	if sb.End.LessThan(sb.Start) {
  1198  		return false
  1199  	}
  1200  
  1201  	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
  1202  	// at most one SACK block. DSACK is detected in the below two cases:
  1203  	// * If the SACK sequence space is less than this cumulative ACK, it is
  1204  	//   an indication that the segment identified by the SACK block has
  1205  	//   been received more than once by the receiver.
  1206  	// * If the sequence space in the first SACK block is greater than the
  1207  	//   cumulative ACK, then the sender next compares the sequence space
  1208  	//   in the first SACK block with the sequence space in the second SACK
  1209  	//   block, if there is one. This comparison can determine if the first
  1210  	//   SACK block is reporting duplicate data that lies above the
  1211  	//   cumulative ACK.
  1212  	if sb.Start.LessThan(rcvdSeg.ackNumber) {
  1213  		return true
  1214  	}
  1215  
  1216  	if n > 1 {
  1217  		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
  1218  		if sb1.End.LessThan(sb1.Start) {
  1219  			return false
  1220  		}
  1221  
  1222  		// If the first SACK block is fully covered by second SACK
  1223  		// block, then the first block is a DSACK block.
  1224  		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
  1225  			return true
  1226  		}
  1227  	}
  1228  
  1229  	return false
  1230  }
  1231  
// handleRcvdSegment is called when a segment is received; it is responsible for
// updating the send-related state.
//
// In order, it: takes an RTT sample when the segment carries no timestamp
// option, passes the segment's TSVal to updateRecentTimestamp, inserts new
// SACK blocks into the scoreboard, leaves recovery or runs loss detection,
// stores the advertised window, manages zero window probing, removes
// cumulatively acknowledged data from the write list, runs RACK loss detection
// when it is enabled, and finally performs recovery or sends more data.
func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
	// Check if we can extract an RTT measurement from this ack.
	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
		s.RTTMeasureSeqNum = s.SndNxt
	}

	// Update Timestamp if required. See RFC7323, section-4.3.
	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
	}

	// Insert SACKBlock information into our scoreboard.
	if s.ep.SACKPermitted {
		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
			// Only insert the SACK block if the following holds
			// true:
			//  * SACK block acks data after the ack number in the
			//    current segment.
			//  * SACK block represents a sequence
			//    between sndUna and sndNxt (i.e. data that is
			//    currently unacked and in-flight).
			//  * SACK block that has not been SACKed already.
			//
			// NOTE: This check specifically excludes DSACK blocks
			// which have start/end before sndUna and are used to
			// indicate spurious retransmissions.
			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
				s.ep.scoreboard.Insert(sb)
				rcvdSeg.hasNewSACKInfo = true
			}
		}

		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
		// section-7.2
		// * Step 2: Update RACK stats.
		//   If the ACK is not ignored as invalid, update the RACK.rtt
		//   to be the RTT sample calculated using this ACK, and
		//   continue.  If this ACK or SACK was for the most recently
		//   sent packet, then record the RACK.xmit_ts timestamp and
		//   RACK.end_seq sequence implied by this ACK.
		// * Step 3: Detect packet reordering.
		//   If the ACK selectively or cumulatively acknowledges an
		//   unacknowledged and also never retransmitted sequence below
		//   RACK.fack, then the corresponding packet has been
		//   reordered and RACK.reord is set to TRUE.
		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
			s.walkSACK(rcvdSeg)
		}
		s.SetPipe()
	}

	ack := rcvdSeg.ackNumber
	fastRetransmit := false
	// Do not leave fast recovery, if the ACK is out of range.
	if s.FastRecovery.Active {
		// Leave fast recovery if it acknowledges all the data covered by
		// this fast recovery session.
		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
			s.leaveRecovery()
		}
	} else {
		// Detect loss by counting the duplicates and enter recovery.
		fastRetransmit = s.detectLoss(rcvdSeg)
	}

	// See if TLP based recovery was successful.
	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		s.detectTLPRecovery(ack, rcvdSeg)
	}

	// Stash away the current window size.
	s.SndWnd = rcvdSeg.window

	// Disable zero window probing if remote advertises a non-zero receive
	// window. This can be with an ACK to the zero window probe (where the
	// acknumber refers to the already acknowledged byte) OR to any previously
	// unacknowledged segment.
	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
		s.disableZeroWindowProbing()
	}

	// On receiving the ACK for the zero window probe, account for it and
	// skip trying to send any segment as we are still probing for
	// receive window to become non-zero.
	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
		s.unackZeroWindowProbes--
		return
	}

	// Ignore ack if it doesn't acknowledge any new data.
	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
		s.DupAckCount = 0

		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
		// Specifically we should only update the RTO using TSEcr if the
		// following condition holds:
		//
		//    A TSecr value received in a segment is used to update the
		//    averaged RTT measurement only if the segment acknowledges
		//    some new data, i.e., only if it advances the left edge of
		//    the send window.
		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
			// TSVal/Ecr values sent by Netstack are at a millisecond
			// granularity.
			elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond
			s.updateRTO(elapsed)
		}

		if s.shouldSchedulePTO() {
			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
			s.schedulePTO()
		} else {
			// When an ack is received we must rearm the timer.
			// RFC 6298 5.3
			s.probeTimer.disable()
			s.resendTimer.enable(s.RTO)
		}

		// Remove all acknowledged data from the write list.
		acked := s.SndUna.Size(ack)
		s.SndUna = ack

		// The remote ACK-ing at least 1 byte is an indication that we have a
		// full-duplex connection to the remote as the only way we will receive an
		// ACK is if the remote received data that we previously sent.
		//
		// As of writing, linux seems to only confirm a route as reachable when
		// forward progress is made which is indicated by an ACK that removes data
		// from the retransmit queue.
		if acked > 0 {
			s.ep.route.ConfirmReachable()
		}

		// Consume the acknowledged sequence space from the front of the
		// write list. originalOutstanding is kept so that the number of
		// packets acknowledged can be handed to congestion control below.
		ackLeft := acked
		originalOutstanding := s.Outstanding
		for ackLeft > 0 {
			// We use logicalLen here because we can have FIN
			// segments (which are always at the end of list) that
			// have no data, but do consume a sequence number.
			seg := s.writeList.Front()
			datalen := seg.logicalLen()

			// The ACK covers only part of this segment: trim the
			// acknowledged prefix, advance its sequence number and
			// adjust the outstanding packet count by the difference.
			if datalen > ackLeft {
				prevCount := s.pCount(seg, s.MaxPayloadSize)
				seg.data.TrimFront(int(ackLeft))
				seg.sequenceNumber.UpdateForward(ackLeft)
				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
				break
			}

			// The segment is fully acknowledged and about to be
			// removed, so writeNext must not keep pointing at it.
			if s.writeNext == seg {
				s.writeNext = seg.Next()
			}

			// Update the RACK fields if SACK is enabled.
			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
			}

			s.writeList.Remove(seg)

			// If SACK is enabled then only reduce outstanding if
			// the segment was not previously SACKED as these have
			// already been accounted for in SetPipe().
			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
			} else {
				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
			}
			seg.decRef()
			ackLeft -= datalen
		}

		// Update the send buffer usage and notify potential waiters.
		s.ep.updateSndBufferUsage(int(acked))

		// Clear SACK information for all acked data.
		s.ep.scoreboard.Delete(s.SndUna)

		// If we are not in fast recovery then update the congestion
		// window based on the number of acknowledged packets.
		if !s.FastRecovery.Active {
			s.cc.Update(originalOutstanding - s.Outstanding)
			if s.FastRecovery.Last.LessThan(s.SndUna) {
				s.state = tcpip.Open
				// Update RACK when we are exiting fast or RTO
				// recovery as described in the RFC
				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
					s.rc.exitRecovery()
				}
				s.reorderTimer.disable()
			}
		}

		// It is possible for s.Outstanding to drop below zero if we get
		// a retransmit timeout, reset outstanding to zero but later
		// get an ack that covers previously sent data.
		if s.Outstanding < 0 {
			s.Outstanding = 0
		}

		s.SetPipe()

		// If all outstanding data was acknowledged then disable the timers.
		// RFC 6298 Rule 5.3
		if s.SndUna == s.SndNxt {
			s.Outstanding = 0
			// Reset firstRetransmittedSegXmitTime to the zero value.
			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
			s.resendTimer.disable()
			s.probeTimer.disable()
		}
	}

	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		// Update RACK reorder window.
		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
		// * Upon receiving an ACK:
		// * Step 4: Update RACK reordering window
		s.rc.updateRACKReorderWindow()

		// After the reorder window is calculated, detect any loss by checking
		// if the time elapsed after the segments are sent is greater than the
		// reorder window.
		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
			// If any segment is marked as lost by
			// RACK, enter recovery and retransmit
			// the lost segments.
			s.cc.HandleLossDetected()
			s.enterRecovery()
			fastRetransmit = true
		}

		if s.FastRecovery.Active {
			s.rc.DoRecovery(nil, fastRetransmit)
		}
	}

	// Now that we've popped all acknowledged data from the retransmit
	// queue, retransmit if needed. This path is only taken when RACK loss
	// detection is not enabled; the RACK case is handled above.
	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
		// When SACK is enabled data sending is governed by steps in
		// RFC 6675 Section 5 recovery steps  A-C.
		// See: https://tools.ietf.org/html/rfc6675#section-5.
		if s.ep.SACKPermitted {
			return
		}
	}

	// Send more data now that some of the pending data has been ack'd, or
	// that the window opened up, or the congestion window was inflated due
	// to a duplicate ack during fast recovery. This will also re-enable
	// the retransmit timer if needed.
	s.sendData()
}
  1495  
  1496  // sendSegment sends the specified segment.
  1497  func (s *sender) sendSegment(seg *segment) tcpip.Error {
  1498  	if seg.xmitCount > 0 {
  1499  		s.ep.stack.Stats().TCP.Retransmits.Increment()
  1500  		s.ep.stats.SendErrors.Retransmits.Increment()
  1501  		if s.SndCwnd < s.Ssthresh {
  1502  			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
  1503  		}
  1504  	}
  1505  	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
  1506  	seg.xmitCount++
  1507  	seg.lost = false
  1508  	err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
  1509  
  1510  	// Every time a packet containing data is sent (including a
  1511  	// retransmission), if SACK is enabled and we are retransmitting data
  1512  	// then use the conservative timer described in RFC6675 Section 6.0,
  1513  	// otherwise follow the standard time described in RFC6298 Section 5.1.
  1514  	if err != nil && seg.data.Size() != 0 {
  1515  		if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted {
  1516  			s.resendTimer.enable(s.RTO)
  1517  		} else {
  1518  			if !s.resendTimer.enabled() {
  1519  				s.resendTimer.enable(s.RTO)
  1520  			}
  1521  		}
  1522  	}
  1523  
  1524  	return err
  1525  }
  1526  
  1527  // sendSegmentFromView sends a new segment containing the given payload, flags
  1528  // and sequence number.
  1529  func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
  1530  	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
  1531  	if seq == s.RTTMeasureSeqNum {
  1532  		s.RTTMeasureTime = s.LastSendTime
  1533  	}
  1534  
  1535  	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
  1536  
  1537  	// Remember the max sent ack.
  1538  	s.MaxSentAck = rcvNxt
  1539  
  1540  	return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd)
  1541  }
  1542  
  1543  // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
  1544  // currently.
  1545  func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
  1546  	// Data packets are unlikely to be part of an ACK loop. So always send
  1547  	// an ACK for a packet w/ data.
  1548  	if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
  1549  		s.sendAck()
  1550  	}
  1551  }