// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"fmt"
	"math"
	"sort"
	"time"

	"github.com/SagerNet/gvisor/pkg/sleep"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/tcpip"
	"github.com/SagerNet/gvisor/pkg/tcpip/buffer"
	"github.com/SagerNet/gvisor/pkg/tcpip/header"
	"github.com/SagerNet/gvisor/pkg/tcpip/seqnum"
	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
)

const (
	// MinRTO is the minimum allowed value for the retransmit timeout.
	MinRTO = 200 * time.Millisecond

	// MaxRTO is the maximum allowed value for the retransmit timeout.
	MaxRTO = 120 * time.Second

	// InitialCwnd is the initial congestion window. It is expressed in
	// packets: the sender compares it against its packet-counted congestion
	// window and outstanding-packet count.
	InitialCwnd = 10

	// nDupAckThreshold is the number of duplicate ACKs required
	// before fast-retransmit is entered.
	nDupAckThreshold = 3

	// MaxRetries is the maximum number of probe retries sender does
	// before timing out the connection.
	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
	MaxRetries = 15
)

// congestionControl is an interface that must be implemented by any supported
// congestion control algorithm.
type congestionControl interface {
	// HandleLossDetected is invoked when the loss is detected by RACK or
	// sender.dupAckCount >= nDupAckThreshold just before entering fast
	// retransmit.
	HandleLossDetected()

	// HandleRTOExpired is invoked when the retransmit timer expires.
	HandleRTOExpired()

	// Update is invoked when processing inbound acks. It's passed the
	// number of packets that were acked by the most recent cumulative
	// acknowledgement.
	Update(packetsAcked int)

	// PostRecovery is invoked when the sender is exiting a fast retransmit/
	// recovery phase. This provides congestion control algorithms a way
	// to adjust their state when exiting recovery.
	PostRecovery()
}

// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted. The cumulative or selective ACK is passed along
	// with the flag which identifies whether the connection entered fast
	// retransmit with this ACK and to retransmit the first unacknowledged
	// segment.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}

// sender holds the state necessary to send TCP segments.
//
// +stateify savable
type sender struct {
	stack.TCPSenderState
	ep *endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeList is the queue of segments handled by the sender and
	// writeNext is the segment in it from which sendData resumes sending.
	// resendTimer is the retransmit timer; newSender wires it to
	// resendWaker.
	writeNext   *segment
	writeList   segmentList
	resendTimer timer       `state:"nosave"`
	resendWaker sleep.Waker `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer       `state:"nosave"`
	reorderWaker sleep.Waker `state:"nosave"`

	// probeTimer and probeWaker are used to schedule PTO for RACK TLP algorithm.
	probeTimer timer       `state:"nosave"`
	probeWaker sleep.Waker `state:"nosave"`
}

// rtt is a synchronization wrapper used to appease stateify. See the comment
// in sender, where it is used.
//
// +stateify savable
type rtt struct {
	sync.Mutex `state:"nosave"`

	stack.TCPRTTState
}

// newSender creates the sender for ep.
//
// iss and irs are the initial send and receive sequence numbers, sndWnd is
// the initial send window, mss is the maximum segment size before TCP options
// are subtracted, and sndWndScale is the send window scale (stored only when
// positive). The minimum/maximum RTO and the retry limit are read from the
// stack-wide TCP options; newSender panics if any of them cannot be read.
func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
	// The sender MUST reduce the TCP data length to account for any IP or
	// TCP options that it is including in the packets that it sends.
	// See: https://tools.ietf.org/html/rfc6691#section-2
	maxPayloadSize := int(mss) - ep.maxOptionSize()

	s := &sender{
		ep: ep,
		TCPSenderState: stack.TCPSenderState{
			SndWnd:           sndWnd,
			SndUna:           iss + 1,
			SndNxt:           iss + 1,
			RTTMeasureSeqNum: iss + 1,
			LastSendTime:     ep.stack.Clock().NowMonotonic(),
			MaxPayloadSize:   maxPayloadSize,
			MaxSentAck:       irs + 1,
			FastRecovery: stack.TCPFastRecoveryState{
				// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
				Last:      iss,
				HighRxt:   iss,
				RescueRxt: iss,
			},
			RTO: 1 * time.Second,
		},
		gso: ep.gso.Type != stack.GSONone,
	}

	if s.gso {
		s.ep.gso.MSS = uint16(maxPayloadSize)
	}

	s.cc = s.initCongestionControl(ep.cc)
	s.lr = s.initLossRecovery()
	s.rc.init(s, iss)

	// A negative sndWndScale means that no scaling is in use, otherwise we
	// store the scaling value.
	if sndWndScale > 0 {
		s.SndWndScale = uint8(sndWndScale)
	}

	s.resendTimer.init(s.ep.stack.Clock(), &s.resendWaker)
	s.reorderTimer.init(s.ep.stack.Clock(), &s.reorderWaker)
	s.probeTimer.init(s.ep.stack.Clock(), &s.probeWaker)

	// A count of zero tells updateMaxPayloadSize that there is nothing to
	// retransmit yet; it only lowers MaxPayloadSize if the route MTU
	// requires it.
	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)

	// Initialize SACK Scoreboard after updating max payload size as we use
	// the maxPayloadSize as the smss when determining if a segment is lost
	// etc.
	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)

	// Get Stack wide config.
	var minRTO tcpip.TCPMinRTOOption
	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
	}
	s.minRTO = time.Duration(minRTO)

	var maxRTO tcpip.TCPMaxRTOOption
	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
	}
	s.maxRTO = time.Duration(maxRTO)

	var maxRetries tcpip.TCPMaxRetriesOption
	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
	}
	s.maxRetries = uint32(maxRetries)

	return s
}

// initCongestionControl initializes the specified congestion control module and
// returns a handle to it. It also initializes the sndCwnd and sndSsThresh to
// their initial values.
//
// Any name other than ccCubic selects the Reno implementation.
func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
	s.SndCwnd = InitialCwnd
	// Set sndSsthresh to the maximum int value, which depends on the
	// platform.
	s.Ssthresh = int(^uint(0) >> 1)

	switch congestionControlName {
	case ccCubic:
		return newCubicCC(s)
	case ccReno:
		fallthrough
	default:
		return newRenoCC(s)
	}
}

// initLossRecovery initiates the loss recovery algorithm for the sender:
// SACK-based recovery when the endpoint has SACK permitted, Reno recovery
// otherwise.
func (s *sender) initLossRecovery() lossRecovery {
	if s.ep.SACKPermitted {
		return newSACKRecovery(s)
	}
	return newRenoRecovery(s)
}

// updateMaxPayloadSize updates the maximum payload size based on the given
// MTU. If this is in response to "packet too big" control packets (indicated
// by the count argument), it also reduces the number of outstanding packets and
// attempts to retransmit the first packet above the MTU size.
264 func (s *sender) updateMaxPayloadSize(mtu, count int) { 265 m := mtu - header.TCPMinimumSize 266 267 m -= s.ep.maxOptionSize() 268 269 // We don't adjust up for now. 270 if m >= s.MaxPayloadSize { 271 return 272 } 273 274 // Make sure we can transmit at least one byte. 275 if m <= 0 { 276 m = 1 277 } 278 279 oldMSS := s.MaxPayloadSize 280 s.MaxPayloadSize = m 281 if s.gso { 282 s.ep.gso.MSS = uint16(m) 283 } 284 285 if count == 0 { 286 // updateMaxPayloadSize is also called when the sender is created. 287 // and there is no data to send in such cases. Return immediately. 288 return 289 } 290 291 // Update the scoreboard's smss to reflect the new lowered 292 // maxPayloadSize. 293 s.ep.scoreboard.smss = uint16(m) 294 295 s.Outstanding -= count 296 if s.Outstanding < 0 { 297 s.Outstanding = 0 298 } 299 300 // Rewind writeNext to the first segment exceeding the MTU. Do nothing 301 // if it is already before such a packet. 302 nextSeg := s.writeNext 303 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 304 if seg == s.writeNext { 305 // We got to writeNext before we could find a segment 306 // exceeding the MTU. 307 break 308 } 309 310 if nextSeg == s.writeNext && seg.data.Size() > m { 311 // We found a segment exceeding the MTU. Rewind 312 // writeNext and try to retransmit it. 313 nextSeg = seg 314 } 315 316 if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 317 // Update sackedOut for new maximum payload size. 318 s.SackedOut -= s.pCount(seg, oldMSS) 319 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 320 } 321 } 322 323 // Since we likely reduced the number of outstanding packets, we may be 324 // ready to send some more. 325 s.writeNext = nextSeg 326 s.sendData() 327 } 328 329 // sendAck sends an ACK segment. 330 func (s *sender) sendAck() { 331 s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.SndNxt) 332 } 333 334 // updateRTO updates the retransmit timeout when a new roud-trip time is 335 // available. 
This is done in accordance with section 2 of RFC 6298. 336 func (s *sender) updateRTO(rtt time.Duration) { 337 s.rtt.Lock() 338 if !s.rtt.TCPRTTState.SRTTInited { 339 s.rtt.TCPRTTState.RTTVar = rtt / 2 340 s.rtt.TCPRTTState.SRTT = rtt 341 s.rtt.TCPRTTState.SRTTInited = true 342 } else { 343 diff := s.rtt.TCPRTTState.SRTT - rtt 344 if diff < 0 { 345 diff = -diff 346 } 347 // Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when 348 // no timestamps are available. 349 if !s.ep.SendTSOk { 350 s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4 351 s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8 352 } else { 353 // When we are taking RTT measurements of every ACK then 354 // we need to use a modified method as specified in 355 // https://tools.ietf.org/html/rfc7323#appendix-G 356 if s.Outstanding == 0 { 357 s.rtt.Unlock() 358 return 359 } 360 // Netstack measures congestion window/inflight all in 361 // terms of packets and not bytes. This is similar to 362 // how linux also does cwnd and inflight. In practice 363 // this approximation works as expected. 364 expectedSamples := math.Ceil(float64(s.Outstanding) / 2) 365 366 // alpha & beta values are the original values as recommended in 367 // https://tools.ietf.org/html/rfc6298#section-2.3. 368 const alpha = 0.125 369 const beta = 0.25 370 371 alphaPrime := alpha / expectedSamples 372 betaPrime := beta / expectedSamples 373 rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds() 374 srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds() 375 s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second)) 376 s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second)) 377 } 378 } 379 380 s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar 381 s.rtt.Unlock() 382 if s.RTO < s.minRTO { 383 s.RTO = s.minRTO 384 } 385 } 386 387 // resendSegment resends the first unacknowledged segment. 
388 func (s *sender) resendSegment() { 389 // Don't use any segments we already sent to measure RTT as they may 390 // have been affected by packets being lost. 391 s.RTTMeasureSeqNum = s.SndNxt 392 393 // Resend the segment. 394 if seg := s.writeList.Front(); seg != nil { 395 if seg.data.Size() > s.MaxPayloadSize { 396 s.splitSeg(seg, s.MaxPayloadSize) 397 } 398 399 // See: RFC 6675 section 5 Step 4.3 400 // 401 // To prevent retransmission, set both the HighRXT and RescueRXT 402 // to the highest sequence number in the retransmitted segment. 403 s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 404 s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 405 s.sendSegment(seg) 406 s.ep.stack.Stats().TCP.FastRetransmit.Increment() 407 s.ep.stats.SendErrors.FastRetransmit.Increment() 408 409 // Run SetPipe() as per RFC 6675 section 5 Step 4.4 410 s.SetPipe() 411 } 412 } 413 414 // retransmitTimerExpired is called when the retransmit timer expires, and 415 // unacknowledged segments are assumed lost, and thus need to be resent. 416 // Returns true if the connection is still usable, or false if the connection 417 // is deemed lost. 418 func (s *sender) retransmitTimerExpired() bool { 419 // Check if the timer actually expired or if it's a spurious wake due 420 // to a previously orphaned runtime timer. 421 if !s.resendTimer.checkExpiration() { 422 return true 423 } 424 425 // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases 426 // when writeList is empty. Remove this once we have a proper fix for this 427 // issue. 428 if s.writeList.Front() == nil { 429 return true 430 } 431 432 s.ep.stack.Stats().TCP.Timeouts.Increment() 433 s.ep.stats.SendErrors.Timeouts.Increment() 434 435 // Set TLPRxtOut to false according to 436 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. 
437 s.rc.tlpRxtOut = false 438 439 // Give up if we've waited more than a minute since the last resend or 440 // if a user time out is set and we have exceeded the user specified 441 // timeout since the first retransmission. 442 uto := s.ep.userTimeout 443 444 if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { 445 // We store the original xmitTime of the segment that we are 446 // about to retransmit as the retransmission time. This is 447 // required as by the time the retransmitTimer has expired the 448 // segment has already been sent and unacked for the RTO at the 449 // time the segment was sent. 450 s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime 451 } 452 453 elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime) 454 remaining := s.maxRTO 455 if uto != 0 { 456 // Cap to the user specified timeout if one is specified. 457 remaining = uto - elapsed 458 } 459 460 // Always honor the user-timeout irrespective of whether the zero 461 // window probes were acknowledged. 462 // net/ipv4/tcp_timer.c::tcp_probe_timer() 463 if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { 464 return false 465 } 466 467 // Set new timeout. The timer will be restarted by the call to sendData 468 // below. 469 s.RTO *= 2 470 // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 471 if s.RTO > s.maxRTO { 472 s.RTO = s.maxRTO 473 } 474 475 // Cap RTO to remaining time. 476 if s.RTO > remaining { 477 s.RTO = remaining 478 } 479 480 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. 481 // 482 // Retransmit timeouts: 483 // After a retransmit timeout, record the highest sequence number 484 // transmitted in the variable recover, and exit the fast recovery 485 // procedure if applicable. 486 s.FastRecovery.Last = s.SndNxt - 1 487 488 if s.FastRecovery.Active { 489 // We were attempting fast recovery but were not successful. 490 // Leave the state. 
We don't need to update ssthresh because it 491 // has already been updated when entered fast-recovery. 492 s.leaveRecovery() 493 } 494 495 s.state = tcpip.RTORecovery 496 s.cc.HandleRTOExpired() 497 498 // Mark the next segment to be sent as the first unacknowledged one and 499 // start sending again. Set the number of outstanding packets to 0 so 500 // that we'll be able to retransmit. 501 // 502 // We'll keep on transmitting (or retransmitting) as we get acks for 503 // the data we transmit. 504 s.Outstanding = 0 505 506 // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 507 // 508 // In order to avoid memory deadlocks, the TCP receiver is allowed to 509 // discard data that has already been selectively acknowledged. As a 510 // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK 511 // information gathered from a receiver upon a retransmission timeout 512 // (RTO) "since the timeout might indicate that the data receiver has 513 // reneged." Additionally, a TCP sender MUST "ignore prior SACK 514 // information in determining which data to retransmit." 515 // 516 // NOTE: We take the stricter interpretation and just expunge all 517 // information as we lack more rigorous checks to validate if the SACK 518 // information is usable after an RTO. 519 s.ep.scoreboard.Reset() 520 s.writeNext = s.writeList.Front() 521 522 // RFC 1122 4.2.2.17: Start sending zero window probes when we still see a 523 // zero receive window after retransmission interval and we have data to 524 // send. 525 if s.zeroWindowProbing { 526 s.sendZeroWindowProbe() 527 // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed 528 // indefinitely. As long as the receiving TCP continues to send 529 // acknowledgments in response to the probe segments, the sending TCP 530 // MUST allow the connection to stay open. 
531 return true 532 } 533 534 seg := s.writeNext 535 // RFC 1122 4.2.3.5: Close the connection when the number of 536 // retransmissions for this segment is beyond a limit. 537 if seg != nil && seg.xmitCount > s.maxRetries { 538 return false 539 } 540 541 s.sendData() 542 543 return true 544 } 545 546 // pCount returns the number of packets in the segment. Due to GSO, a segment 547 // can be composed of multiple packets. 548 func (s *sender) pCount(seg *segment, maxPayloadSize int) int { 549 size := seg.data.Size() 550 if size == 0 { 551 return 1 552 } 553 554 return (size-1)/maxPayloadSize + 1 555 } 556 557 // splitSeg splits a given segment at the size specified and inserts the 558 // remainder as a new segment after the current one in the write list. 559 func (s *sender) splitSeg(seg *segment, size int) { 560 if seg.data.Size() <= size { 561 return 562 } 563 // Split this segment up. 564 nSeg := seg.clone() 565 nSeg.data.TrimFront(size) 566 nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) 567 s.writeList.InsertAfter(seg, nSeg) 568 569 // The segment being split does not carry PUSH flag because it is 570 // followed by the newly split segment. 571 // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered 572 // segment (i.e., when there is no more queued data to be sent). 573 // Linux removes PSH flag only when the segment is being split over MSS 574 // and retains it when we are splitting the segment over lack of sender 575 // window space. 576 // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() 577 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() 578 if seg.data.Size() > s.MaxPayloadSize { 579 seg.flags ^= header.TCPFlagPsh 580 } 581 582 seg.data.CapLength(size) 583 } 584 585 // NextSeg implements the RFC6675 NextSeg() operation. 586 // 587 // NextSeg starts scanning the writeList starting from nextSegHint and returns 588 // the hint to be passed on the next call to NextSeg. 
// This is required to avoid
// iterating the write list repeatedly when NextSeg is invoked in a loop during
// recovery. The returned hint will be nil if there are no more segments that
// can match rules defined by NextSeg operation in RFC6675.
//
// rescueRtx will be true only if nextSeg is a rescue retransmission as
// described by Step 4) of the NextSeg algorithm.
func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
	// s3 and s4 remember the best candidates for rules (3) and (4) while
	// the scan looks for a rule (1) match.
	var s3 *segment
	var s4 *segment
	// Step 1.
	for seg := nextSegHint; seg != nil; seg = seg.Next() {
		// Stop iteration if we hit a segment that has never been
		// transmitted (i.e. either it has no assigned sequence number
		// or if it does have one, it's >= the next sequence number
		// to be sent [i.e. >= s.sndNxt]).
		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
			hint = nil
			break
		}
		segSeq := seg.sequenceNumber
		// Make sure the candidate is no larger than the scoreboard's
		// SMSS before it is evaluated.
		if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) {
			s.splitSeg(seg, int(smss))
		}

		// See RFC 6675 Section 4
		//
		//     1. If there exists a smallest unSACKED sequence number
		//     'S2' that meets the following 3 criteria for determining
		//     loss, the sequence range of one segment of up to SMSS
		//     octets starting with S2 MUST be returned.
		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
			// NextSeg():
			//
			//    (1.a) S2 is greater than HighRxt
			//    (1.b) S2 is less than highest octet covered by
			//    any received SACK.
			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
				// NextSeg():
				//     (1.c) IsLost(S2) returns true.
				if s.ep.scoreboard.IsLost(segSeq) {
					return seg, seg.Next(), false
				}

				// NextSeg():
				//
				// (3): If the conditions for rules (1) and (2)
				// fail, but there exists an unSACKed sequence
				// number S3 that meets the criteria for
				// detecting loss given in steps 1.a and 1.b
				// above (specifically excluding (1.c)) then one
				// segment of upto SMSS octets starting with S3
				// SHOULD be returned.
				if s3 == nil {
					s3 = seg
					hint = seg.Next()
				}
			}
			// NextSeg():
			//
			//     (4) If the conditions for (1), (2) and (3) fail,
			//     but there exists outstanding unSACKED data, we
			//     provide the opportunity for a single "rescue"
			//     retransmission per entry into loss recovery. If
			//     HighACK is greater than RescueRxt (or RescueRxt
			//     is undefined), then one segment of upto SMSS
			//     octets that MUST include the highest outstanding
			//     unSACKed sequence number SHOULD be returned, and
			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
			//     be updated.
			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
				// Keep the unSACKed segment with the highest
				// sequence number seen so far.
				if s4 != nil {
					if s4.sequenceNumber.LessThan(segSeq) {
						s4 = seg
					}
				} else {
					s4 = seg
				}
			}
		}
	}

	// If we got here then no segment matched step (1).
	// Step (2): "If no sequence number 'S2' per rule (1)
	//            exists but there exists available unsent data and the
	//            receiver's advertised window allows, the sequence
	//            range of one segment of up to SMSS octets of
	//            previously unsent data starting with sequence number
	//            HighData+1 MUST be returned."
	for seg := s.writeNext; seg != nil; seg = seg.Next() {
		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
			continue
		}
		// We do not split the segment here to <= smss as it has
		// potentially not been assigned a sequence number yet.
		return seg, nil, false
	}

	if s3 != nil {
		return s3, hint, false
	}

	// Rule (4) candidate; s4 may be nil, in which case nextSeg is nil.
	return s4, nil, true
}

// maybeSendSegment tries to send the specified segment and either coalesces
// other segments into this one or splits the specified segment based on the
// lower of the specified limit value or the receivers window size specified by
// end.
//
// It returns false, without sending anything, when the segment is held back
// or does not fit in the window.
func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
	// We abuse the flags field to determine if we have already
	// assigned a sequence number to this segment.
	if !s.isAssignedSequenceNumber(seg) {
		// Merge segments if allowed.
		if seg.data.Size() != 0 {
			available := int(s.SndNxt.Size(end))
			if available > limit {
				available = limit
			}

			// nextTooBig indicates that the next segment was too
			// large to entirely fit in the current segment. It
			// would be possible to split the next segment and merge
			// the portion that fits, but unexpectedly splitting
			// segments can have user visible side-effects which can
			// break applications. For example, RFC 7766 section 8
			// says that the length and data of a DNS response
			// should be sent in the same TCP segment to avoid
			// triggering bugs in poorly written DNS
			// implementations.
			var nextTooBig bool
			// nSeg is re-read from seg.Next() on every iteration
			// because the previous nSeg has been removed from the
			// list.
			for nSeg := seg.Next(); nSeg != nil && nSeg.data.Size() != 0; nSeg = seg.Next() {
				if seg.data.Size()+nSeg.data.Size() > available {
					nextTooBig = true
					break
				}
				seg.merge(nSeg)
				s.writeList.Remove(nSeg)
				nSeg.decRef()
			}
			if !nextTooBig && seg.data.Size() < available {
				// Segment is not full.
				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
					// Nagle's algorithm. From Wikipedia:
					//   Nagle's algorithm works by
					//   combining a number of small
					//   outgoing messages and sending them
					//   all at once. Specifically, as long
					//   as there is a sent packet for which
					//   the sender has received no
					//   acknowledgment, the sender should
					//   keep buffering its output until it
					//   has a full packet's worth of
					//   output, thus allowing output to be
					//   sent all at once.
					return false
				}
				// With TCP_CORK, hold back until minimum of the available
				// send space and MSS.
				// TODO(github.com/SagerNet/issue/2833): Drain the held segments after a
				// timeout.
				if seg.data.Size() < s.MaxPayloadSize && s.ep.ops.GetCorkOption() {
					return false
				}
			}
		}

		// Assign flags. We don't do it above so that we can merge
		// additional data if Nagle holds the segment.
		seg.sequenceNumber = s.SndNxt
		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
	}

	var segEnd seqnum.Value
	if seg.data.Size() == 0 {
		// A segment without data is a FIN.
		if s.writeList.Back() != seg {
			panic("FIN segments must be the final segment in the write list.")
		}
		seg.flags = header.TCPFlagAck | header.TCPFlagFin
		segEnd = seg.sequenceNumber.Add(1)
		// Update the state to reflect that we have now
		// queued a FIN.
		switch s.ep.EndpointState() {
		case StateCloseWait:
			s.ep.setEndpointState(StateLastAck)
		default:
			s.ep.setEndpointState(StateFinWait1)
		}
	} else {
		// We're sending a non-FIN segment.
		if seg.flags&header.TCPFlagFin != 0 {
			panic("Netstack queues FIN segments without data.")
		}

		if !seg.sequenceNumber.LessThan(end) {
			return false
		}

		available := int(seg.sequenceNumber.Size(end))
		if available == 0 {
			return false
		}

		// If the whole segment or at least 1MSS sized segment cannot
		// be accommodated in the receiver advertised window, skip
		// splitting and sending of the segment. ref:
		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
		//
		// Linux checks this for all segment transmits not triggered by
		// a probe timer. On this condition, it defers the segment split
		// and transmit to a short probe timer.
		//
		// ref: include/net/tcp.h::tcp_check_probe_timer()
		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
		//
		// Instead of defining a new transmit timer, we attempt to split
		// the segment right here if there are no pending segments. If
		// there are pending segments, segment transmits are deferred to
		// the retransmit timer handler.
		if s.SndUna != s.SndNxt {
			switch {
			case available >= seg.data.Size():
				// OK to send, the whole segments fits in the
				// receiver's advertised window.
			case available >= s.MaxPayloadSize:
				// OK to send, at least 1 MSS sized segment fits
				// in the receiver's advertised window.
			default:
				return false
			}
		}

		// The segment size limit is computed as a function of sender
		// congestion window and MSS. When sender congestion window is >
		// 1, this limit can be larger than MSS. Ensure that the
		// currently available send space is not greater than minimum of
		// this limit and MSS.
		if available > limit {
			available = limit
		}

		// If GSO is not in use then cap available to
		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
		// the host GSO logic will cap the segment to the correct size.
		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
			available = s.MaxPayloadSize
		}

		if seg.data.Size() > available {
			s.splitSeg(seg, available)
		}

		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
	}

	s.sendSegment(seg)

	// Update sndNxt if we actually sent new data (as opposed to
	// retransmitting some previously sent data).
	if s.SndNxt.LessThan(segEnd) {
		s.SndNxt = segEnd
	}

	return true
}

// sendZeroWindowProbe sends a single zero window probe, counts it as
// unacknowledged and rearms the resend timer with the current RTO.
func (s *sender) sendZeroWindowProbe() {
	ack, win := s.ep.rcv.getSendParams()
	s.unackZeroWindowProbes++
	// Send a zero window probe with sequence number pointing to
	// the last acknowledged byte.
	s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.SndUna-1, ack, win)
	// Rearm the timer to continue probing.
	s.resendTimer.enable(s.RTO)
}

// enableZeroWindowProbing marks the sender as probing a zero window and arms
// the resend timer with the current RTO. If no retransmission start time has
// been recorded yet, the current time is recorded.
func (s *sender) enableZeroWindowProbing() {
	s.zeroWindowProbing = true
	// We piggyback the probing on the retransmit timer with the
	// current retransmission interval, as we may start probing while
	// segment retransmissions.
	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
	}
	s.resendTimer.enable(s.RTO)
}

// disableZeroWindowProbing stops zero window probing: it clears the probing
// flag, the unacknowledged probe count and the recorded retransmission start
// time, and disables the resend timer.
func (s *sender) disableZeroWindowProbing() {
	s.zeroWindowProbing = false
	s.unackZeroWindowProbes = 0
	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
	s.resendTimer.disable()
}

// postXmit updates the keepalive, zero window probe, PTO and resend timers
// after a transmit attempt.
//
// dataSent reports whether the attempt sent anything. shouldScheduleProbe
// allows a PTO to be scheduled when shouldSchedulePTO agrees.
func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
	if dataSent {
		// We sent data, so we should stop the keepalive timer to ensure
		// that no keepalives are sent while there is pending data.
		s.ep.disableKeepaliveTimer()
	}

	// If the remote has advertised a zero receive window and we have
	// data to be sent out, start zero window probing to query the
	// remote for its receive window size.
	if s.writeNext != nil && s.SndWnd == 0 {
		s.enableZeroWindowProbing()
	}

	// If we have no more pending data, start the keepalive timer.
	if s.SndUna == s.SndNxt {
		s.ep.resetKeepaliveTimer(false)
	} else {
		// Enable timers if we have pending data.
		if shouldScheduleProbe && s.shouldSchedulePTO() {
			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
			s.schedulePTO()
		} else if !s.resendTimer.enabled() {
			s.probeTimer.disable()
			if s.Outstanding > 0 {
				// Enable the resend timer if it's not enabled yet and there is
				// outstanding data.
				s.resendTimer.enable(s.RTO)
			}
		}
	}
}

// sendData sends new data segments. It is called when data becomes available or
// when the send window opens up.
func (s *sender) sendData() {
	// limit is the largest segment, in bytes, that may be handed to
	// maybeSendSegment.
	limit := s.MaxPayloadSize
	if s.gso {
		limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize)
	}
	// end is the first sequence number past the send window.
	end := s.SndUna.Add(s.SndWnd)

	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
	// "A TCP SHOULD set cwnd to no more than RW before beginning
	// transmission if the TCP has not sent data in the interval exceeding
	// the retransmission timeout."
	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
		if s.SndCwnd > InitialCwnd {
			s.SndCwnd = InitialCwnd
		}
	}

	var dataSent bool
	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
		// Room left in the congestion window, converted from packets
		// to bytes.
		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
		if cwndLimit < limit {
			limit = cwndLimit
		}
		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
			// Move writeNext along so that we don't try and scan data that
			// has already been SACKED.
			s.writeNext = seg.Next()
			continue
		}
		if sent := s.maybeSendSegment(seg, limit, end); !sent {
			break
		}
		dataSent = true
		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
		s.writeNext = seg.Next()
	}

	s.postXmit(dataSent, true /* shouldScheduleProbe */)
}

// enterRecovery puts the sender into recovery and records the state needed
// while recovering. The congestion control state becomes SACKRecovery when
// the endpoint has SACK permitted and FastRecovery otherwise.
func (s *sender) enterRecovery() {
	s.FastRecovery.Active = true
	// Save state to reflect we're now in fast recovery.
	//
	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
	// We inflate the cwnd by 3 to account for the 3 packets which triggered
	// the 3 duplicate ACKs and are now not in flight.
	s.SndCwnd = s.Ssthresh + 3
	s.SackedOut = 0
	s.DupAckCount = 0
	s.FastRecovery.First = s.SndUna
	s.FastRecovery.Last = s.SndNxt - 1
	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
	s.FastRecovery.HighRxt = s.SndUna
	s.FastRecovery.RescueRxt = s.SndUna
	if s.ep.SACKPermitted {
		s.state = tcpip.SACKRecovery
		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
		// Set TLPRxtOut to false according to
		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
		if s.rc.tlpRxtOut {
			// The tail loss probe triggered recovery.
			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
		}
		s.rc.tlpRxtOut = false
		return
	}
	s.state = tcpip.FastRecovery
	s.ep.stack.Stats().TCP.FastRecovery.Increment()
}

// leaveRecovery takes the sender out of recovery, resets the congestion
// window to ssthresh and notifies the congestion control algorithm.
func (s *sender) leaveRecovery() {
	s.FastRecovery.Active = false
	s.FastRecovery.MaxCwnd = 0
	s.DupAckCount = 0

	// Deflate cwnd. It had been artificially inflated when new dups arrived.
	s.SndCwnd = s.Ssthresh
	s.cc.PostRecovery()
}

// isAssignedSequenceNumber relies on the fact that we only set flags once a
// sequencenumber is assigned and that is only done right before we send the
// segment. As a result any segment that has a non-zero flag has a valid
// sequence number assigned to it.
func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
	return seg.flags != 0
}

// SetPipe implements the SetPipe() function described in RFC6675. Netstack
// maintains the congestion window in number of packets and not bytes, so
// SetPipe() here measures number of outstanding packets rather than actual
// outstanding bytes in the network.
func (s *sender) SetPipe() {
	// If SACK isn't permitted or it is permitted but recovery is not active
	// then ignore pipe calculations.
1013 if !s.ep.SACKPermitted || !s.FastRecovery.Active { 1014 return 1015 } 1016 pipe := 0 1017 smss := seqnum.Size(s.ep.scoreboard.SMSS()) 1018 for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { 1019 // With GSO each segment can be much larger than SMSS. So check the segment 1020 // in SMSS sized ranges. 1021 segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size())) 1022 for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { 1023 endSeq := startSeq.Add(smss) 1024 if segEnd.LessThan(endSeq) { 1025 endSeq = segEnd 1026 } 1027 sb := header.SACKBlock{Start: startSeq, End: endSeq} 1028 // SetPipe(): 1029 // 1030 // After initializing pipe to zero, the following steps are 1031 // taken for each octet 'S1' in the sequence space between 1032 // HighACK and HighData that has not been SACKed: 1033 if !s1.sequenceNumber.LessThan(s.SndNxt) { 1034 break 1035 } 1036 if s.ep.scoreboard.IsSACKED(sb) { 1037 continue 1038 } 1039 1040 // SetPipe(): 1041 // 1042 // (a) If IsLost(S1) returns false, Pipe is incremened by 1. 1043 // 1044 // NOTE: here we mark the whole segment as lost. We do not try 1045 // and test every byte in our write buffer as we maintain our 1046 // pipe in terms of oustanding packets and not bytes. 1047 if !s.ep.scoreboard.IsRangeLost(sb) { 1048 pipe++ 1049 } 1050 // SetPipe(): 1051 // (b) If S1 <= HighRxt, Pipe is incremented by 1. 1052 if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) { 1053 pipe++ 1054 } 1055 } 1056 } 1057 s.Outstanding = pipe 1058 } 1059 1060 // shouldEnterRecovery returns true if the sender should enter fast recovery 1061 // based on dupAck count and sack scoreboard. 1062 // See RFC 6675 section 5. 
1063 func (s *sender) shouldEnterRecovery() bool { 1064 return s.DupAckCount >= nDupAckThreshold || 1065 (s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna)) 1066 } 1067 1068 // detectLoss is called when an ack is received and returns whether a loss is 1069 // detected. It manages the state related to duplicate acks and determines if 1070 // a retransmit is needed according to the rules in RFC 6582 (NewReno). 1071 func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { 1072 // We're not in fast recovery yet. 1073 1074 // If RACK is enabled and there is no reordering we should honor the 1075 // three duplicate ACK rule to enter recovery. 1076 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4 1077 if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1078 if s.rc.Reord { 1079 return false 1080 } 1081 } 1082 1083 if !s.isDupAck(seg) { 1084 s.DupAckCount = 0 1085 return false 1086 } 1087 1088 s.DupAckCount++ 1089 1090 // Do not enter fast recovery until we reach nDupAckThreshold or the 1091 // first unacknowledged byte is considered lost as per SACK scoreboard. 1092 if !s.shouldEnterRecovery() { 1093 // RFC 6675 Step 3. 1094 s.FastRecovery.HighRxt = s.SndUna - 1 1095 // Do run SetPipe() to calculate the outstanding segments. 1096 s.SetPipe() 1097 s.state = tcpip.Disorder 1098 return false 1099 } 1100 1101 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 1102 // 1103 // We only do the check here, the incrementing of last to the highest 1104 // sequence number transmitted till now is done when enterRecovery 1105 // is invoked. 1106 // 1107 // Note that we only enter recovery when at least one more byte of data 1108 // beyond s.fr.last (the highest byte that was outstanding when fast 1109 // retransmit was last entered) is acked. 
1110 if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) { 1111 s.DupAckCount = 0 1112 return false 1113 } 1114 s.cc.HandleLossDetected() 1115 s.enterRecovery() 1116 return true 1117 } 1118 1119 // isDupAck determines if seg is a duplicate ack as defined in 1120 // https://tools.ietf.org/html/rfc5681#section-2. 1121 func (s *sender) isDupAck(seg *segment) bool { 1122 // A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883] 1123 // can leverage the SACK information to determine when an incoming ACK is a 1124 // "duplicate" (e.g., if the ACK contains previously unknown SACK 1125 // information). 1126 if s.ep.SACKPermitted && !seg.hasNewSACKInfo { 1127 return false 1128 } 1129 1130 // (a) The receiver of the ACK has outstanding data. 1131 return s.SndUna != s.SndNxt && 1132 // (b) The incoming acknowledgment carries no data. 1133 seg.logicalLen() == 0 && 1134 // (c) The SYN and FIN bits are both off. 1135 !seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) && 1136 // (d) the ACK number is equal to the greatest acknowledgment received on 1137 // the given connection (TCP.UNA from RFC793). 1138 seg.ackNumber == s.SndUna && 1139 // (e) the advertised window in the incoming acknowledgment equals the 1140 // advertised window in the last incoming acknowledgment. 1141 s.SndWnd == seg.window 1142 } 1143 1144 // Iterate the writeList and update RACK for each segment which is newly acked 1145 // either cumulatively or selectively. Loop through the segments which are 1146 // sacked, and update the RACK related variables and check for reordering. 1147 // 1148 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 1149 // steps 2 and 3. 1150 func (s *sender) walkSACK(rcvdSeg *segment) { 1151 s.rc.setDSACKSeen(false) 1152 1153 // Look for DSACK block. 
1154 idx := 0 1155 n := len(rcvdSeg.parsedOptions.SACKBlocks) 1156 if checkDSACK(rcvdSeg) { 1157 s.rc.setDSACKSeen(true) 1158 idx = 1 1159 n-- 1160 } 1161 1162 if n == 0 { 1163 return 1164 } 1165 1166 // Sort the SACK blocks. The first block is the most recent unacked 1167 // block. The following blocks can be in arbitrary order. 1168 sackBlocks := make([]header.SACKBlock, n) 1169 copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:]) 1170 sort.Slice(sackBlocks, func(i, j int) bool { 1171 return sackBlocks[j].Start.LessThan(sackBlocks[i].Start) 1172 }) 1173 1174 seg := s.writeList.Front() 1175 for _, sb := range sackBlocks { 1176 for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 { 1177 if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked { 1178 s.rc.update(seg, rcvdSeg) 1179 s.rc.detectReorder(seg) 1180 seg.acked = true 1181 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 1182 } 1183 seg = seg.Next() 1184 } 1185 } 1186 } 1187 1188 // checkDSACK checks if a DSACK is reported. 1189 func checkDSACK(rcvdSeg *segment) bool { 1190 n := len(rcvdSeg.parsedOptions.SACKBlocks) 1191 if n == 0 { 1192 return false 1193 } 1194 1195 sb := rcvdSeg.parsedOptions.SACKBlocks[0] 1196 // Check if SACK block is invalid. 1197 if sb.End.LessThan(sb.Start) { 1198 return false 1199 } 1200 1201 // See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in 1202 // at most one SACK block. DSACK is detected in the below two cases: 1203 // * If the SACK sequence space is less than this cumulative ACK, it is 1204 // an indication that the segment identified by the SACK block has 1205 // been received more than once by the receiver. 1206 // * If the sequence space in the first SACK block is greater than the 1207 // cumulative ACK, then the sender next compares the sequence space 1208 // in the first SACK block with the sequence space in the second SACK 1209 // block, if there is one. 
This comparison can determine if the first 1210 // SACK block is reporting duplicate data that lies above the 1211 // cumulative ACK. 1212 if sb.Start.LessThan(rcvdSeg.ackNumber) { 1213 return true 1214 } 1215 1216 if n > 1 { 1217 sb1 := rcvdSeg.parsedOptions.SACKBlocks[1] 1218 if sb1.End.LessThan(sb1.Start) { 1219 return false 1220 } 1221 1222 // If the first SACK block is fully covered by second SACK 1223 // block, then the first block is a DSACK block. 1224 if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) { 1225 return true 1226 } 1227 } 1228 1229 return false 1230 } 1231 1232 // handleRcvdSegment is called when a segment is received; it is responsible for 1233 // updating the send-related state. 1234 func (s *sender) handleRcvdSegment(rcvdSeg *segment) { 1235 // Check if we can extract an RTT measurement from this ack. 1236 if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) { 1237 s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime)) 1238 s.RTTMeasureSeqNum = s.SndNxt 1239 } 1240 1241 // Update Timestamp if required. See RFC7323, section-4.3. 1242 if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS { 1243 s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber) 1244 } 1245 1246 // Insert SACKBlock information into our scoreboard. 1247 if s.ep.SACKPermitted { 1248 for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { 1249 // Only insert the SACK block if the following holds 1250 // true: 1251 // * SACK block acks data after the ack number in the 1252 // current segment. 1253 // * SACK block represents a sequence 1254 // between sndUna and sndNxt (i.e. data that is 1255 // currently unacked and in-flight). 1256 // * SACK block that has not been SACKed already. 1257 // 1258 // NOTE: This check specifically excludes DSACK blocks 1259 // which have start/end before sndUna and are used to 1260 // indicate spurious retransmissions. 
1261 if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) { 1262 s.ep.scoreboard.Insert(sb) 1263 rcvdSeg.hasNewSACKInfo = true 1264 } 1265 } 1266 1267 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08 1268 // section-7.2 1269 // * Step 2: Update RACK stats. 1270 // If the ACK is not ignored as invalid, update the RACK.rtt 1271 // to be the RTT sample calculated using this ACK, and 1272 // continue. If this ACK or SACK was for the most recently 1273 // sent packet, then record the RACK.xmit_ts timestamp and 1274 // RACK.end_seq sequence implied by this ACK. 1275 // * Step 3: Detect packet reordering. 1276 // If the ACK selectively or cumulatively acknowledges an 1277 // unacknowledged and also never retransmitted sequence below 1278 // RACK.fack, then the corresponding packet has been 1279 // reordered and RACK.reord is set to TRUE. 1280 if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1281 s.walkSACK(rcvdSeg) 1282 } 1283 s.SetPipe() 1284 } 1285 1286 ack := rcvdSeg.ackNumber 1287 fastRetransmit := false 1288 // Do not leave fast recovery, if the ACK is out of range. 1289 if s.FastRecovery.Active { 1290 // Leave fast recovery if it acknowledges all the data covered by 1291 // this fast recovery session. 1292 if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) { 1293 s.leaveRecovery() 1294 } 1295 } else { 1296 // Detect loss by counting the duplicates and enter recovery. 1297 fastRetransmit = s.detectLoss(rcvdSeg) 1298 } 1299 1300 // See if TLP based recovery was successful. 1301 if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1302 s.detectTLPRecovery(ack, rcvdSeg) 1303 } 1304 1305 // Stash away the current window size. 1306 s.SndWnd = rcvdSeg.window 1307 1308 // Disable zero window probing if remote advertizes a non-zero receive 1309 // window. 
This can be with an ACK to the zero window probe (where the 1310 // acknumber refers to the already acknowledged byte) OR to any previously 1311 // unacknowledged segment. 1312 if s.zeroWindowProbing && rcvdSeg.window > 0 && 1313 (ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) { 1314 s.disableZeroWindowProbing() 1315 } 1316 1317 // On receiving the ACK for the zero window probe, account for it and 1318 // skip trying to send any segment as we are still probing for 1319 // receive window to become non-zero. 1320 if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna { 1321 s.unackZeroWindowProbes-- 1322 return 1323 } 1324 1325 // Ignore ack if it doesn't acknowledge any new data. 1326 if (ack - 1).InRange(s.SndUna, s.SndNxt) { 1327 s.DupAckCount = 0 1328 1329 // See : https://tools.ietf.org/html/rfc1323#section-3.3. 1330 // Specifically we should only update the RTO using TSEcr if the 1331 // following condition holds: 1332 // 1333 // A TSecr value received in a segment is used to update the 1334 // averaged RTT measurement only if the segment acknowledges 1335 // some new data, i.e., only if it advances the left edge of 1336 // the send window. 1337 if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { 1338 // TSVal/Ecr values sent by Netstack are at a millisecond 1339 // granularity. 1340 elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond 1341 s.updateRTO(elapsed) 1342 } 1343 1344 if s.shouldSchedulePTO() { 1345 // Schedule PTO upon receiving an ACK that cumulatively acknowledges data. 1346 // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. 1347 s.schedulePTO() 1348 } else { 1349 // When an ack is received we must rearm the timer. 1350 // RFC 6298 5.3 1351 s.probeTimer.disable() 1352 s.resendTimer.enable(s.RTO) 1353 } 1354 1355 // Remove all acknowledged data from the write list. 
1356 acked := s.SndUna.Size(ack) 1357 s.SndUna = ack 1358 1359 // The remote ACK-ing at least 1 byte is an indication that we have a 1360 // full-duplex connection to the remote as the only way we will receive an 1361 // ACK is if the remote received data that we previously sent. 1362 // 1363 // As of writing, linux seems to only confirm a route as reachable when 1364 // forward progress is made which is indicated by an ACK that removes data 1365 // from the retransmit queue. 1366 if acked > 0 { 1367 s.ep.route.ConfirmReachable() 1368 } 1369 1370 ackLeft := acked 1371 originalOutstanding := s.Outstanding 1372 for ackLeft > 0 { 1373 // We use logicalLen here because we can have FIN 1374 // segments (which are always at the end of list) that 1375 // have no data, but do consume a sequence number. 1376 seg := s.writeList.Front() 1377 datalen := seg.logicalLen() 1378 1379 if datalen > ackLeft { 1380 prevCount := s.pCount(seg, s.MaxPayloadSize) 1381 seg.data.TrimFront(int(ackLeft)) 1382 seg.sequenceNumber.UpdateForward(ackLeft) 1383 s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize) 1384 break 1385 } 1386 1387 if s.writeNext == seg { 1388 s.writeNext = seg.Next() 1389 } 1390 1391 // Update the RACK fields if SACK is enabled. 1392 if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1393 s.rc.update(seg, rcvdSeg) 1394 s.rc.detectReorder(seg) 1395 } 1396 1397 s.writeList.Remove(seg) 1398 1399 // If SACK is enabled then only reduce outstanding if 1400 // the segment was not previously SACKED as these have 1401 // already been accounted for in SetPipe(). 1402 if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 1403 s.Outstanding -= s.pCount(seg, s.MaxPayloadSize) 1404 } else { 1405 s.SackedOut -= s.pCount(seg, s.MaxPayloadSize) 1406 } 1407 seg.decRef() 1408 ackLeft -= datalen 1409 } 1410 1411 // Update the send buffer usage and notify potential waiters. 
1412 s.ep.updateSndBufferUsage(int(acked)) 1413 1414 // Clear SACK information for all acked data. 1415 s.ep.scoreboard.Delete(s.SndUna) 1416 1417 // If we are not in fast recovery then update the congestion 1418 // window based on the number of acknowledged packets. 1419 if !s.FastRecovery.Active { 1420 s.cc.Update(originalOutstanding - s.Outstanding) 1421 if s.FastRecovery.Last.LessThan(s.SndUna) { 1422 s.state = tcpip.Open 1423 // Update RACK when we are exiting fast or RTO 1424 // recovery as described in the RFC 1425 // draft-ietf-tcpm-rack-08 Section-7.2 Step 4. 1426 if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1427 s.rc.exitRecovery() 1428 } 1429 s.reorderTimer.disable() 1430 } 1431 } 1432 1433 // It is possible for s.outstanding to drop below zero if we get 1434 // a retransmit timeout, reset outstanding to zero but later 1435 // get an ack that cover previously sent data. 1436 if s.Outstanding < 0 { 1437 s.Outstanding = 0 1438 } 1439 1440 s.SetPipe() 1441 1442 // If all outstanding data was acknowledged the disable the timer. 1443 // RFC 6298 Rule 5.3 1444 if s.SndUna == s.SndNxt { 1445 s.Outstanding = 0 1446 // Reset firstRetransmittedSegXmitTime to the zero value. 1447 s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{} 1448 s.resendTimer.disable() 1449 s.probeTimer.disable() 1450 } 1451 } 1452 1453 if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1454 // Update RACK reorder window. 1455 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 1456 // * Upon receiving an ACK: 1457 // * Step 4: Update RACK reordering window 1458 s.rc.updateRACKReorderWindow() 1459 1460 // After the reorder window is calculated, detect any loss by checking 1461 // if the time elapsed after the segments are sent is greater than the 1462 // reorder window. 
1463 if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active { 1464 // If any segment is marked as lost by 1465 // RACK, enter recovery and retransmit 1466 // the lost segments. 1467 s.cc.HandleLossDetected() 1468 s.enterRecovery() 1469 fastRetransmit = true 1470 } 1471 1472 if s.FastRecovery.Active { 1473 s.rc.DoRecovery(nil, fastRetransmit) 1474 } 1475 } 1476 1477 // Now that we've popped all acknowledged data from the retransmit 1478 // queue, retransmit if needed. 1479 if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 { 1480 s.lr.DoRecovery(rcvdSeg, fastRetransmit) 1481 // When SACK is enabled data sending is governed by steps in 1482 // RFC 6675 Section 5 recovery steps A-C. 1483 // See: https://tools.ietf.org/html/rfc6675#section-5. 1484 if s.ep.SACKPermitted { 1485 return 1486 } 1487 } 1488 1489 // Send more data now that some of the pending data has been ack'd, or 1490 // that the window opened up, or the congestion window was inflated due 1491 // to a duplicate ack during fast recovery. This will also re-enable 1492 // the retransmit timer if needed. 1493 s.sendData() 1494 } 1495 1496 // sendSegment sends the specified segment. 1497 func (s *sender) sendSegment(seg *segment) tcpip.Error { 1498 if seg.xmitCount > 0 { 1499 s.ep.stack.Stats().TCP.Retransmits.Increment() 1500 s.ep.stats.SendErrors.Retransmits.Increment() 1501 if s.SndCwnd < s.Ssthresh { 1502 s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() 1503 } 1504 } 1505 seg.xmitTime = s.ep.stack.Clock().NowMonotonic() 1506 seg.xmitCount++ 1507 seg.lost = false 1508 err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) 1509 1510 // Every time a packet containing data is sent (including a 1511 // retransmission), if SACK is enabled and we are retransmitting data 1512 // then use the conservative timer described in RFC6675 Section 6.0, 1513 // otherwise follow the standard time described in RFC6298 Section 5.1. 
1514 if err != nil && seg.data.Size() != 0 { 1515 if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted { 1516 s.resendTimer.enable(s.RTO) 1517 } else { 1518 if !s.resendTimer.enabled() { 1519 s.resendTimer.enable(s.RTO) 1520 } 1521 } 1522 } 1523 1524 return err 1525 } 1526 1527 // sendSegmentFromView sends a new segment containing the given payload, flags 1528 // and sequence number. 1529 func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags header.TCPFlags, seq seqnum.Value) tcpip.Error { 1530 s.LastSendTime = s.ep.stack.Clock().NowMonotonic() 1531 if seq == s.RTTMeasureSeqNum { 1532 s.RTTMeasureTime = s.LastSendTime 1533 } 1534 1535 rcvNxt, rcvWnd := s.ep.rcv.getSendParams() 1536 1537 // Remember the max sent ack. 1538 s.MaxSentAck = rcvNxt 1539 1540 return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) 1541 } 1542 1543 // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited 1544 // currently. 1545 func (s *sender) maybeSendOutOfWindowAck(seg *segment) { 1546 // Data packets are unlikely to be part of an ACK loop. So always send 1547 // an ACK for a packet w/ data. 1548 if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { 1549 s.sendAck() 1550 } 1551 }