inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/snd.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "fmt" 19 "math" 20 "sort" 21 "time" 22 23 "inet.af/netstack/sleep" 24 "inet.af/netstack/sync" 25 "inet.af/netstack/tcpip" 26 "inet.af/netstack/tcpip/buffer" 27 "inet.af/netstack/tcpip/header" 28 "inet.af/netstack/tcpip/seqnum" 29 "inet.af/netstack/tcpip/stack" 30 ) 31 32 const ( 33 // MinRTO is the minimum allowed value for the retransmit timeout. 34 MinRTO = 200 * time.Millisecond 35 36 // MaxRTO is the maximum allowed value for the retransmit timeout. 37 MaxRTO = 120 * time.Second 38 39 // InitialCwnd is the initial congestion window. 40 InitialCwnd = 10 41 42 // nDupAckThreshold is the number of duplicate ACK's required 43 // before fast-retransmit is entered. 44 nDupAckThreshold = 3 45 46 // MaxRetries is the maximum number of probe retries sender does 47 // before timing out the connection. 48 // Linux default TCP_RETR2, net.ipv4.tcp_retries2. 49 MaxRetries = 15 50 ) 51 52 // congestionControl is an interface that must be implemented by any supported 53 // congestion control algorithm. 54 type congestionControl interface { 55 // HandleLossDetected is invoked when the loss is detected by RACK or 56 // sender.dupAckCount >= nDupAckThreshold just before entering fast 57 // retransmit. 
58 HandleLossDetected() 59 60 // HandleRTOExpired is invoked when the retransmit timer expires. 61 HandleRTOExpired() 62 63 // Update is invoked when processing inbound acks. It's passed the 64 // number of packet's that were acked by the most recent cumulative 65 // acknowledgement. 66 Update(packetsAcked int) 67 68 // PostRecovery is invoked when the sender is exiting a fast retransmit/ 69 // recovery phase. This provides congestion control algorithms a way 70 // to adjust their state when exiting recovery. 71 PostRecovery() 72 } 73 74 // lossRecovery is an interface that must be implemented by any supported 75 // loss recovery algorithm. 76 type lossRecovery interface { 77 // DoRecovery is invoked when loss is detected and segments need 78 // to be retransmitted. The cumulative or selective ACK is passed along 79 // with the flag which identifies whether the connection entered fast 80 // retransmit with this ACK and to retransmit the first unacknowledged 81 // segment. 82 DoRecovery(rcvdSeg *segment, fastRetransmit bool) 83 } 84 85 // sender holds the state necessary to send TCP segments. 86 // 87 // +stateify savable 88 type sender struct { 89 stack.TCPSenderState 90 ep *endpoint 91 92 // lr is the loss recovery algorithm used by the sender. 93 lr lossRecovery 94 95 // firstRetransmittedSegXmitTime is the original transmit time of 96 // the first segment that was retransmitted due to RTO expiration. 97 firstRetransmittedSegXmitTime tcpip.MonotonicTime 98 99 // zeroWindowProbing is set if the sender is currently probing 100 // for zero receive window. 101 zeroWindowProbing bool `state:"nosave"` 102 103 // unackZeroWindowProbes is the number of unacknowledged zero 104 // window probes. 
105 unackZeroWindowProbes uint32 `state:"nosave"` 106 107 writeNext *segment 108 writeList segmentList 109 resendTimer timer `state:"nosave"` 110 resendWaker sleep.Waker `state:"nosave"` 111 112 // rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed 113 // round-trip time", and "round-trip time variation", as defined in 114 // section 2 of RFC 6298. 115 rtt rtt 116 117 // minRTO is the minimum permitted value for sender.rto. 118 minRTO time.Duration 119 120 // maxRTO is the maximum permitted value for sender.rto. 121 maxRTO time.Duration 122 123 // maxRetries is the maximum permitted retransmissions. 124 maxRetries uint32 125 126 // gso is set if generic segmentation offload is enabled. 127 gso bool 128 129 // state is the current state of congestion control for this endpoint. 130 state tcpip.CongestionControlState 131 132 // cc is the congestion control algorithm in use for this sender. 133 cc congestionControl 134 135 // rc has the fields needed for implementing RACK loss detection 136 // algorithm. 137 rc rackControl 138 139 // reorderTimer is the timer used to retransmit the segments after RACK 140 // detects them as lost. 141 reorderTimer timer `state:"nosave"` 142 reorderWaker sleep.Waker `state:"nosave"` 143 144 // probeTimer and probeWaker are used to schedule PTO for RACK TLP algorithm. 145 probeTimer timer `state:"nosave"` 146 probeWaker sleep.Waker `state:"nosave"` 147 148 // spuriousRecovery indicates whether the sender entered recovery 149 // spuriously as described in RFC3522 Section 3.2. 150 spuriousRecovery bool 151 152 // retransmitTS is the timestamp at which the sender sends retransmitted 153 // segment after entering an RTO for the first time as described in 154 // RFC3522 Section 3.2. 155 retransmitTS uint32 156 } 157 158 // rtt is a synchronization wrapper used to appease stateify. See the comment 159 // in sender, where it is used. 
160 // 161 // +stateify savable 162 type rtt struct { 163 sync.Mutex `state:"nosave"` 164 165 stack.TCPRTTState 166 } 167 168 func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { 169 // The sender MUST reduce the TCP data length to account for any IP or 170 // TCP options that it is including in the packets that it sends. 171 // See: https://tools.ietf.org/html/rfc6691#section-2 172 maxPayloadSize := int(mss) - ep.maxOptionSize() 173 174 s := &sender{ 175 ep: ep, 176 TCPSenderState: stack.TCPSenderState{ 177 SndWnd: sndWnd, 178 SndUna: iss + 1, 179 SndNxt: iss + 1, 180 RTTMeasureSeqNum: iss + 1, 181 LastSendTime: ep.stack.Clock().NowMonotonic(), 182 MaxPayloadSize: maxPayloadSize, 183 MaxSentAck: irs + 1, 184 FastRecovery: stack.TCPFastRecoveryState{ 185 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. 186 Last: iss, 187 HighRxt: iss, 188 RescueRxt: iss, 189 }, 190 RTO: 1 * time.Second, 191 }, 192 gso: ep.gso.Type != stack.GSONone, 193 } 194 195 if s.gso { 196 s.ep.gso.MSS = uint16(maxPayloadSize) 197 } 198 199 s.cc = s.initCongestionControl(ep.cc) 200 s.lr = s.initLossRecovery() 201 s.rc.init(s, iss) 202 203 // A negative sndWndScale means that no scaling is in use, otherwise we 204 // store the scaling value. 205 if sndWndScale > 0 { 206 s.SndWndScale = uint8(sndWndScale) 207 } 208 209 s.resendTimer.init(s.ep.stack.Clock(), &s.resendWaker) 210 s.reorderTimer.init(s.ep.stack.Clock(), &s.reorderWaker) 211 s.probeTimer.init(s.ep.stack.Clock(), &s.probeWaker) 212 213 s.updateMaxPayloadSize(int(ep.route.MTU()), 0) 214 215 // Initialize SACK Scoreboard after updating max payload size as we use 216 // the maxPayloadSize as the smss when determining if a segment is lost 217 // etc. 218 s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss) 219 220 // Get Stack wide config. 
221 var minRTO tcpip.TCPMinRTOOption 222 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { 223 panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) 224 } 225 s.minRTO = time.Duration(minRTO) 226 227 var maxRTO tcpip.TCPMaxRTOOption 228 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { 229 panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) 230 } 231 s.maxRTO = time.Duration(maxRTO) 232 233 var maxRetries tcpip.TCPMaxRetriesOption 234 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { 235 panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) 236 } 237 s.maxRetries = uint32(maxRetries) 238 239 return s 240 } 241 242 // initCongestionControl initializes the specified congestion control module and 243 // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to 244 // their initial values. 245 func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { 246 s.SndCwnd = InitialCwnd 247 // Set sndSsthresh to the maximum int value, which depends on the 248 // platform. 249 s.Ssthresh = int(^uint(0) >> 1) 250 251 switch congestionControlName { 252 case ccCubic: 253 return newCubicCC(s) 254 case ccReno: 255 fallthrough 256 default: 257 return newRenoCC(s) 258 } 259 } 260 261 // initLossRecovery initiates the loss recovery algorithm for the sender. 262 func (s *sender) initLossRecovery() lossRecovery { 263 if s.ep.SACKPermitted { 264 return newSACKRecovery(s) 265 } 266 return newRenoRecovery(s) 267 } 268 269 // updateMaxPayloadSize updates the maximum payload size based on the given 270 // MTU. If this is in response to "packet too big" control packets (indicated 271 // by the count argument), it also reduces the number of outstanding packets and 272 // attempts to retransmit the first packet above the MTU size. 
273 func (s *sender) updateMaxPayloadSize(mtu, count int) { 274 m := mtu - header.TCPMinimumSize 275 276 m -= s.ep.maxOptionSize() 277 278 // We don't adjust up for now. 279 if m >= s.MaxPayloadSize { 280 return 281 } 282 283 // Make sure we can transmit at least one byte. 284 if m <= 0 { 285 m = 1 286 } 287 288 oldMSS := s.MaxPayloadSize 289 s.MaxPayloadSize = m 290 if s.gso { 291 s.ep.gso.MSS = uint16(m) 292 } 293 294 if count == 0 { 295 // updateMaxPayloadSize is also called when the sender is created. 296 // and there is no data to send in such cases. Return immediately. 297 return 298 } 299 300 // Update the scoreboard's smss to reflect the new lowered 301 // maxPayloadSize. 302 s.ep.scoreboard.smss = uint16(m) 303 304 s.Outstanding -= count 305 if s.Outstanding < 0 { 306 s.Outstanding = 0 307 } 308 309 // Rewind writeNext to the first segment exceeding the MTU. Do nothing 310 // if it is already before such a packet. 311 nextSeg := s.writeNext 312 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 313 if seg == s.writeNext { 314 // We got to writeNext before we could find a segment 315 // exceeding the MTU. 316 break 317 } 318 319 if nextSeg == s.writeNext && seg.data.Size() > m { 320 // We found a segment exceeding the MTU. Rewind 321 // writeNext and try to retransmit it. 322 nextSeg = seg 323 } 324 325 if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 326 // Update sackedOut for new maximum payload size. 327 s.SackedOut -= s.pCount(seg, oldMSS) 328 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 329 } 330 } 331 332 // Since we likely reduced the number of outstanding packets, we may be 333 // ready to send some more. 334 s.writeNext = nextSeg 335 s.sendData() 336 } 337 338 // sendAck sends an ACK segment. 339 func (s *sender) sendAck() { 340 s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.SndNxt) 341 } 342 343 // updateRTO updates the retransmit timeout when a new roud-trip time is 344 // available. 
This is done in accordance with section 2 of RFC 6298. 345 func (s *sender) updateRTO(rtt time.Duration) { 346 s.rtt.Lock() 347 if !s.rtt.TCPRTTState.SRTTInited { 348 s.rtt.TCPRTTState.RTTVar = rtt / 2 349 s.rtt.TCPRTTState.SRTT = rtt 350 s.rtt.TCPRTTState.SRTTInited = true 351 } else { 352 diff := s.rtt.TCPRTTState.SRTT - rtt 353 if diff < 0 { 354 diff = -diff 355 } 356 // Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when 357 // no timestamps are available. 358 if !s.ep.SendTSOk { 359 s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4 360 s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8 361 } else { 362 // When we are taking RTT measurements of every ACK then 363 // we need to use a modified method as specified in 364 // https://tools.ietf.org/html/rfc7323#appendix-G 365 if s.Outstanding == 0 { 366 s.rtt.Unlock() 367 return 368 } 369 // Netstack measures congestion window/inflight all in 370 // terms of packets and not bytes. This is similar to 371 // how linux also does cwnd and inflight. In practice 372 // this approximation works as expected. 373 expectedSamples := math.Ceil(float64(s.Outstanding) / 2) 374 375 // alpha & beta values are the original values as recommended in 376 // https://tools.ietf.org/html/rfc6298#section-2.3. 
377 const alpha = 0.125 378 const beta = 0.25 379 380 alphaPrime := alpha / expectedSamples 381 betaPrime := beta / expectedSamples 382 rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds() 383 srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds() 384 s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second)) 385 s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second)) 386 } 387 } 388 389 s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar 390 s.rtt.Unlock() 391 if s.RTO < s.minRTO { 392 s.RTO = s.minRTO 393 } 394 if s.RTO > s.maxRTO { 395 s.RTO = s.maxRTO 396 } 397 } 398 399 // resendSegment resends the first unacknowledged segment. 400 func (s *sender) resendSegment() { 401 // Don't use any segments we already sent to measure RTT as they may 402 // have been affected by packets being lost. 403 s.RTTMeasureSeqNum = s.SndNxt 404 405 // Resend the segment. 406 if seg := s.writeList.Front(); seg != nil { 407 if seg.data.Size() > s.MaxPayloadSize { 408 s.splitSeg(seg, s.MaxPayloadSize) 409 } 410 411 // See: RFC 6675 section 5 Step 4.3 412 // 413 // To prevent retransmission, set both the HighRXT and RescueRXT 414 // to the highest sequence number in the retransmitted segment. 415 s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 416 s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 417 s.sendSegment(seg) 418 s.ep.stack.Stats().TCP.FastRetransmit.Increment() 419 s.ep.stats.SendErrors.FastRetransmit.Increment() 420 421 // Run SetPipe() as per RFC 6675 section 5 Step 4.4 422 s.SetPipe() 423 } 424 } 425 426 // retransmitTimerExpired is called when the retransmit timer expires, and 427 // unacknowledged segments are assumed lost, and thus need to be resent. 428 // Returns true if the connection is still usable, or false if the connection 429 // is deemed lost. 
430 func (s *sender) retransmitTimerExpired() bool { 431 // Check if the timer actually expired or if it's a spurious wake due 432 // to a previously orphaned runtime timer. 433 if !s.resendTimer.checkExpiration() { 434 return true 435 } 436 437 // Initialize the variables used to detect spurious recovery after 438 // entering RTO. 439 // 440 // See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1. 441 s.spuriousRecovery = false 442 s.retransmitTS = 0 443 444 // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases 445 // when writeList is empty. Remove this once we have a proper fix for this 446 // issue. 447 if s.writeList.Front() == nil { 448 return true 449 } 450 451 s.ep.stack.Stats().TCP.Timeouts.Increment() 452 s.ep.stats.SendErrors.Timeouts.Increment() 453 454 // Set TLPRxtOut to false according to 455 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. 456 s.rc.tlpRxtOut = false 457 458 // Give up if we've waited more than a minute since the last resend or 459 // if a user time out is set and we have exceeded the user specified 460 // timeout since the first retransmission. 461 uto := s.ep.userTimeout 462 463 if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { 464 // We store the original xmitTime of the segment that we are 465 // about to retransmit as the retransmission time. This is 466 // required as by the time the retransmitTimer has expired the 467 // segment has already been sent and unacked for the RTO at the 468 // time the segment was sent. 469 s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime 470 } 471 472 elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime) 473 remaining := s.maxRTO 474 if uto != 0 { 475 // Cap to the user specified timeout if one is specified. 476 remaining = uto - elapsed 477 } 478 479 // Always honor the user-timeout irrespective of whether the zero 480 // window probes were acknowledged. 
481 // net/ipv4/tcp_timer.c::tcp_probe_timer() 482 if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { 483 return false 484 } 485 486 // Set new timeout. The timer will be restarted by the call to sendData 487 // below. 488 s.RTO *= 2 489 // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 490 if s.RTO > s.maxRTO { 491 s.RTO = s.maxRTO 492 } 493 494 // Cap RTO to remaining time. 495 if s.RTO > remaining { 496 s.RTO = remaining 497 } 498 499 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. 500 // 501 // Retransmit timeouts: 502 // After a retransmit timeout, record the highest sequence number 503 // transmitted in the variable recover, and exit the fast recovery 504 // procedure if applicable. 505 s.FastRecovery.Last = s.SndNxt - 1 506 507 if s.FastRecovery.Active { 508 // We were attempting fast recovery but were not successful. 509 // Leave the state. We don't need to update ssthresh because it 510 // has already been updated when entered fast-recovery. 511 s.leaveRecovery() 512 } 513 514 // Record retransmitTS if the sender is not in recovery as per: 515 // https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 516 s.recordRetransmitTS() 517 518 s.state = tcpip.RTORecovery 519 s.cc.HandleRTOExpired() 520 521 // Mark the next segment to be sent as the first unacknowledged one and 522 // start sending again. Set the number of outstanding packets to 0 so 523 // that we'll be able to retransmit. 524 // 525 // We'll keep on transmitting (or retransmitting) as we get acks for 526 // the data we transmit. 527 s.Outstanding = 0 528 529 // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 530 // 531 // In order to avoid memory deadlocks, the TCP receiver is allowed to 532 // discard data that has already been selectively acknowledged. 
As a 533 // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK 534 // information gathered from a receiver upon a retransmission timeout 535 // (RTO) "since the timeout might indicate that the data receiver has 536 // reneged." Additionally, a TCP sender MUST "ignore prior SACK 537 // information in determining which data to retransmit." 538 // 539 // NOTE: We take the stricter interpretation and just expunge all 540 // information as we lack more rigorous checks to validate if the SACK 541 // information is usable after an RTO. 542 s.ep.scoreboard.Reset() 543 s.writeNext = s.writeList.Front() 544 545 // RFC 1122 4.2.2.17: Start sending zero window probes when we still see a 546 // zero receive window after retransmission interval and we have data to 547 // send. 548 if s.zeroWindowProbing { 549 s.sendZeroWindowProbe() 550 // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed 551 // indefinitely. As long as the receiving TCP continues to send 552 // acknowledgments in response to the probe segments, the sending TCP 553 // MUST allow the connection to stay open. 554 return true 555 } 556 557 seg := s.writeNext 558 // RFC 1122 4.2.3.5: Close the connection when the number of 559 // retransmissions for this segment is beyond a limit. 560 if seg != nil && seg.xmitCount > s.maxRetries { 561 return false 562 } 563 564 s.sendData() 565 566 return true 567 } 568 569 // pCount returns the number of packets in the segment. Due to GSO, a segment 570 // can be composed of multiple packets. 571 func (s *sender) pCount(seg *segment, maxPayloadSize int) int { 572 size := seg.data.Size() 573 if size == 0 { 574 return 1 575 } 576 577 return (size-1)/maxPayloadSize + 1 578 } 579 580 // splitSeg splits a given segment at the size specified and inserts the 581 // remainder as a new segment after the current one in the write list. 
582 func (s *sender) splitSeg(seg *segment, size int) { 583 if seg.data.Size() <= size { 584 return 585 } 586 // Split this segment up. 587 nSeg := seg.clone() 588 nSeg.data.TrimFront(size) 589 nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) 590 s.writeList.InsertAfter(seg, nSeg) 591 592 // The segment being split does not carry PUSH flag because it is 593 // followed by the newly split segment. 594 // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered 595 // segment (i.e., when there is no more queued data to be sent). 596 // Linux removes PSH flag only when the segment is being split over MSS 597 // and retains it when we are splitting the segment over lack of sender 598 // window space. 599 // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() 600 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() 601 if seg.data.Size() > s.MaxPayloadSize { 602 seg.flags ^= header.TCPFlagPsh 603 } 604 605 seg.data.CapLength(size) 606 } 607 608 // NextSeg implements the RFC6675 NextSeg() operation. 609 // 610 // NextSeg starts scanning the writeList starting from nextSegHint and returns 611 // the hint to be passed on the next call to NextSeg. This is required to avoid 612 // iterating the write list repeatedly when NextSeg is invoked in a loop during 613 // recovery. The returned hint will be nil if there are no more segments that 614 // can match rules defined by NextSeg operation in RFC6675. 615 // 616 // rescueRtx will be true only if nextSeg is a rescue retransmission as 617 // described by Step 4) of the NextSeg algorithm. 618 func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) { 619 var s3 *segment 620 var s4 *segment 621 // Step 1. 622 for seg := nextSegHint; seg != nil; seg = seg.Next() { 623 // Stop iteration if we hit a segment that has never been 624 // transmitted (i.e. 
either it has no assigned sequence number 625 // or if it does have one, it's >= the next sequence number 626 // to be sent [i.e. >= s.sndNxt]). 627 if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) { 628 hint = nil 629 break 630 } 631 segSeq := seg.sequenceNumber 632 if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) { 633 s.splitSeg(seg, int(smss)) 634 } 635 636 // See RFC 6675 Section 4 637 // 638 // 1. If there exists a smallest unSACKED sequence number 639 // 'S2' that meets the following 3 criteria for determinig 640 // loss, the sequence range of one segment of up to SMSS 641 // octects starting with S2 MUST be returned. 642 if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) { 643 // NextSeg(): 644 // 645 // (1.a) S2 is greater than HighRxt 646 // (1.b) S2 is less than highest octect covered by 647 // any received SACK. 648 if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) { 649 // NextSeg(): 650 // (1.c) IsLost(S2) returns true. 651 if s.ep.scoreboard.IsLost(segSeq) { 652 return seg, seg.Next(), false 653 } 654 655 // NextSeg(): 656 // 657 // (3): If the conditions for rules (1) and (2) 658 // fail, but there exists an unSACKed sequence 659 // number S3 that meets the criteria for 660 // detecting loss given in steps 1.a and 1.b 661 // above (specifically excluding (1.c)) then one 662 // segment of upto SMSS octets starting with S3 663 // SHOULD be returned. 664 if s3 == nil { 665 s3 = seg 666 hint = seg.Next() 667 } 668 } 669 // NextSeg(): 670 // 671 // (4) If the conditions for (1), (2) and (3) fail, 672 // but there exists outstanding unSACKED data, we 673 // provide the opportunity for a single "rescue" 674 // retransmission per entry into loss recovery. 
If 675 // HighACK is greater than RescueRxt (or RescueRxt 676 // is undefined), then one segment of upto SMSS 677 // octects that MUST include the highest outstanding 678 // unSACKed sequence number SHOULD be returned, and 679 // RescueRxt set to RecoveryPoint. HighRxt MUST NOT 680 // be updated. 681 if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) { 682 if s4 != nil { 683 if s4.sequenceNumber.LessThan(segSeq) { 684 s4 = seg 685 } 686 } else { 687 s4 = seg 688 } 689 } 690 } 691 } 692 693 // If we got here then no segment matched step (1). 694 // Step (2): "If no sequence number 'S2' per rule (1) 695 // exists but there exists available unsent data and the 696 // receiver's advertised window allows, the sequence 697 // range of one segment of up to SMSS octets of 698 // previously unsent data starting with sequence number 699 // HighData+1 MUST be returned." 700 for seg := s.writeNext; seg != nil; seg = seg.Next() { 701 if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) { 702 continue 703 } 704 // We do not split the segment here to <= smss as it has 705 // potentially not been assigned a sequence number yet. 706 return seg, nil, false 707 } 708 709 if s3 != nil { 710 return s3, hint, false 711 } 712 713 return s4, nil, true 714 } 715 716 // maybeSendSegment tries to send the specified segment and either coalesces 717 // other segments into this one or splits the specified segment based on the 718 // lower of the specified limit value or the receivers window size specified by 719 // end. 720 func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) { 721 // We abuse the flags field to determine if we have already 722 // assigned a sequence number to this segment. 723 if !s.isAssignedSequenceNumber(seg) { 724 // Merge segments if allowed. 
725 if seg.data.Size() != 0 { 726 available := int(s.SndNxt.Size(end)) 727 if available > limit { 728 available = limit 729 } 730 731 // nextTooBig indicates that the next segment was too 732 // large to entirely fit in the current segment. It 733 // would be possible to split the next segment and merge 734 // the portion that fits, but unexpectedly splitting 735 // segments can have user visible side-effects which can 736 // break applications. For example, RFC 7766 section 8 737 // says that the length and data of a DNS response 738 // should be sent in the same TCP segment to avoid 739 // triggering bugs in poorly written DNS 740 // implementations. 741 var nextTooBig bool 742 for nSeg := seg.Next(); nSeg != nil && nSeg.data.Size() != 0; nSeg = seg.Next() { 743 if seg.data.Size()+nSeg.data.Size() > available { 744 nextTooBig = true 745 break 746 } 747 seg.merge(nSeg) 748 s.writeList.Remove(nSeg) 749 nSeg.decRef() 750 } 751 if !nextTooBig && seg.data.Size() < available { 752 // Segment is not full. 753 if s.Outstanding > 0 && s.ep.ops.GetDelayOption() { 754 // Nagle's algorithm. From Wikipedia: 755 // Nagle's algorithm works by 756 // combining a number of small 757 // outgoing messages and sending them 758 // all at once. Specifically, as long 759 // as there is a sent packet for which 760 // the sender has received no 761 // acknowledgment, the sender should 762 // keep buffering its output until it 763 // has a full packet's worth of 764 // output, thus allowing output to be 765 // sent all at once. 766 return false 767 } 768 // With TCP_CORK, hold back until minimum of the available 769 // send space and MSS. 770 // TODO(gvisor.dev/issue/2833): Drain the held segments after a 771 // timeout. 772 if seg.data.Size() < s.MaxPayloadSize && s.ep.ops.GetCorkOption() { 773 return false 774 } 775 } 776 } 777 778 // Assign flags. We don't do it above so that we can merge 779 // additional data if Nagle holds the segment. 
780 seg.sequenceNumber = s.SndNxt 781 seg.flags = header.TCPFlagAck | header.TCPFlagPsh 782 } 783 784 var segEnd seqnum.Value 785 if seg.data.Size() == 0 { 786 if s.writeList.Back() != seg { 787 panic("FIN segments must be the final segment in the write list.") 788 } 789 seg.flags = header.TCPFlagAck | header.TCPFlagFin 790 segEnd = seg.sequenceNumber.Add(1) 791 // Update the state to reflect that we have now 792 // queued a FIN. 793 switch s.ep.EndpointState() { 794 case StateCloseWait: 795 s.ep.setEndpointState(StateLastAck) 796 default: 797 s.ep.setEndpointState(StateFinWait1) 798 } 799 } else { 800 // We're sending a non-FIN segment. 801 if seg.flags&header.TCPFlagFin != 0 { 802 panic("Netstack queues FIN segments without data.") 803 } 804 805 if !seg.sequenceNumber.LessThan(end) { 806 return false 807 } 808 809 available := int(seg.sequenceNumber.Size(end)) 810 if available == 0 { 811 return false 812 } 813 814 // If the whole segment or at least 1MSS sized segment cannot 815 // be accomodated in the receiver advertized window, skip 816 // splitting and sending of the segment. ref: 817 // net/ipv4/tcp_output.c::tcp_snd_wnd_test() 818 // 819 // Linux checks this for all segment transmits not triggered by 820 // a probe timer. On this condition, it defers the segment split 821 // and transmit to a short probe timer. 822 // 823 // ref: include/net/tcp.h::tcp_check_probe_timer() 824 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() 825 // 826 // Instead of defining a new transmit timer, we attempt to split 827 // the segment right here if there are no pending segments. If 828 // there are pending segments, segment transmits are deferred to 829 // the retransmit timer handler. 830 if s.SndUna != s.SndNxt { 831 switch { 832 case available >= seg.data.Size(): 833 // OK to send, the whole segments fits in the 834 // receiver's advertised window. 
835 case available >= s.MaxPayloadSize: 836 // OK to send, at least 1 MSS sized segment fits 837 // in the receiver's advertised window. 838 default: 839 return false 840 } 841 } 842 843 // The segment size limit is computed as a function of sender 844 // congestion window and MSS. When sender congestion window is > 845 // 1, this limit can be larger than MSS. Ensure that the 846 // currently available send space is not greater than minimum of 847 // this limit and MSS. 848 if available > limit { 849 available = limit 850 } 851 852 // If GSO is not in use then cap available to 853 // maxPayloadSize. When GSO is in use the gVisor GSO logic or 854 // the host GSO logic will cap the segment to the correct size. 855 if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize { 856 available = s.MaxPayloadSize 857 } 858 859 if seg.data.Size() > available { 860 s.splitSeg(seg, available) 861 } 862 863 segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) 864 } 865 866 s.sendSegment(seg) 867 868 // Update sndNxt if we actually sent new data (as opposed to 869 // retransmitting some previously sent data). 870 if s.SndNxt.LessThan(segEnd) { 871 s.SndNxt = segEnd 872 } 873 874 return true 875 } 876 877 func (s *sender) sendZeroWindowProbe() { 878 ack, win := s.ep.rcv.getSendParams() 879 s.unackZeroWindowProbes++ 880 // Send a zero window probe with sequence number pointing to 881 // the last acknowledged byte. 882 s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.SndUna-1, ack, win) 883 // Rearm the timer to continue probing. 884 s.resendTimer.enable(s.RTO) 885 } 886 887 func (s *sender) enableZeroWindowProbing() { 888 s.zeroWindowProbing = true 889 // We piggyback the probing on the retransmit timer with the 890 // current retranmission interval, as we may start probing while 891 // segment retransmissions. 
892 if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { 893 s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic() 894 } 895 s.resendTimer.enable(s.RTO) 896 } 897 898 func (s *sender) disableZeroWindowProbing() { 899 s.zeroWindowProbing = false 900 s.unackZeroWindowProbes = 0 901 s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{} 902 s.resendTimer.disable() 903 } 904 905 func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) { 906 if dataSent { 907 // We sent data, so we should stop the keepalive timer to ensure 908 // that no keepalives are sent while there is pending data. 909 s.ep.disableKeepaliveTimer() 910 } 911 912 // If the sender has advertized zero receive window and we have 913 // data to be sent out, start zero window probing to query the 914 // the remote for it's receive window size. 915 if s.writeNext != nil && s.SndWnd == 0 { 916 s.enableZeroWindowProbing() 917 } 918 919 // If we have no more pending data, start the keepalive timer. 920 if s.SndUna == s.SndNxt { 921 s.ep.resetKeepaliveTimer(false) 922 } else { 923 // Enable timers if we have pending data. 924 if shouldScheduleProbe && s.shouldSchedulePTO() { 925 // Schedule PTO after transmitting new data that wasn't itself a TLP probe. 926 s.schedulePTO() 927 } else if !s.resendTimer.enabled() { 928 s.probeTimer.disable() 929 if s.Outstanding > 0 { 930 // Enable the resend timer if it's not enabled yet and there is 931 // outstanding data. 932 s.resendTimer.enable(s.RTO) 933 } 934 } 935 } 936 } 937 938 // sendData sends new data segments. It is called when data becomes available or 939 // when the send window opens up. 940 func (s *sender) sendData() { 941 limit := s.MaxPayloadSize 942 if s.gso { 943 limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize) 944 } 945 end := s.SndUna.Add(s.SndWnd) 946 947 // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. 
948 // "A TCP SHOULD set cwnd to no more than RW before beginning 949 // transmission if the TCP has not sent data in the interval exceeding 950 // the retrasmission timeout." 951 if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO { 952 if s.SndCwnd > InitialCwnd { 953 s.SndCwnd = InitialCwnd 954 } 955 } 956 957 var dataSent bool 958 for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() { 959 cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize 960 if cwndLimit < limit { 961 limit = cwndLimit 962 } 963 if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 964 // Move writeNext along so that we don't try and scan data that 965 // has already been SACKED. 966 s.writeNext = seg.Next() 967 continue 968 } 969 if sent := s.maybeSendSegment(seg, limit, end); !sent { 970 break 971 } 972 dataSent = true 973 s.Outstanding += s.pCount(seg, s.MaxPayloadSize) 974 s.writeNext = seg.Next() 975 } 976 977 s.postXmit(dataSent, true /* shouldScheduleProbe */) 978 } 979 980 func (s *sender) enterRecovery() { 981 // Initialize the variables used to detect spurious recovery after 982 // entering recovery. 983 // 984 // See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1. 985 s.spuriousRecovery = false 986 s.retransmitTS = 0 987 988 s.FastRecovery.Active = true 989 // Save state to reflect we're now in fast recovery. 990 // 991 // See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3. 992 // We inflate the cwnd by 3 to account for the 3 packets which triggered 993 // the 3 duplicate ACKs and are now not in flight. 
994 s.SndCwnd = s.Ssthresh + 3 995 s.SackedOut = 0 996 s.DupAckCount = 0 997 s.FastRecovery.First = s.SndUna 998 s.FastRecovery.Last = s.SndNxt - 1 999 s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding 1000 s.FastRecovery.HighRxt = s.SndUna 1001 s.FastRecovery.RescueRxt = s.SndUna 1002 1003 // Record retransmitTS if the sender is not in recovery as per: 1004 // https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 1005 s.recordRetransmitTS() 1006 1007 if s.ep.SACKPermitted { 1008 s.state = tcpip.SACKRecovery 1009 s.ep.stack.Stats().TCP.SACKRecovery.Increment() 1010 // Set TLPRxtOut to false according to 1011 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. 1012 if s.rc.tlpRxtOut { 1013 // The tail loss probe triggered recovery. 1014 s.ep.stack.Stats().TCP.TLPRecovery.Increment() 1015 } 1016 s.rc.tlpRxtOut = false 1017 return 1018 } 1019 s.state = tcpip.FastRecovery 1020 s.ep.stack.Stats().TCP.FastRecovery.Increment() 1021 } 1022 1023 func (s *sender) leaveRecovery() { 1024 s.FastRecovery.Active = false 1025 s.FastRecovery.MaxCwnd = 0 1026 s.DupAckCount = 0 1027 1028 // Deflate cwnd. It had been artificially inflated when new dups arrived. 1029 s.SndCwnd = s.Ssthresh 1030 s.cc.PostRecovery() 1031 } 1032 1033 // isAssignedSequenceNumber relies on the fact that we only set flags once a 1034 // sequencenumber is assigned and that is only done right before we send the 1035 // segment. As a result any segment that has a non-zero flag has a valid 1036 // sequence number assigned to it. 1037 func (s *sender) isAssignedSequenceNumber(seg *segment) bool { 1038 return seg.flags != 0 1039 } 1040 1041 // SetPipe implements the SetPipe() function described in RFC6675. Netstack 1042 // maintains the congestion window in number of packets and not bytes, so 1043 // SetPipe() here measures number of outstanding packets rather than actual 1044 // outstanding bytes in the network. 
1045 func (s *sender) SetPipe() { 1046 // If SACK isn't permitted or it is permitted but recovery is not active 1047 // then ignore pipe calculations. 1048 if !s.ep.SACKPermitted || !s.FastRecovery.Active { 1049 return 1050 } 1051 pipe := 0 1052 smss := seqnum.Size(s.ep.scoreboard.SMSS()) 1053 for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { 1054 // With GSO each segment can be much larger than SMSS. So check the segment 1055 // in SMSS sized ranges. 1056 segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size())) 1057 for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { 1058 endSeq := startSeq.Add(smss) 1059 if segEnd.LessThan(endSeq) { 1060 endSeq = segEnd 1061 } 1062 sb := header.SACKBlock{Start: startSeq, End: endSeq} 1063 // SetPipe(): 1064 // 1065 // After initializing pipe to zero, the following steps are 1066 // taken for each octet 'S1' in the sequence space between 1067 // HighACK and HighData that has not been SACKed: 1068 if !s1.sequenceNumber.LessThan(s.SndNxt) { 1069 break 1070 } 1071 if s.ep.scoreboard.IsSACKED(sb) { 1072 continue 1073 } 1074 1075 // SetPipe(): 1076 // 1077 // (a) If IsLost(S1) returns false, Pipe is incremened by 1. 1078 // 1079 // NOTE: here we mark the whole segment as lost. We do not try 1080 // and test every byte in our write buffer as we maintain our 1081 // pipe in terms of oustanding packets and not bytes. 1082 if !s.ep.scoreboard.IsRangeLost(sb) { 1083 pipe++ 1084 } 1085 // SetPipe(): 1086 // (b) If S1 <= HighRxt, Pipe is incremented by 1. 1087 if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) { 1088 pipe++ 1089 } 1090 } 1091 } 1092 s.Outstanding = pipe 1093 } 1094 1095 // shouldEnterRecovery returns true if the sender should enter fast recovery 1096 // based on dupAck count and sack scoreboard. 1097 // See RFC 6675 section 5. 
func (s *sender) shouldEnterRecovery() bool {
	// Either the duplicate ACK threshold has been reached, or SACK is in use
	// without RACK loss detection and the scoreboard reports the first
	// unacknowledged byte as lost.
	return s.DupAckCount >= nDupAckThreshold ||
		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
}

// detectLoss is called when an ack is received and returns whether a loss is
// detected. It manages the state related to duplicate acks and determines if
// a retransmit is needed according to the rules in RFC 6582 (NewReno).
//
// When it returns true the congestion control algorithm has been notified via
// HandleLossDetected and the sender has entered recovery.
func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
	// We're not in fast recovery yet.

	// If RACK is enabled and there is no reordering we should honor the
	// three duplicate ACK rule to enter recovery. Once reordering has been
	// observed, duplicate ACK counting is skipped altogether.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		if s.rc.Reord {
			return false
		}
	}

	if !s.isDupAck(seg) {
		s.DupAckCount = 0
		return false
	}

	s.DupAckCount++

	// Do not enter fast recovery until we reach nDupAckThreshold or the
	// first unacknowledged byte is considered lost as per SACK scoreboard.
	if !s.shouldEnterRecovery() {
		// RFC 6675 Step 3.
		s.FastRecovery.HighRxt = s.SndUna - 1
		// Do run SetPipe() to calculate the outstanding segments.
		s.SetPipe()
		s.state = tcpip.Disorder
		return false
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
	//
	// We only do the check here, the incrementing of last to the highest
	// sequence number transmitted till now is done when enterRecovery
	// is invoked.
	//
	// Note that we only enter recovery when at least one more byte of data
	// beyond s.FastRecovery.Last (the highest byte that was outstanding when
	// fast retransmit was last entered) is acked.
	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
		s.DupAckCount = 0
		return false
	}
	s.cc.HandleLossDetected()
	s.enterRecovery()
	return true
}

// isDupAck determines if seg is a duplicate ack as defined in
// https://tools.ietf.org/html/rfc5681#section-2.
func (s *sender) isDupAck(seg *segment) bool {
	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
	// can leverage the SACK information to determine when an incoming ACK is a
	// "duplicate" (e.g., if the ACK contains previously unknown SACK
	// information).
	//
	// Hence, with SACK negotiated, an ACK that brings no new SACK information
	// is never counted as a duplicate.
	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
		return false
	}

	// (a) The receiver of the ACK has outstanding data.
	return s.SndUna != s.SndNxt &&
		// (b) The incoming acknowledgment carries no data.
		seg.logicalLen() == 0 &&
		// (c) The SYN and FIN bits are both off.
		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
		// (d) the ACK number is equal to the greatest acknowledgment received on
		// the given connection (TCP.UNA from RFC793).
		seg.ackNumber == s.SndUna &&
		// (e) the advertised window in the incoming acknowledgment equals the
		// advertised window in the last incoming acknowledgment.
		s.SndWnd == seg.window
}

// walkSACK iterates the writeList and updates RACK for each segment which is
// newly acked either cumulatively or selectively. It loops through the
// segments which are sacked, updates the RACK related variables and checks for
// reordering. It returns true when a DSACK block has been detected in the
// received ACK.
//
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
// steps 2 and 3.
func (s *sender) walkSACK(rcvdSeg *segment) bool {
	s.rc.setDSACKSeen(false)

	// Look for DSACK block.
	hasDSACK := false
	idx := 0
	n := len(rcvdSeg.parsedOptions.SACKBlocks)
	if checkDSACK(rcvdSeg) {
		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
		// numDSACK can be zero when DSACK is sent for subsegments.
		if numDSACK < 1 {
			numDSACK = 1
		}
		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
		s.rc.setDSACKSeen(true)
		// The DSACK block is the first one; skip it in the walk below.
		idx = 1
		n--
		hasDSACK = true
	}

	if n == 0 {
		return hasDSACK
	}

	// Sort the SACK blocks. The first block is the most recent unacked
	// block. The following blocks can be in arbitrary order.
	//
	// The sort operates on a copy so that the parsed options of rcvdSeg are
	// left untouched.
	//
	// NOTE(review): the comparator below orders the blocks by descending
	// Start, while the walk that follows only moves forward through the
	// write list and never rewinds seg. Confirm that this ordering is the
	// intended one.
	sackBlocks := make([]header.SACKBlock, n)
	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
	sort.Slice(sackBlocks, func(i, j int) bool {
		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
	})

	seg := s.writeList.Front()
	for _, sb := range sackBlocks {
		// Only segments that have been transmitted at least once and that
		// start before the end of the block are considered.
		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
				seg.acked = true
				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
			}
			seg = seg.Next()
		}
	}
	return hasDSACK
}

// checkDSACK checks if a DSACK is reported.
//
// Only the first SACK block (and, when needed, the second one) of rcvdSeg is
// inspected. A block whose End precedes its Start is treated as invalid and
// makes the function return false.
func checkDSACK(rcvdSeg *segment) bool {
	n := len(rcvdSeg.parsedOptions.SACKBlocks)
	if n == 0 {
		return false
	}

	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
	// Check if SACK block is invalid.
	if sb.End.LessThan(sb.Start) {
		return false
	}

	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
	// at most one SACK block. DSACK is detected in the below two cases:
	// * If the SACK sequence space is less than this cumulative ACK, it is
	//   an indication that the segment identified by the SACK block has
	//   been received more than once by the receiver.
	// * If the sequence space in the first SACK block is greater than the
	//   cumulative ACK, then the sender next compares the sequence space
	//   in the first SACK block with the sequence space in the second SACK
	//   block, if there is one. This comparison can determine if the first
	//   SACK block is reporting duplicate data that lies above the
	//   cumulative ACK.
	if sb.Start.LessThan(rcvdSeg.ackNumber) {
		return true
	}

	if n > 1 {
		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
		if sb1.End.LessThan(sb1.Start) {
			return false
		}

		// If the first SACK block is fully covered by second SACK
		// block, then the first block is a DSACK block.
		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
			return true
		}
	}

	return false
}

// recordRetransmitTS stores the current timestamp value in s.retransmitTS
// unless the sender is already in a recovery state.
func (s *sender) recordRetransmitTS() {
	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
	//
	// The Eifel detection algorithm is used, only upon initiation of loss
	// recovery, i.e., when either the timeout-based retransmit or the fast
	// retransmit is sent. The Eifel detection algorithm MUST NOT be
	// reinitiated after loss recovery has already started. In particular,
	// it must not be reinitiated upon subsequent timeouts for the same
	// segment, and not upon retransmitting segments other than the oldest
	// outstanding segment, e.g., during selective loss recovery.
	if s.inRecovery() {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	//
	// Set a "RetransmitTS" variable to the value of the Timestamp Value
	// field of the Timestamps option included in the retransmit sent when
	// loss recovery is initiated. A TCP sender must ensure that
	// RetransmitTS does not get overwritten as loss recovery progresses,
	// e.g., in case of a second timeout and subsequent second retransmit of
	// the same octet.
	s.retransmitTS = s.ep.tsValNow()
}

// detectSpuriousRecovery marks the current recovery as spurious, and bumps the
// SpuriousRecovery stat, when the checks of the Eifel detection algorithm
// (RFC 3522 section 3.2) below all pass.
//
// hasDSACK reports whether the ACK being processed carried a DSACK block and
// tsEchoReply is the Timestamp Echo Reply value of that ACK.
func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
	// Return if the sender has already detected spurious recovery.
	if s.spuriousRecovery {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
	//
	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
	// Timestamps option is smaller than the value of RetransmitTS, then
	// proceed to next step, else return.
	//
	// NOTE(review): this is a plain unsigned comparison; confirm whether
	// timestamp wraparound needs to be taken into account here.
	if tsEchoReply >= s.retransmitTS {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
	if hasDSACK {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If during the lifetime of the TCP connection the TCP sender has
	// previously received an ACK with a DSACK option, or the acceptable ACK
	// does not acknowledge all outstanding data, then proceed to next step,
	// else return.
	//
	// NOTE(review): the counter read below is obtained from the stack's
	// stats and so appears not to be specific to this connection; verify
	// that this matches the intent of the text above.
	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
	if numDSACK == 0 && s.SndUna == s.SndNxt {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
	//
	// If the loss recovery has been initiated with a timeout-based
	// retransmit, then set
	//    SpuriousRecovery <- SPUR_TO (equal 1),
	// else set
	//    SpuriousRecovery <- dupacks+1
	// Set the spurious recovery variable to true as we do not differentiate
	// between fast, SACK or RTO recovery.
	s.spuriousRecovery = true
	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()
}

// inRecovery checks if the sender is in RTORecovery, FastRecovery or
// SACKRecovery state.
func (s *sender) inRecovery() bool {
	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
		return true
	}
	return false
}

// handleRcvdSegment is called when a segment is received; it is responsible for
// updating the send-related state.
//
// In order, it: samples the RTT, records SACK information, enters or leaves
// recovery, handles zero window probing, processes newly acknowledged data
// (trimming the write list, updating the congestion window and the timers),
// runs RACK loss detection and recovery, and finally tries to send more data.
// The steps read and write shared sender state, so their order matters.
func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
	// Check if we can extract an RTT measurement from this ack. This path is
	// only taken when the segment carries no timestamp option.
	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
		s.RTTMeasureSeqNum = s.SndNxt
	}

	// Update Timestamp if required. See RFC7323, section-4.3.
	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
	}

	// Insert SACKBlock information into our scoreboard.
	hasDSACK := false
	if s.ep.SACKPermitted {
		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
			// Only insert the SACK block if the following holds
			// true:
			//  * SACK block acks data after the ack number in the
			//    current segment.
			//  * SACK block represents a sequence
			//    between sndUna and sndNxt (i.e. data that is
			//    currently unacked and in-flight).
			//  * SACK block that has not been SACKed already.
			//
			// NOTE: This check specifically excludes DSACK blocks
			// which have start/end before sndUna and are used to
			// indicate spurious retransmissions.
			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
				s.ep.scoreboard.Insert(sb)
				rcvdSeg.hasNewSACKInfo = true
			}
		}

		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
		// section-7.2
		// * Step 2: Update RACK stats.
		//   If the ACK is not ignored as invalid, update the RACK.rtt
		//   to be the RTT sample calculated using this ACK, and
		//   continue.  If this ACK or SACK was for the most recently
		//   sent packet, then record the RACK.xmit_ts timestamp and
		//   RACK.end_seq sequence implied by this ACK.
		// * Step 3: Detect packet reordering.
		//   If the ACK selectively or cumulatively acknowledges an
		//   unacknowledged and also never retransmitted sequence below
		//   RACK.fack, then the corresponding packet has been
		//   reordered and RACK.reord is set to TRUE.
		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
			hasDSACK = s.walkSACK(rcvdSeg)
		}
		s.SetPipe()
	}

	ack := rcvdSeg.ackNumber
	fastRetransmit := false
	// Do not leave fast recovery, if the ACK is out of range.
	if s.FastRecovery.Active {
		// Leave fast recovery if it acknowledges all the data covered by
		// this fast recovery session.
		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
			s.leaveRecovery()
		}
	} else {
		// Detect loss by counting the duplicates and enter recovery.
		fastRetransmit = s.detectLoss(rcvdSeg)
	}

	// See if TLP based recovery was successful.
	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		s.detectTLPRecovery(ack, rcvdSeg)
	}

	// Stash away the current window size.
	s.SndWnd = rcvdSeg.window

	// Disable zero window probing if remote advertises a non-zero receive
	// window. This can be with an ACK to the zero window probe (where the
	// acknumber refers to the already acknowledged byte) OR to any previously
	// unacknowledged segment.
	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
		s.disableZeroWindowProbing()
	}

	// On receiving the ACK for the zero window probe, account for it and
	// skip trying to send any segment as we are still probing for
	// receive window to become non-zero.
	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
		s.unackZeroWindowProbes--
		return
	}

	// Ignore ack if it doesn't acknowledge any new data.
	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
		s.DupAckCount = 0

		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
		// Specifically we should only update the RTO using TSEcr if the
		// following condition holds:
		//
		//    A TSecr value received in a segment is used to update the
		//    averaged RTT measurement only if the segment acknowledges
		//    some new data, i.e., only if it advances the left edge of
		//    the send window.
		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
			s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr))
		}

		if s.shouldSchedulePTO() {
			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
			s.schedulePTO()
		} else {
			// When an ack is received we must rearm the timer.
			// RFC 6298 5.3
			s.probeTimer.disable()
			s.resendTimer.enable(s.RTO)
		}

		// Remove all acknowledged data from the write list.
		acked := s.SndUna.Size(ack)
		s.SndUna = ack

		// The remote ACK-ing at least 1 byte is an indication that we have a
		// full-duplex connection to the remote as the only way we will receive an
		// ACK is if the remote received data that we previously sent.
		//
		// As of writing, linux seems to only confirm a route as reachable when
		// forward progress is made which is indicated by an ACK that removes data
		// from the retransmit queue.
		if acked > 0 {
			s.ep.route.ConfirmReachable()
		}

		ackLeft := acked
		// Remember the packet count before trimming so that the number of
		// packets acknowledged by this ACK can be handed to the congestion
		// control algorithm further down.
		originalOutstanding := s.Outstanding
		for ackLeft > 0 {
			// We use logicalLen here because we can have FIN
			// segments (which are always at the end of list) that
			// have no data, but do consume a sequence number.
			seg := s.writeList.Front()
			datalen := seg.logicalLen()

			// The ACK covers only part of this segment: trim the
			// acknowledged prefix, adjust the packet count and stop.
			if datalen > ackLeft {
				prevCount := s.pCount(seg, s.MaxPayloadSize)
				seg.data.TrimFront(int(ackLeft))
				seg.sequenceNumber.UpdateForward(ackLeft)
				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
				break
			}

			// Keep writeNext valid if it points at the segment that is
			// about to be removed.
			if s.writeNext == seg {
				s.writeNext = seg.Next()
			}

			// Update the RACK fields if SACK is enabled.
			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
			}

			s.writeList.Remove(seg)

			// If SACK is enabled then only reduce outstanding if
			// the segment was not previously SACKED as these have
			// already been accounted for in SetPipe().
			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
			} else {
				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
			}
			seg.decRef()
			ackLeft -= datalen
		}

		// Clear SACK information for all acked data.
		s.ep.scoreboard.Delete(s.SndUna)

		// Detect if the sender entered recovery spuriously.
		if s.inRecovery() {
			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
		}

		// If we are not in fast recovery then update the congestion
		// window based on the number of acknowledged packets.
		if !s.FastRecovery.Active {
			s.cc.Update(originalOutstanding - s.Outstanding)
			if s.FastRecovery.Last.LessThan(s.SndUna) {
				s.state = tcpip.Open
				// Update RACK when we are exiting fast or RTO
				// recovery as described in the RFC
				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
					s.rc.exitRecovery()
				}
				s.reorderTimer.disable()
			}
		}

		// Update the send buffer usage and notify potential waiters.
		s.ep.updateSndBufferUsage(int(acked))

		// It is possible for s.Outstanding to drop below zero if we get
		// a retransmit timeout, reset outstanding to zero but later
		// get an ack that covers previously sent data.
		if s.Outstanding < 0 {
			s.Outstanding = 0
		}

		s.SetPipe()

		// If all outstanding data was acknowledged then disable the timer.
		// RFC 6298 Rule 5.3
		if s.SndUna == s.SndNxt {
			s.Outstanding = 0
			// Reset firstRetransmittedSegXmitTime to the zero value.
			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
			s.resendTimer.disable()
			s.probeTimer.disable()
		}
	}

	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		// Update RACK reorder window.
		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
		// * Upon receiving an ACK:
		// * Step 4: Update RACK reordering window
		s.rc.updateRACKReorderWindow()

		// After the reorder window is calculated, detect any loss by checking
		// if the time elapsed after the segments are sent is greater than the
		// reorder window.
		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
			// If any segment is marked as lost by
			// RACK, enter recovery and retransmit
			// the lost segments.
			s.cc.HandleLossDetected()
			s.enterRecovery()
			fastRetransmit = true
		}

		if s.FastRecovery.Active {
			s.rc.DoRecovery(nil, fastRetransmit)
		}
	}

	// Now that we've popped all acknowledged data from the retransmit
	// queue, retransmit if needed.
	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
		// When SACK is enabled data sending is governed by steps in
		// RFC 6675 Section 5 recovery steps A-C.
		// See: https://tools.ietf.org/html/rfc6675#section-5.
		if s.ep.SACKPermitted {
			return
		}
	}

	// Send more data now that some of the pending data has been ack'd, or
	// that the window opened up, or the congestion window was inflated due
	// to a duplicate ack during fast recovery. This will also re-enable
	// the retransmit timer if needed.
	s.sendData()
}

// sendSegment sends the specified segment.
1625 func (s *sender) sendSegment(seg *segment) tcpip.Error { 1626 if seg.xmitCount > 0 { 1627 s.ep.stack.Stats().TCP.Retransmits.Increment() 1628 s.ep.stats.SendErrors.Retransmits.Increment() 1629 if s.SndCwnd < s.Ssthresh { 1630 s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() 1631 } 1632 } 1633 seg.xmitTime = s.ep.stack.Clock().NowMonotonic() 1634 seg.xmitCount++ 1635 seg.lost = false 1636 err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) 1637 1638 // Every time a packet containing data is sent (including a 1639 // retransmission), if SACK is enabled and we are retransmitting data 1640 // then use the conservative timer described in RFC6675 Section 6.0, 1641 // otherwise follow the standard time described in RFC6298 Section 5.1. 1642 if err != nil && seg.data.Size() != 0 { 1643 if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted { 1644 s.resendTimer.enable(s.RTO) 1645 } else { 1646 if !s.resendTimer.enabled() { 1647 s.resendTimer.enable(s.RTO) 1648 } 1649 } 1650 } 1651 1652 return err 1653 } 1654 1655 // sendSegmentFromView sends a new segment containing the given payload, flags 1656 // and sequence number. 1657 func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags header.TCPFlags, seq seqnum.Value) tcpip.Error { 1658 s.LastSendTime = s.ep.stack.Clock().NowMonotonic() 1659 if seq == s.RTTMeasureSeqNum { 1660 s.RTTMeasureTime = s.LastSendTime 1661 } 1662 1663 rcvNxt, rcvWnd := s.ep.rcv.getSendParams() 1664 1665 // Remember the max sent ack. 1666 s.MaxSentAck = rcvNxt 1667 1668 return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) 1669 } 1670 1671 // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited 1672 // currently. 1673 func (s *sender) maybeSendOutOfWindowAck(seg *segment) { 1674 // Data packets are unlikely to be part of an ACK loop. So always send 1675 // an ACK for a packet w/ data. 1676 if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { 1677 s.sendAck() 1678 } 1679 }