github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/transport/tcp/snd.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "fmt" 19 "math" 20 "sort" 21 "time" 22 23 "github.com/sagernet/gvisor/pkg/buffer" 24 "github.com/sagernet/gvisor/pkg/sync" 25 "github.com/sagernet/gvisor/pkg/tcpip" 26 "github.com/sagernet/gvisor/pkg/tcpip/header" 27 "github.com/sagernet/gvisor/pkg/tcpip/seqnum" 28 "github.com/sagernet/gvisor/pkg/tcpip/stack" 29 ) 30 31 const ( 32 // MinRTO is the minimum allowed value for the retransmit timeout. 33 MinRTO = 200 * time.Millisecond 34 35 // MaxRTO is the maximum allowed value for the retransmit timeout. 36 MaxRTO = 120 * time.Second 37 38 // MinSRTT is the minimum allowed value for smoothed RTT. 39 MinSRTT = 1 * time.Millisecond 40 41 // InitialCwnd is the initial congestion window. 42 InitialCwnd = 10 43 44 // nDupAckThreshold is the number of duplicate ACK's required 45 // before fast-retransmit is entered. 46 nDupAckThreshold = 3 47 48 // MaxRetries is the maximum number of probe retries sender does 49 // before timing out the connection. 50 // Linux default TCP_RETR2, net.ipv4.tcp_retries2. 51 MaxRetries = 15 52 ) 53 54 // congestionControl is an interface that must be implemented by any supported 55 // congestion control algorithm. 
type congestionControl interface {
	// HandleLossDetected is invoked when the loss is detected by RACK or
	// sender.dupAckCount >= nDupAckThreshold just before entering fast
	// retransmit.
	HandleLossDetected()

	// HandleRTOExpired is invoked when the retransmit timer expires.
	HandleRTOExpired()

	// Update is invoked when processing inbound acks. It's passed the
	// number of packets that were acked by the most recent cumulative
	// acknowledgement.
	Update(packetsAcked int)

	// PostRecovery is invoked when the sender is exiting a fast retransmit/
	// recovery phase. This provides congestion control algorithms a way
	// to adjust their state when exiting recovery.
	PostRecovery()
}

// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted. The cumulative or selective ACK is passed along
	// with the flag which identifies whether the connection entered fast
	// retransmit with this ACK and to retransmit the first unacknowledged
	// segment.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}

// sender holds the state necessary to send TCP segments.
//
// +stateify savable
type sender struct {
	// TCPSenderState carries the sender counters and parameters that
	// newSender initializes (SndUna, SndNxt, SndWnd, MaxPayloadSize, RTO,
	// FastRecovery, ...).
	stack.TCPSenderState

	// ep is the endpoint this sender was created for.
	ep *Endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeNext is the segment from which sendData starts transmitting.
	writeNext *segment

	// writeList is the list of segments queued for (re)transmission. Its
	// front is treated as the first unacknowledged segment.
	writeList segmentList

	// resendTimer invokes retransmitTimerExpired when it fires. It is also
	// used to pace zero window probes.
	resendTimer timer `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer `state:"nosave"`

	// probeTimer is used to schedule PTO for RACK TLP algorithm.
	probeTimer timer `state:"nosave"`

	// spuriousRecovery indicates whether the sender entered recovery
	// spuriously as described in RFC3522 Section 3.2.
	spuriousRecovery bool

	// retransmitTS is the timestamp at which the sender sends retransmitted
	// segment after entering an RTO for the first time as described in
	// RFC3522 Section 3.2.
	retransmitTS uint32

	// startCork is set while segments are being held back because the
	// TCP_CORK option is enabled and the cork timer has been armed.
	startCork bool

	// corkTimer is used to drain the segments which are held when TCP_CORK
	// option is enabled.
	corkTimer timer `state:"nosave"`
}

// rtt is a synchronization wrapper used to appease stateify. See the comment
// in sender, where it is used.
166 // 167 // +stateify savable 168 type rtt struct { 169 sync.Mutex `state:"nosave"` 170 171 stack.TCPRTTState 172 } 173 174 // +checklocks:ep.mu 175 func newSender(ep *Endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { 176 // The sender MUST reduce the TCP data length to account for any IP or 177 // TCP options that it is including in the packets that it sends. 178 // See: https://tools.ietf.org/html/rfc6691#section-2 179 maxPayloadSize := int(mss) - ep.maxOptionSize() 180 181 s := &sender{ 182 ep: ep, 183 TCPSenderState: stack.TCPSenderState{ 184 SndWnd: sndWnd, 185 SndUna: iss + 1, 186 SndNxt: iss + 1, 187 RTTMeasureSeqNum: iss + 1, 188 LastSendTime: ep.stack.Clock().NowMonotonic(), 189 MaxPayloadSize: maxPayloadSize, 190 MaxSentAck: irs + 1, 191 FastRecovery: stack.TCPFastRecoveryState{ 192 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. 193 Last: iss, 194 HighRxt: iss, 195 RescueRxt: iss, 196 }, 197 RTO: 1 * time.Second, 198 }, 199 gso: ep.gso.Type != stack.GSONone, 200 } 201 202 if s.gso { 203 s.ep.gso.MSS = uint16(maxPayloadSize) 204 } 205 206 s.cc = s.initCongestionControl(ep.cc) 207 s.lr = s.initLossRecovery() 208 s.rc.init(s, iss) 209 210 // A negative sndWndScale means that no scaling is in use, otherwise we 211 // store the scaling value. 212 if sndWndScale > 0 { 213 s.SndWndScale = uint8(sndWndScale) 214 } 215 216 s.resendTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.retransmitTimerExpired)) 217 s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired)) 218 s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired)) 219 s.corkTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.corkTimerExpired)) 220 221 s.ep.AssertLockHeld(ep) 222 s.updateMaxPayloadSize(int(ep.route.MTU()), 0) 223 // Initialize SACK Scoreboard after updating max payload size as we use 224 // the maxPayloadSize as the smss when determining if a segment is lost 225 // etc. 
226 s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss) 227 228 // Get Stack wide config. 229 var minRTO tcpip.TCPMinRTOOption 230 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { 231 panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) 232 } 233 s.minRTO = time.Duration(minRTO) 234 235 var maxRTO tcpip.TCPMaxRTOOption 236 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { 237 panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) 238 } 239 s.maxRTO = time.Duration(maxRTO) 240 241 var maxRetries tcpip.TCPMaxRetriesOption 242 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { 243 panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) 244 } 245 s.maxRetries = uint32(maxRetries) 246 247 return s 248 } 249 250 // initCongestionControl initializes the specified congestion control module and 251 // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to 252 // their initial values. 253 func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { 254 s.SndCwnd = InitialCwnd 255 // Set sndSsthresh to the maximum int value, which depends on the 256 // platform. 257 s.Ssthresh = int(^uint(0) >> 1) 258 259 switch congestionControlName { 260 case ccCubic: 261 return newCubicCC(s) 262 case ccReno: 263 fallthrough 264 default: 265 return newRenoCC(s) 266 } 267 } 268 269 // initLossRecovery initiates the loss recovery algorithm for the sender. 270 func (s *sender) initLossRecovery() lossRecovery { 271 if s.ep.SACKPermitted { 272 return newSACKRecovery(s) 273 } 274 return newRenoRecovery(s) 275 } 276 277 // updateMaxPayloadSize updates the maximum payload size based on the given 278 // MTU. 
If this is in response to "packet too big" control packets (indicated 279 // by the count argument), it also reduces the number of outstanding packets and 280 // attempts to retransmit the first packet above the MTU size. 281 // +checklocks:s.ep.mu 282 func (s *sender) updateMaxPayloadSize(mtu, count int) { 283 m := mtu - header.TCPMinimumSize 284 285 m -= s.ep.maxOptionSize() 286 287 // We don't adjust up for now. 288 if m >= s.MaxPayloadSize { 289 return 290 } 291 292 // Make sure we can transmit at least one byte. 293 if m <= 0 { 294 m = 1 295 } 296 297 oldMSS := s.MaxPayloadSize 298 s.MaxPayloadSize = m 299 if s.gso { 300 s.ep.gso.MSS = uint16(m) 301 } 302 303 if count == 0 { 304 // updateMaxPayloadSize is also called when the sender is created. 305 // and there is no data to send in such cases. Return immediately. 306 return 307 } 308 309 // Update the scoreboard's smss to reflect the new lowered 310 // maxPayloadSize. 311 s.ep.scoreboard.smss = uint16(m) 312 313 s.Outstanding -= count 314 if s.Outstanding < 0 { 315 s.Outstanding = 0 316 } 317 318 // Rewind writeNext to the first segment exceeding the MTU. Do nothing 319 // if it is already before such a packet. 320 nextSeg := s.writeNext 321 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 322 if seg == s.writeNext { 323 // We got to writeNext before we could find a segment 324 // exceeding the MTU. 325 break 326 } 327 328 if nextSeg == s.writeNext && seg.payloadSize() > m { 329 // We found a segment exceeding the MTU. Rewind 330 // writeNext and try to retransmit it. 331 nextSeg = seg 332 } 333 334 if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 335 // Update sackedOut for new maximum payload size. 336 s.SackedOut -= s.pCount(seg, oldMSS) 337 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 338 } 339 } 340 341 // Since we likely reduced the number of outstanding packets, we may be 342 // ready to send some more. 
343 s.updateWriteNext(nextSeg) 344 s.sendData() 345 } 346 347 // sendAck sends an ACK segment. 348 // +checklocks:s.ep.mu 349 func (s *sender) sendAck() { 350 s.sendEmptySegment(header.TCPFlagAck, s.SndNxt) 351 } 352 353 // updateRTO updates the retransmit timeout when a new roud-trip time is 354 // available. This is done in accordance with section 2 of RFC 6298. 355 func (s *sender) updateRTO(rtt time.Duration) { 356 s.rtt.Lock() 357 if !s.rtt.TCPRTTState.SRTTInited { 358 s.rtt.TCPRTTState.RTTVar = rtt / 2 359 s.rtt.TCPRTTState.SRTT = rtt 360 s.rtt.TCPRTTState.SRTTInited = true 361 } else { 362 diff := s.rtt.TCPRTTState.SRTT - rtt 363 if diff < 0 { 364 diff = -diff 365 } 366 // Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when 367 // no timestamps are available. 368 if !s.ep.SendTSOk { 369 s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4 370 s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8 371 } else { 372 // When we are taking RTT measurements of every ACK then 373 // we need to use a modified method as specified in 374 // https://tools.ietf.org/html/rfc7323#appendix-G 375 if s.Outstanding == 0 { 376 s.rtt.Unlock() 377 return 378 } 379 // Netstack measures congestion window/inflight all in 380 // terms of packets and not bytes. This is similar to 381 // how linux also does cwnd and inflight. In practice 382 // this approximation works as expected. 383 expectedSamples := math.Ceil(float64(s.Outstanding) / 2) 384 385 // alpha & beta values are the original values as recommended in 386 // https://tools.ietf.org/html/rfc6298#section-2.3. 
387 const alpha = 0.125 388 const beta = 0.25 389 390 alphaPrime := alpha / expectedSamples 391 betaPrime := beta / expectedSamples 392 rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds() 393 srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds() 394 s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second)) 395 s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second)) 396 } 397 } 398 399 if s.rtt.TCPRTTState.SRTT < MinSRTT { 400 s.rtt.TCPRTTState.SRTT = MinSRTT 401 } 402 403 s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar 404 s.rtt.Unlock() 405 if s.RTO < s.minRTO { 406 s.RTO = s.minRTO 407 } 408 if s.RTO > s.maxRTO { 409 s.RTO = s.maxRTO 410 } 411 } 412 413 // resendSegment resends the first unacknowledged segment. 414 // +checklocks:s.ep.mu 415 func (s *sender) resendSegment() { 416 // Don't use any segments we already sent to measure RTT as they may 417 // have been affected by packets being lost. 418 s.RTTMeasureSeqNum = s.SndNxt 419 420 // Resend the segment. 421 if seg := s.writeList.Front(); seg != nil { 422 if seg.payloadSize() > s.MaxPayloadSize { 423 s.splitSeg(seg, s.MaxPayloadSize) 424 } 425 426 // See: RFC 6675 section 5 Step 4.3 427 // 428 // To prevent retransmission, set both the HighRXT and RescueRXT 429 // to the highest sequence number in the retransmitted segment. 430 s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 431 s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 432 s.sendSegment(seg) 433 s.ep.stack.Stats().TCP.FastRetransmit.Increment() 434 s.ep.stats.SendErrors.FastRetransmit.Increment() 435 436 // Run SetPipe() as per RFC 6675 section 5 Step 4.4 437 s.SetPipe() 438 } 439 } 440 441 // retransmitTimerExpired is called when the retransmit timer expires, and 442 // unacknowledged segments are assumed lost, and thus need to be resent. 
// Returns nil if the connection is still usable, or a *tcpip.ErrTimeout if the
// connection is deemed lost.
// +checklocks:s.ep.mu
func (s *sender) retransmitTimerExpired() tcpip.Error {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if s.resendTimer.isUninitialized() || !s.resendTimer.checkExpiration() {
		return nil
	}

	// Initialize the variables used to detect spurious recovery after
	// entering RTO.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
	// when writeList is empty. Remove this once we have a proper fix for this
	// issue.
	if s.writeList.Front() == nil {
		return nil
	}

	s.ep.stack.Stats().TCP.Timeouts.Increment()
	s.ep.stats.SendErrors.Timeouts.Increment()

	// Set TLPRxtOut to false according to
	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
	s.rc.tlpRxtOut = false

	// Give up if a user timeout is set and the time elapsed since the first
	// retransmitted segment was originally sent has reached it, or if too
	// many zero window probes have gone unacknowledged (both checked
	// below). Without a user timeout, remaining is simply maxRTO.
	uto := s.ep.userTimeout

	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		// We store the original xmitTime of the segment that we are
		// about to retransmit as the retransmission time. This is
		// required as by the time the retransmitTimer has expired the
		// segment has already been sent and unacked for the RTO at the
		// time the segment was sent.
		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
	}

	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
	remaining := s.maxRTO
	if uto != 0 {
		// Cap to the user specified timeout if one is specified.
		remaining = uto - elapsed
	}

	// Always honor the user-timeout irrespective of whether the zero
	// window probes were acknowledged.
	// net/ipv4/tcp_timer.c::tcp_probe_timer()
	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	// Set new timeout. The timer will be restarted by the call to sendData
	// below.
	s.RTO *= 2
	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
	if s.RTO > s.maxRTO {
		s.RTO = s.maxRTO
	}

	// Cap RTO to remaining time.
	if s.RTO > remaining {
		s.RTO = remaining
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
	//
	// Retransmit timeouts:
	//     After a retransmit timeout, record the highest sequence number
	//     transmitted in the variable recover, and exit the fast recovery
	//     procedure if applicable.
	s.FastRecovery.Last = s.SndNxt - 1

	if s.FastRecovery.Active {
		// We were attempting fast recovery but were not successful.
		// Leave the state. We don't need to update ssthresh because it
		// has already been updated when entered fast-recovery.
		s.leaveRecovery()
	}

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	s.recordRetransmitTS()

	s.state = tcpip.RTORecovery
	s.cc.HandleRTOExpired()

	// Mark the next segment to be sent as the first unacknowledged one and
	// start sending again. Set the number of outstanding packets to 0 so
	// that we'll be able to retransmit.
	//
	// We'll keep on transmitting (or retransmitting) as we get acks for
	// the data we transmit.
	s.Outstanding = 0

	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
	//
	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
	//  discard data that has already been selectively acknowledged. As a
	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
	//  information gathered from a receiver upon a retransmission timeout
	//  (RTO) "since the timeout might indicate that the data receiver has
	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
	//  information in determining which data to retransmit."
	//
	// NOTE: We take the stricter interpretation and just expunge all
	// information as we lack more rigorous checks to validate if the SACK
	// information is usable after an RTO.
	s.ep.scoreboard.Reset()
	s.updateWriteNext(s.writeList.Front())

	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
	// zero receive window after retransmission interval and we have data to
	// send.
	if s.zeroWindowProbing {
		s.sendZeroWindowProbe()
		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
		// indefinitely.  As long as the receiving TCP continues to send
		// acknowledgments in response to the probe segments, the sending TCP
		// MUST allow the connection to stay open.
		return nil
	}

	seg := s.writeNext
	// RFC 1122 4.2.3.5: Close the connection when the number of
	// retransmissions for this segment is beyond a limit.
	if seg != nil && seg.xmitCount > s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	s.sendData()

	return nil
}

// pCount returns the number of packets in the segment. Due to GSO, a segment
// can be composed of multiple packets.
589 func (s *sender) pCount(seg *segment, maxPayloadSize int) int { 590 size := seg.payloadSize() 591 if size == 0 { 592 return 1 593 } 594 595 return (size-1)/maxPayloadSize + 1 596 } 597 598 // splitSeg splits a given segment at the size specified and inserts the 599 // remainder as a new segment after the current one in the write list. 600 func (s *sender) splitSeg(seg *segment, size int) { 601 if seg.payloadSize() <= size { 602 return 603 } 604 // Split this segment up. 605 nSeg := seg.clone() 606 nSeg.pkt.Data().TrimFront(size) 607 nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) 608 s.writeList.InsertAfter(seg, nSeg) 609 610 // The segment being split does not carry PUSH flag because it is 611 // followed by the newly split segment. 612 // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered 613 // segment (i.e., when there is no more queued data to be sent). 614 // Linux removes PSH flag only when the segment is being split over MSS 615 // and retains it when we are splitting the segment over lack of sender 616 // window space. 617 // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() 618 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() 619 if seg.payloadSize() > s.MaxPayloadSize { 620 seg.flags ^= header.TCPFlagPsh 621 } 622 seg.pkt.Data().CapLength(size) 623 } 624 625 // NextSeg implements the RFC6675 NextSeg() operation. 626 // 627 // NextSeg starts scanning the writeList starting from nextSegHint and returns 628 // the hint to be passed on the next call to NextSeg. This is required to avoid 629 // iterating the write list repeatedly when NextSeg is invoked in a loop during 630 // recovery. The returned hint will be nil if there are no more segments that 631 // can match rules defined by NextSeg operation in RFC6675. 632 // 633 // rescueRtx will be true only if nextSeg is a rescue retransmission as 634 // described by Step 4) of the NextSeg algorithm. 
func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
	// s3 and s4 remember the candidates for rules (3) and (4) while the
	// scan continues looking for a rule (1) match.
	var s3 *segment
	var s4 *segment
	// Step 1.
	for seg := nextSegHint; seg != nil; seg = seg.Next() {
		// Stop iteration if we hit a segment that has never been
		// transmitted (i.e. either it has no assigned sequence number
		// or if it does have one, it's >= the next sequence number
		// to be sent [i.e. >= s.sndNxt]).
		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
			hint = nil
			break
		}
		segSeq := seg.sequenceNumber
		// Keep every examined segment at most one SMSS long so that a
		// returned segment never exceeds SMSS octets.
		if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) {
			s.splitSeg(seg, int(smss))
		}

		// See RFC 6675 Section 4
		//
		//     1. If there exists a smallest unSACKED sequence number
		//     'S2' that meets the following 3 criteria for determining
		//     loss, the sequence range of one segment of up to SMSS
		//     octets starting with S2 MUST be returned.
		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
			// NextSeg():
			//
			//    (1.a) S2 is greater than HighRxt
			//    (1.b) S2 is less than highest octet covered by
			//    any received SACK.
			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
				// NextSeg():
				//     (1.c) IsLost(S2) returns true.
				if s.ep.scoreboard.IsLost(segSeq) {
					return seg, seg.Next(), false
				}

				// NextSeg():
				//
				// (3): If the conditions for rules (1) and (2)
				// fail, but there exists an unSACKed sequence
				// number S3 that meets the criteria for
				// detecting loss given in steps 1.a and 1.b
				// above (specifically excluding (1.c)) then one
				// segment of up to SMSS octets starting with S3
				// SHOULD be returned.
				if s3 == nil {
					s3 = seg
					hint = seg.Next()
				}
			}
			// NextSeg():
			//
			//     (4) If the conditions for (1), (2) and (3) fail,
			//     but there exists outstanding unSACKED data, we
			//     provide the opportunity for a single "rescue"
			//     retransmission per entry into loss recovery. If
			//     HighACK is greater than RescueRxt (or RescueRxt
			//     is undefined), then one segment of up to SMSS
			//     octets that MUST include the highest outstanding
			//     unSACKed sequence number SHOULD be returned, and
			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
			//     be updated.
			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
				// Track the unSACKed segment with the highest
				// sequence number seen so far.
				if s4 != nil {
					if s4.sequenceNumber.LessThan(segSeq) {
						s4 = seg
					}
				} else {
					s4 = seg
				}
			}
		}
	}

	// If we got here then no segment matched step (1).
	// Step (2): "If no sequence number 'S2' per rule (1)
	//            exists but there exists available unsent data and the
	//            receiver's advertised window allows, the sequence
	//            range of one segment of up to SMSS octets of
	//            previously unsent data starting with sequence number
	//            HighData+1 MUST be returned."
	for seg := s.writeNext; seg != nil; seg = seg.Next() {
		// Skip segments that were already transmitted.
		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
			continue
		}
		// We do not split the segment here to <= smss as it has
		// potentially not been assigned a sequence number yet.
		return seg, nil, false
	}

	if s3 != nil {
		return s3, hint, false
	}

	// Rule (4). s4 is nil when no rescue candidate was found, in which case
	// the result is (nil, nil, true).
	return s4, nil, true
}

// maybeSendSegment tries to send the specified segment and either coalesces
// other segments into this one or splits the specified segment based on the
// lower of the specified limit value or the receivers window size specified by
// end.
// It returns true if the segment was handed to sendSegment, and false if it
// was held back (Nagle, TCP_CORK, or insufficient receive window).
// +checklocks:s.ep.mu
func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
	// We abuse the flags field to determine if we have already
	// assigned a sequence number to this segment.
	if !s.isAssignedSequenceNumber(seg) {
		// Merge segments if allowed.
		if seg.payloadSize() != 0 {
			available := int(s.SndNxt.Size(end))
			if available > limit {
				available = limit
			}

			// nextTooBig indicates that the next segment was too
			// large to entirely fit in the current segment. It
			// would be possible to split the next segment and merge
			// the portion that fits, but unexpectedly splitting
			// segments can have user visible side-effects which can
			// break applications. For example, RFC 7766 section 8
			// says that the length and data of a DNS response
			// should be sent in the same TCP segment to avoid
			// triggering bugs in poorly written DNS
			// implementations.
			var nextTooBig bool
			// seg.Next() is re-read on every iteration because the
			// merged segment is removed from the list in the body.
			for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() {
				if seg.payloadSize()+nSeg.payloadSize() > available {
					nextTooBig = true
					break
				}
				seg.merge(nSeg)
				s.writeList.Remove(nSeg)
				nSeg.DecRef()
			}
			if !nextTooBig && seg.payloadSize() < available {
				// Segment is not full.
				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
					// Nagle's algorithm. From Wikipedia:
					//   Nagle's algorithm works by
					//   combining a number of small
					//   outgoing messages and sending them
					//   all at once. Specifically, as long
					//   as there is a sent packet for which
					//   the sender has received no
					//   acknowledgment, the sender should
					//   keep buffering its output until it
					//   has a full packet's worth of
					//   output, thus allowing output to be
					//   sent all at once.
					return false
				}
				// With TCP_CORK, hold back until minimum of the available
				// send space and MSS.
				if s.ep.ops.GetCorkOption() {
					if seg.payloadSize() < s.MaxPayloadSize {
						if !s.startCork {
							s.startCork = true
							// Arm the cork timer for MinRTO
							// (200ms), after which the held
							// segments are drained.
							s.corkTimer.enable(MinRTO)
						}
						return false
					}
					// Disable the TCP_CORK timer.
					s.startCork = false
					s.corkTimer.disable()
				}
			}
		}

		// Assign flags. We don't do it above so that we can merge
		// additional data if Nagle holds the segment.
		seg.sequenceNumber = s.SndNxt
		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
	}

	var segEnd seqnum.Value
	if seg.payloadSize() == 0 {
		// An empty segment in the write list is a FIN.
		if s.writeList.Back() != seg {
			panic("FIN segments must be the final segment in the write list.")
		}
		seg.flags = header.TCPFlagAck | header.TCPFlagFin
		// The FIN consumes one sequence number.
		segEnd = seg.sequenceNumber.Add(1)
		// Update the state to reflect that we have now
		// queued a FIN.
		s.ep.updateConnDirectionState(connDirectionStateSndClosed)
		switch s.ep.EndpointState() {
		case StateCloseWait:
			s.ep.setEndpointState(StateLastAck)
		default:
			s.ep.setEndpointState(StateFinWait1)
		}
	} else {
		// We're sending a non-FIN segment.
		if seg.flags&header.TCPFlagFin != 0 {
			panic("Netstack queues FIN segments without data.")
		}

		// The segment starts at or beyond the end of the send window.
		if !seg.sequenceNumber.LessThan(end) {
			return false
		}

		available := int(seg.sequenceNumber.Size(end))
		if available == 0 {
			return false
		}

		// If the whole segment or at least 1MSS sized segment cannot
		// be accommodated in the receiver advertised window, skip
		// splitting and sending of the segment. ref:
		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
		//
		// Linux checks this for all segment transmits not triggered by
		// a probe timer. On this condition, it defers the segment split
		// and transmit to a short probe timer.
		//
		// ref: include/net/tcp.h::tcp_check_probe_timer()
		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
		//
		// Instead of defining a new transmit timer, we attempt to split
		// the segment right here if there are no pending segments. If
		// there are pending segments, segment transmits are deferred to
		// the retransmit timer handler.
		if s.SndUna != s.SndNxt {
			switch {
			case available >= seg.payloadSize():
				// OK to send, the whole segments fits in the
				// receiver's advertised window.
			case available >= s.MaxPayloadSize:
				// OK to send, at least 1 MSS sized segment fits
				// in the receiver's advertised window.
			default:
				return false
			}
		}

		// The segment size limit is computed as a function of sender
		// congestion window and MSS. When sender congestion window is >
		// 1, this limit can be larger than MSS. Ensure that the
		// currently available send space is not greater than minimum of
		// this limit and MSS.
		if available > limit {
			available = limit
		}

		// If GSO is not in use then cap available to
		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
		// the host GSO logic will cap the segment to the correct size.
		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
			available = s.MaxPayloadSize
		}

		if seg.payloadSize() > available {
			// Trim the segment to the space we may use; splitSeg
			// queues the remainder right behind it.
			// NOTE(review): available is not checked for negative
			// values before this call — confirm how splitSeg
			// behaves for a negative size.
			s.splitSeg(seg, available)
		}

		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
	}

	s.sendSegment(seg)

	// Update sndNxt if we actually sent new data (as opposed to
	// retransmitting some previously sent data).
	if s.SndNxt.LessThan(segEnd) {
		s.SndNxt = segEnd
	}

	return true
}

// zeroProbeJunk is data sent during zero window probes.
// Its value is irrelevant; since the sequence number has already been
// acknowledged it will be discarded. It's only here to avoid allocating.
var zeroProbeJunk = []byte{0}

// sendZeroWindowProbe sends one zero window probe: a segment whose payload is
// zeroProbeJunk, with only the ACK flag set and sequence number SndUna-1. It
// increments the count of unacknowledged probes and re-enables the resend
// timer with the current RTO so that probing continues.
//
// The error returned by sendSegmentFromPacketBuffer is not inspected.
//
// +checklocks:s.ep.mu
func (s *sender) sendZeroWindowProbe() {
	s.unackZeroWindowProbes++

	// Send a zero window probe with sequence number pointing to the last
	// acknowledged byte. Note that, like Linux, this isn't quite what RFC
	// 9293 3.8.6.1 describes: we don't send the next byte in the stream,
	// we re-send an ACKed byte to goad the receiver into responding.
	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		Payload: buffer.MakeWithData(zeroProbeJunk),
	})
	defer pkt.DecRef()
	s.sendSegmentFromPacketBuffer(pkt, header.TCPFlagAck, s.SndUna-1)

	// Rearm the timer to continue probing.
	s.resendTimer.enable(s.RTO)
}

// enableZeroWindowProbing marks the sender as probing for a zero receive
// window and (re)enables the resend timer with the current RTO. If
// firstRetransmittedSegXmitTime is still the zero value it is set to the
// current monotonic time.
func (s *sender) enableZeroWindowProbing() {
	s.zeroWindowProbing = true
	// We piggyback the probing on the retransmit timer with the
	// current retransmission interval, as we may start probing while
	// segments are being retransmitted.
	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
	}
	s.resendTimer.enable(s.RTO)
}

// disableZeroWindowProbing clears all zero window probing state: the probing
// flag, the unacknowledged probe count and firstRetransmittedSegXmitTime. It
// also disables the resend timer.
func (s *sender) disableZeroWindowProbing() {
	s.zeroWindowProbing = false
	s.unackZeroWindowProbes = 0
	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
	s.resendTimer.disable()
}

// postXmit updates the keepalive, resend and probe timers after a transmit
// attempt.
//
// dataSent reports whether any segment was actually sent; when true the
// keepalive timer is disabled. shouldScheduleProbe allows a PTO to be
// scheduled (subject to shouldSchedulePTO) when there is unacknowledged data.
func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
	if dataSent {
		// We sent data, so we should stop the keepalive timer to ensure
		// that no keepalives are sent while there is pending data.
		s.ep.disableKeepaliveTimer()
	}

	// If the send window (SndWnd, taken from the window advertised in
	// received segments) is zero and we have data to be sent out, start
	// zero window probing to query the remote for its receive window size.
	if s.writeNext != nil && s.SndWnd == 0 {
		s.enableZeroWindowProbing()
	}

	// If we have no more pending data, start the keepalive timer.
	if s.SndUna == s.SndNxt {
		s.ep.resetKeepaliveTimer(false)
	} else {
		// Enable timers if we have pending data.
		if shouldScheduleProbe && s.shouldSchedulePTO() {
			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
			s.schedulePTO()
		} else if !s.resendTimer.enabled() {
			s.probeTimer.disable()
			if s.Outstanding > 0 {
				// Enable the resend timer if it's not enabled yet and there is
				// outstanding data.
				s.resendTimer.enable(s.RTO)
			}
		}
	}
}

// sendData sends new data segments. It is called when data becomes available or
// when the send window opens up.
//
// Starting at writeNext, it sends segments while Outstanding is below SndCwnd,
// stopping at the first segment maybeSendSegment declines to send. Segments
// that already have a sequence number and are SACKed are skipped. It always
// finishes by calling postXmit to update the timers.
// +checklocks:s.ep.mu
func (s *sender) sendData() {
	// limit is the maximum payload size of a single segment: one MSS, or
	// close to the GSO maximum size when GSO is in use.
	limit := s.MaxPayloadSize
	if s.gso {
		limit = int(s.ep.gso.MaxSize - header.TCPTotalHeaderMaximumSize - 1)
	}
	// end is the right edge of the send window.
	end := s.SndUna.Add(s.SndWnd)

	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
	// "A TCP SHOULD set cwnd to no more than RW before beginning
	// transmission if the TCP has not sent data in the interval exceeding
	// the retransmission timeout."
	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
		if s.SndCwnd > InitialCwnd {
			s.SndCwnd = InitialCwnd
		}
	}

	var dataSent bool
	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
		// Shrink limit to the space left in the congestion window, in
		// bytes. Note that limit is never raised again within this call.
		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
		if cwndLimit > 0 && cwndLimit < limit {
			limit = cwndLimit
		}
		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
			// Move writeNext along so that we don't try and scan data that
			// has already been SACKED.
			s.updateWriteNext(seg.Next())
			continue
		}
		if sent := s.maybeSendSegment(seg, limit, end); !sent {
			break
		}
		dataSent = true
		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
		s.updateWriteNext(seg.Next())
	}

	s.postXmit(dataSent, true /* shouldScheduleProbe */)
}

// enterRecovery switches the sender into fast/SACK recovery. It resets the
// spurious recovery detection state, initializes the FastRecovery fields from
// the current SndUna/SndNxt/Ssthresh/Outstanding values, records the
// retransmit timestamp, and sets the state to SACKRecovery when SACK is
// permitted or FastRecovery otherwise, incrementing the matching stack
// counters.
func (s *sender) enterRecovery() {
	// Initialize the variables used to detect spurious recovery after
	// entering recovery.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	s.FastRecovery.Active = true
	// Save state to reflect we're now in fast recovery.
	//
	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
	// We inflate the cwnd by 3 to account for the 3 packets which triggered
	// the 3 duplicate ACKs and are now not in flight.
	s.SndCwnd = s.Ssthresh + 3
	s.SackedOut = 0
	s.DupAckCount = 0
	s.FastRecovery.First = s.SndUna
	s.FastRecovery.Last = s.SndNxt - 1
	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
	s.FastRecovery.HighRxt = s.SndUna
	s.FastRecovery.RescueRxt = s.SndUna

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	//
	// This must run before s.state is updated below: recordRetransmitTS
	// consults inRecovery(), which is based on s.state.
	s.recordRetransmitTS()

	if s.ep.SACKPermitted {
		s.state = tcpip.SACKRecovery
		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
		// Set TLPRxtOut to false according to
		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
		if s.rc.tlpRxtOut {
			// The tail loss probe triggered recovery.
			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
		}
		s.rc.tlpRxtOut = false
		return
	}
	s.state = tcpip.FastRecovery
	s.ep.stack.Stats().TCP.FastRecovery.Increment()
}

// leaveRecovery marks fast recovery as inactive, resets MaxCwnd and the
// duplicate ACK count, sets the congestion window back to Ssthresh and
// notifies the congestion control algorithm via PostRecovery.
//
// It does not modify s.state.
func (s *sender) leaveRecovery() {
	s.FastRecovery.Active = false
	s.FastRecovery.MaxCwnd = 0
	s.DupAckCount = 0

	// Deflate cwnd. It had been artificially inflated when new dups arrived.
	s.SndCwnd = s.Ssthresh
	s.cc.PostRecovery()
}

// isAssignedSequenceNumber relies on the fact that we only set flags once a
// sequence number is assigned and that is only done right before we send the
// segment. As a result any segment that has a non-zero flag has a valid
// sequence number assigned to it.
func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
	return seg.flags != 0
}

// SetPipe implements the SetPipe() function described in RFC6675. Netstack
// maintains the congestion window in number of packets and not bytes, so
// SetPipe() here measures number of outstanding packets rather than actual
// outstanding bytes in the network.
//
// The result is stored in s.Outstanding. SetPipe is a no-op unless SACK is
// permitted and fast recovery is active.
func (s *sender) SetPipe() {
	// If SACK isn't permitted or it is permitted but recovery is not active
	// then ignore pipe calculations.
	if !s.ep.SACKPermitted || !s.FastRecovery.Active {
		return
	}
	pipe := 0
	smss := seqnum.Size(s.ep.scoreboard.SMSS())
	// Only segments that carry payload and have already been assigned a
	// sequence number are considered; the walk stops at the first segment
	// that does not qualify.
	for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
		// With GSO each segment can be much larger than SMSS. So check the segment
		// in SMSS sized ranges.
		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize()))
		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
			endSeq := startSeq.Add(smss)
			if segEnd.LessThan(endSeq) {
				endSeq = segEnd
			}
			sb := header.SACKBlock{Start: startSeq, End: endSeq}
			// SetPipe():
			//
			// After initializing pipe to zero, the following steps are
			// taken for each octet 'S1' in the sequence space between
			// HighACK and HighData that has not been SACKed:
			//
			// This break only leaves the inner (per-SMSS range) loop.
			if !s1.sequenceNumber.LessThan(s.SndNxt) {
				break
			}
			if s.ep.scoreboard.IsSACKED(sb) {
				continue
			}

			// SetPipe():
			//
			// (a) If IsLost(S1) returns false, Pipe is incremented by 1.
			//
			// NOTE: here we mark the whole segment as lost. We do not try
			// and test every byte in our write buffer as we maintain our
			// pipe in terms of outstanding packets and not bytes.
			if !s.ep.scoreboard.IsRangeLost(sb) {
				pipe++
			}
			// SetPipe():
			// (b) If S1 <= HighRxt, Pipe is incremented by 1.
			if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) {
				pipe++
			}
		}
	}
	s.Outstanding = pipe
}

// shouldEnterRecovery returns true if the sender should enter fast recovery
// based on dupAck count and sack scoreboard.
// See RFC 6675 section 5.
//
// The scoreboard check is only used when SACK is permitted and RACK loss
// detection is not enabled.
func (s *sender) shouldEnterRecovery() bool {
	return s.DupAckCount >= nDupAckThreshold ||
		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
}

// detectLoss is called when an ack is received and returns whether a loss is
// detected. It manages the state related to duplicate acks and determines if
// a retransmit is needed according to the rules in RFC 6582 (NewReno).
//
// When it returns true the congestion control algorithm has been notified via
// HandleLossDetected and the sender has entered recovery.
func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
	// We're not in fast recovery yet.

	// If RACK is enabled and there is no reordering we should honor the
	// three duplicate ACK rule to enter recovery.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
	//
	// Conversely, once RACK has observed reordering the duplicate ACK rule
	// is skipped entirely.
	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		if s.rc.Reord {
			return false
		}
	}

	if !s.isDupAck(seg) {
		s.DupAckCount = 0
		return false
	}

	s.DupAckCount++

	// Do not enter fast recovery until we reach nDupAckThreshold or the
	// first unacknowledged byte is considered lost as per SACK scoreboard.
	if !s.shouldEnterRecovery() {
		// RFC 6675 Step 3.
		s.FastRecovery.HighRxt = s.SndUna - 1
		// Do run SetPipe() to calculate the outstanding segments.
		s.SetPipe()
		s.state = tcpip.Disorder
		return false
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
	//
	// We only do the check here, the incrementing of last to the highest
	// sequence number transmitted till now is done when enterRecovery
	// is invoked.
	//
	// Note that we only enter recovery when at least one more byte of data
	// beyond s.FastRecovery.Last (the highest byte that was outstanding
	// when fast retransmit was last entered) is acked.
	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
		s.DupAckCount = 0
		return false
	}
	s.cc.HandleLossDetected()
	s.enterRecovery()
	return true
}

// isDupAck determines if seg is a duplicate ack as defined in
// https://tools.ietf.org/html/rfc5681#section-2.
//
// When SACK is permitted, an ACK that carries no new SACK information is
// never treated as a duplicate.
func (s *sender) isDupAck(seg *segment) bool {
	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
	// can leverage the SACK information to determine when an incoming ACK is a
	// "duplicate" (e.g., if the ACK contains previously unknown SACK
	// information).
	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
		return false
	}

	// (a) The receiver of the ACK has outstanding data.
	return s.SndUna != s.SndNxt &&
		// (b) The incoming acknowledgment carries no data.
		seg.logicalLen() == 0 &&
		// (c) The SYN and FIN bits are both off.
		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
		// (d) the ACK number is equal to the greatest acknowledgment received on
		// the given connection (TCP.UNA from RFC793).
		seg.ackNumber == s.SndUna &&
		// (e) the advertised window in the incoming acknowledgment equals the
		// advertised window in the last incoming acknowledgment.
		s.SndWnd == seg.window
}

// walkSACK iterates the writeList and updates RACK for each segment which is
// newly acked either cumulatively or selectively. It loops through the
// segments which are sacked, updates the RACK related variables and checks
// for reordering.
// Returns true when the DSACK block has been detected in the received ACK.
//
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
// steps 2 and 3.
func (s *sender) walkSACK(rcvdSeg *segment) bool {
	s.rc.setDSACKSeen(false)

	// Look for DSACK block.
	hasDSACK := false
	idx := 0
	n := len(rcvdSeg.parsedOptions.SACKBlocks)
	if checkDSACK(rcvdSeg) {
		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
		// numDSACK can be zero when DSACK is sent for subsegments.
		if numDSACK < 1 {
			numDSACK = 1
		}
		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
		s.rc.setDSACKSeen(true)
		// Skip the DSACK block when processing the remaining SACK blocks.
		idx = 1
		n--
		hasDSACK = true
	}

	if n == 0 {
		return hasDSACK
	}

	// Sort the SACK blocks. The first block is the most recent unacked
	// block. The following blocks can be in arbitrary order.
	//
	// NOTE(review): this comparator orders the blocks by descending Start,
	// while the loop below walks writeList forward from Front and never
	// rewinds seg. Confirm that blocks with a lower Start are still matched
	// as intended when more than one block is present.
	sackBlocks := make([]header.SACKBlock, n)
	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
	sort.Slice(sackBlocks, func(i, j int) bool {
		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
	})

	seg := s.writeList.Front()
	for _, sb := range sackBlocks {
		// Only segments that have been transmitted at least once
		// (xmitCount != 0) and start before the end of the block are
		// examined.
		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
				seg.acked = true
				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
			}
			seg = seg.Next()
		}
	}
	return hasDSACK
}

// checkDSACK checks if a DSACK is reported.
//
// Only the first SACK block of rcvdSeg can be a DSACK block; the second block
// (if present) is consulted to see whether it fully covers the first.
func checkDSACK(rcvdSeg *segment) bool {
	n := len(rcvdSeg.parsedOptions.SACKBlocks)
	if n == 0 {
		return false
	}

	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
	// Check if SACK block is invalid.
	if sb.End.LessThan(sb.Start) {
		return false
	}

	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
	// at most one SACK block. DSACK is detected in the below two cases:
	// * If the SACK sequence space is less than this cumulative ACK, it is
	//   an indication that the segment identified by the SACK block has
	//   been received more than once by the receiver.
	// * If the sequence space in the first SACK block is greater than the
	//   cumulative ACK, then the sender next compares the sequence space
	//   in the first SACK block with the sequence space in the second SACK
	//   block, if there is one. This comparison can determine if the first
	//   SACK block is reporting duplicate data that lies above the
	//   cumulative ACK.
	if sb.Start.LessThan(rcvdSeg.ackNumber) {
		return true
	}

	if n > 1 {
		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
		if sb1.End.LessThan(sb1.Start) {
			return false
		}

		// If the first SACK block is fully covered by second SACK
		// block, then the first block is a DSACK block.
		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
			return true
		}
	}

	return false
}

// recordRetransmitTS stores the current timestamp value (s.ep.tsValNow()) in
// s.retransmitTS for use by the Eifel detection algorithm (RFC 3522). It does
// nothing when the sender is already in a recovery state.
func (s *sender) recordRetransmitTS() {
	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
	//
	// The Eifel detection algorithm is used, only upon initiation of loss
	// recovery, i.e., when either the timeout-based retransmit or the fast
	// retransmit is sent. The Eifel detection algorithm MUST NOT be
	// reinitiated after loss recovery has already started. In particular,
	// it must not be reinitiated upon subsequent timeouts for the same
	// segment, and not upon retransmitting segments other than the oldest
	// outstanding segment, e.g., during selective loss recovery.
	if s.inRecovery() {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	//
	// Set a "RetransmitTS" variable to the value of the Timestamp Value
	// field of the Timestamps option included in the retransmit sent when
	// loss recovery is initiated. A TCP sender must ensure that
	// RetransmitTS does not get overwritten as loss recovery progresses,
	// e.g., in case of a second timeout and subsequent second retransmit of
	// the same octet.
	s.retransmitTS = s.ep.tsValNow()
}

// detectSpuriousRecovery applies the checks of the Eifel detection algorithm
// (RFC 3522 section 3.2, steps 4-6) and, if they pass, marks the current
// recovery as spurious and increments the corresponding stack counters.
//
// hasDSACK reports whether the ACK being processed carried a DSACK block and
// tsEchoReply is the Timestamp Echo Reply value of that ACK.
func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
	// Return if the sender has already detected spurious recovery.
	if s.spuriousRecovery {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
	//
	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
	// Timestamps option is smaller than the value of RetransmitTS, then
	// proceed to next step, else return.
	if tsEchoReply >= s.retransmitTS {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
	if hasDSACK {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If during the lifetime of the TCP connection the TCP sender has
	// previously received an ACK with a DSACK option, or the acceptable ACK
	// does not acknowledge all outstanding data, then proceed to next step,
	// else return.
	//
	// NOTE(review): the counter read here hangs off s.ep.stack rather than
	// the endpoint, so it presumably aggregates DSACKs across connections
	// rather than just this one; confirm this is the intended scope.
	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
	if numDSACK == 0 && s.SndUna == s.SndNxt {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
	//
	// If the loss recovery has been initiated with a timeout-based
	// retransmit, then set
	//    SpuriousRecovery <- SPUR_TO (equal 1),
	// else set
	//    SpuriousRecovery <- dupacks+1
	// Set the spurious recovery variable to true as we do not differentiate
	// between fast, SACK or RTO recovery.
	s.spuriousRecovery = true
	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()

	// RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and
	// timeout). Increment the metric for RTO only as we want to track the
	// number of timeout recoveries.
	if s.state == tcpip.RTORecovery {
		s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment()
	}
}

// inRecovery reports whether the sender is in RTORecovery, FastRecovery or
// SACKRecovery state.
func (s *sender) inRecovery() bool {
	// The decision is based solely on s.state; FastRecovery.Active is not
	// consulted here.
	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
		return true
	}
	return false
}

// handleRcvdSegment is called when a segment is received; it is responsible for
// updating the send-related state.
//
// In order, it: samples the RTT, updates the recent timestamp, feeds SACK
// blocks into the scoreboard (and RACK, when enabled), enters or leaves
// recovery, records the advertised window, handles zero window probe ACKs,
// removes newly acknowledged data from the write list, updates the congestion
// window and timers, and finally runs loss recovery and/or sendData.
//
// It returns early, without calling sendData, when the segment is accounted
// for as the ACK of a zero window probe, and when non-RACK recovery runs with
// SACK permitted.
// +checklocks:s.ep.mu
// +checklocksalias:s.rc.snd.ep.mu=s.ep.mu
func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
	// Check if we can extract an RTT measurement from this ack.
	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
		s.RTTMeasureSeqNum = s.SndNxt
	}

	// Update Timestamp if required. See RFC7323, section-4.3.
	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
	}

	// Insert SACKBlock information into our scoreboard.
	hasDSACK := false
	if s.ep.SACKPermitted {
		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
			// Only insert the SACK block if the following holds
			// true:
			//  * SACK block acks data after the ack number in the
			//    current segment.
			//  * SACK block represents a sequence
			//    between sndUna and sndNxt (i.e. data that is
			//    currently unacked and in-flight).
			//  * SACK block that has not been SACKed already.
			//
			// NOTE: This check specifically excludes DSACK blocks
			// which have start/end before sndUna and are used to
			// indicate spurious retransmissions.
			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
				s.ep.scoreboard.Insert(sb)
				rcvdSeg.hasNewSACKInfo = true
			}
		}

		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
		// section-7.2
		// * Step 2: Update RACK stats.
		//   If the ACK is not ignored as invalid, update the RACK.rtt
		//   to be the RTT sample calculated using this ACK, and
		//   continue.  If this ACK or SACK was for the most recently
		//   sent packet, then record the RACK.xmit_ts timestamp and
		//   RACK.end_seq sequence implied by this ACK.
		// * Step 3: Detect packet reordering.
		//   If the ACK selectively or cumulatively acknowledges an
		//   unacknowledged and also never retransmitted sequence below
		//   RACK.fack, then the corresponding packet has been
		//   reordered and RACK.reord is set to TRUE.
		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
			hasDSACK = s.walkSACK(rcvdSeg)
		}
		s.SetPipe()
	}

	ack := rcvdSeg.ackNumber
	fastRetransmit := false
	// Do not leave fast recovery, if the ACK is out of range.
	if s.FastRecovery.Active {
		// Leave fast recovery if it acknowledges all the data covered by
		// this fast recovery session.
		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
			s.leaveRecovery()
		}
	} else {
		// Detect loss by counting the duplicates and enter recovery.
		fastRetransmit = s.detectLoss(rcvdSeg)
	}

	// See if TLP based recovery was successful.
	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		s.detectTLPRecovery(ack, rcvdSeg)
	}

	// Stash away the current window size.
	s.SndWnd = rcvdSeg.window

	// Disable zero window probing if remote advertises a non-zero receive
	// window. This can be with an ACK to the zero window probe (where the
	// acknumber refers to the already acknowledged byte) OR to any previously
	// unacknowledged segment.
	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
		s.disableZeroWindowProbing()
	}

	// On receiving the ACK for the zero window probe, account for it and
	// skip trying to send any segment as we are still probing for
	// receive window to become non-zero.
	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
		s.unackZeroWindowProbes--
		return
	}

	// Ignore ack if it doesn't acknowledge any new data.
	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
		s.DupAckCount = 0

		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
		// Specifically we should only update the RTO using TSEcr if the
		// following condition holds:
		//
		//    A TSecr value received in a segment is used to update the
		//    averaged RTT measurement only if the segment acknowledges
		//    some new data, i.e., only if it advances the left edge of
		//    the send window.
		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
			s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr))
		}

		if s.shouldSchedulePTO() {
			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
			s.schedulePTO()
		} else {
			// When an ack is received we must rearm the timer.
			// RFC 6298 5.3
			s.probeTimer.disable()
			s.resendTimer.enable(s.RTO)
		}

		// Remove all acknowledged data from the write list.
		acked := s.SndUna.Size(ack)
		s.SndUna = ack
		ackLeft := acked
		// Remember Outstanding so that the number of newly acked packets
		// can be handed to the congestion control algorithm below.
		originalOutstanding := s.Outstanding
		for ackLeft > 0 {
			// We use logicalLen here because we can have FIN
			// segments (which are always at the end of list) that
			// have no data, but do consume a sequence number.
			seg := s.writeList.Front()
			datalen := seg.logicalLen()

			if datalen > ackLeft {
				// The segment is only partially acknowledged: trim
				// the acked prefix and adjust the packet count.
				prevCount := s.pCount(seg, s.MaxPayloadSize)
				seg.TrimFront(ackLeft)
				seg.sequenceNumber.UpdateForward(ackLeft)
				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
				break
			}

			if s.writeNext == seg {
				s.updateWriteNext(seg.Next())
			}

			// Update the RACK fields if SACK is enabled.
			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
			}

			s.writeList.Remove(seg)

			// If SACK is enabled then only reduce outstanding if
			// the segment was not previously SACKED as these have
			// already been accounted for in SetPipe().
			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
			} else {
				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
			}
			seg.DecRef()
			ackLeft -= datalen
		}

		// Clear SACK information for all acked data.
		s.ep.scoreboard.Delete(s.SndUna)

		// Detect if the sender entered recovery spuriously.
		if s.inRecovery() {
			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
		}

		// If we are not in fast recovery then update the congestion
		// window based on the number of acknowledged packets.
		if !s.FastRecovery.Active {
			s.cc.Update(originalOutstanding - s.Outstanding)
			if s.FastRecovery.Last.LessThan(s.SndUna) {
				s.state = tcpip.Open
				// Update RACK when we are exiting fast or RTO
				// recovery as described in the RFC
				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
					s.rc.exitRecovery()
				}
				s.reorderTimer.disable()
			}
		}

		// Update the send buffer usage and notify potential waiters.
		s.ep.updateSndBufferUsage(int(acked))

		// It is possible for s.outstanding to drop below zero if we get
		// a retransmit timeout, reset outstanding to zero but later
		// get an ack that cover previously sent data.
		if s.Outstanding < 0 {
			s.Outstanding = 0
		}

		s.SetPipe()

		// If all outstanding data was acknowledged then disable the
		// timers.
		// RFC 6298 Rule 5.3
		if s.SndUna == s.SndNxt {
			s.Outstanding = 0
			// Reset firstRetransmittedSegXmitTime to the zero value.
			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
			s.resendTimer.disable()
			s.probeTimer.disable()
		}
	}

	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		// Update RACK reorder window.
		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
		// * Upon receiving an ACK:
		// * Step 4: Update RACK reordering window
		s.rc.updateRACKReorderWindow()

		// After the reorder window is calculated, detect any loss by checking
		// if the time elapsed after the segments are sent is greater than the
		// reorder window.
		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
			// If any segment is marked as lost by
			// RACK, enter recovery and retransmit
			// the lost segments.
			s.cc.HandleLossDetected()
			s.enterRecovery()
			fastRetransmit = true
		}

		if s.FastRecovery.Active {
			s.rc.DoRecovery(nil, fastRetransmit)
		}
	}

	// Now that we've popped all acknowledged data from the retransmit
	// queue, retransmit if needed.
	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
		// When SACK is enabled data sending is governed by steps in
		// RFC 6675 Section 5 recovery steps A-C.
		// See: https://tools.ietf.org/html/rfc6675#section-5.
		if s.ep.SACKPermitted {
			return
		}
	}

	// Send more data now that some of the pending data has been ack'd, or
	// that the window opened up, or the congestion window was inflated due
	// to a duplicate ack during fast recovery. This will also re-enable
	// the retransmit timer if needed.
	s.sendData()
}

// sendSegment sends the specified segment.
//
// It stamps the segment with the current transmit time, increments its
// transmit count and clears its lost flag before sending. When the segment
// has been transmitted before, the retransmit counters are incremented. The
// error from sendSegmentFromPacketBuffer is returned to the caller.
// +checklocks:s.ep.mu
func (s *sender) sendSegment(seg *segment) tcpip.Error {
	if seg.xmitCount > 0 {
		s.ep.stack.Stats().TCP.Retransmits.Increment()
		s.ep.stats.SendErrors.Retransmits.Increment()
		if s.SndCwnd < s.Ssthresh {
			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
		}
	}
	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
	seg.xmitCount++
	seg.lost = false

	err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber)

	// Every time a packet containing data is sent (including a
	// retransmission), if SACK is enabled and we are retransmitting data
	// then use the conservative timer described in RFC6675 Section 6.0,
	// otherwise follow the standard time described in RFC6298 Section 5.1.
	//
	// NOTE(review): as written, the timer is only touched here when the
	// send returned a non-nil error, which does not match the description
	// above. Confirm whether `err != nil` is intentional before changing
	// it; postXmit also arms the resend timer after transmissions.
	if err != nil && seg.payloadSize() != 0 {
		if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted {
			s.resendTimer.enable(s.RTO)
		} else {
			if !s.resendTimer.enabled() {
				s.resendTimer.enable(s.RTO)
			}
		}
	}

	return err
}

// sendSegmentFromPacketBuffer sends a new segment containing the given payload,
// flags and sequence number.
//
// It records LastSendTime, starts the RTT measurement clock when seq equals
// RTTMeasureSeqNum, and records the acknowledgement number being sent in
// MaxSentAck. The caller keeps its reference on pkt; a clone is what gets
// handed to sendRaw.
// +checklocks:s.ep.mu
// +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
func (s *sender) sendSegmentFromPacketBuffer(pkt *stack.PacketBuffer, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
	if seq == s.RTTMeasureSeqNum {
		s.RTTMeasureTime = s.LastSendTime
	}

	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()

	// Remember the max sent ack.
	s.MaxSentAck = rcvNxt

	// We need to clone the packet because sendRaw takes ownership of pkt,
	// and pkt could be reprocessed later on (i.e. retransmission).
	pkt = pkt.Clone()
	// This releases the reference on the clone, not on the caller's packet.
	defer pkt.DecRef()

	return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd)
}

// sendEmptySegment sends a new empty segment, flags and sequence number.
//
// It performs the same bookkeeping as sendSegmentFromPacketBuffer
// (LastSendTime, RTTMeasureTime, MaxSentAck) but sends via sendEmptyRaw, so
// no packet buffer is involved.
// +checklocks:s.ep.mu
// +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
	if seq == s.RTTMeasureSeqNum {
		s.RTTMeasureTime = s.LastSendTime
	}

	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()

	// Remember the max sent ack.
	s.MaxSentAck = rcvNxt

	return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd)
}

// maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
// currently.
//
// Segments that carry payload always trigger an ACK; the rate limit
// (s.ep.allowOutOfWindowAck) is only consulted for segments without payload.
// +checklocks:s.ep.mu
func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
	// Data packets are unlikely to be part of an ACK loop. So always send
	// an ACK for a packet w/ data.
	if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
		s.sendAck()
	}
}

// updateWriteNext makes seg the next segment to be written. It drops the
// reference held on the previous writeNext (if any) and takes a reference on
// seg (if non-nil). seg may be nil to indicate there is nothing left to
// write.
func (s *sender) updateWriteNext(seg *segment) {
	if s.writeNext != nil {
		s.writeNext.DecRef()
	}
	if seg != nil {
		seg.IncRef()
	}
	s.writeNext = seg
}

// corkTimerExpired drains all the segments when TCP_CORK is enabled.
//
// It assigns the next sequence number and the ACK|PSH flags to the segment at
// writeNext and then calls sendData. It always returns nil.
// +checklocks:s.ep.mu
func (s *sender) corkTimerExpired() tcpip.Error {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if s.corkTimer.isUninitialized() || !s.corkTimer.checkExpiration() {
		return nil
	}

	// Assign sequence number and flags to the segment.
	seg := s.writeNext
	if seg == nil {
		// Nothing is waiting to be written.
		return nil
	}
	seg.sequenceNumber = s.SndNxt
	seg.flags = header.TCPFlagAck | header.TCPFlagPsh
	// Drain all the segments.
	s.sendData()
	return nil
}