// Source: github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/transport/tcp/snd.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"fmt"
	"math"
	"sort"
	"time"

	"github.com/metacubex/gvisor/pkg/sync"
	"github.com/metacubex/gvisor/pkg/tcpip"
	"github.com/metacubex/gvisor/pkg/tcpip/header"
	"github.com/metacubex/gvisor/pkg/tcpip/seqnum"
	"github.com/metacubex/gvisor/pkg/tcpip/stack"
)

const (
	// MinRTO is the minimum allowed value for the retransmit timeout.
	MinRTO = 200 * time.Millisecond

	// MaxRTO is the maximum allowed value for the retransmit timeout.
	MaxRTO = 120 * time.Second

	// MinSRTT is the minimum allowed value for smoothed RTT.
	MinSRTT = 1 * time.Millisecond

	// InitialCwnd is the initial congestion window, counted in packets.
	InitialCwnd = 10

	// nDupAckThreshold is the number of duplicate ACKs required
	// before fast-retransmit is entered.
	nDupAckThreshold = 3

	// MaxRetries is the maximum number of probe retries the sender does
	// before timing out the connection.
	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
	MaxRetries = 15
)

// congestionControl is an interface that must be implemented by any supported
// congestion control algorithm.
type congestionControl interface {
	// HandleLossDetected is invoked when the loss is detected by RACK or
	// sender.dupAckCount >= nDupAckThreshold just before entering fast
	// retransmit.
	HandleLossDetected()

	// HandleRTOExpired is invoked when the retransmit timer expires.
	HandleRTOExpired()

	// Update is invoked when processing inbound acks. It's passed the
	// number of packets that were acked by the most recent cumulative
	// acknowledgement.
	Update(packetsAcked int)

	// PostRecovery is invoked when the sender is exiting a fast retransmit/
	// recovery phase. This provides congestion control algorithms a way
	// to adjust their state when exiting recovery.
	PostRecovery()
}

// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted. The cumulative or selective ACK is passed along
	// with the flag which identifies whether the connection entered fast
	// retransmit with this ACK and to retransmit the first unacknowledged
	// segment.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}

// sender holds the state necessary to send TCP segments.
//
// +stateify savable
type sender struct {
	stack.TCPSenderState

	// ep is the endpoint this sender transmits for.
	ep *Endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeNext is the segment sendData starts transmitting from; it is
	// nil when there is nothing left to send.
	writeNext *segment

	// writeList is the list of queued segments. Its front is the first
	// unacknowledged segment (see resendSegment).
	writeList segmentList

	// resendTimer drives retransmitTimerExpired. It is also reused to
	// pace zero window probes (see enableZeroWindowProbing).
	resendTimer timer `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer `state:"nosave"`

	// probeTimer is used to schedule PTO for RACK TLP algorithm.
	probeTimer timer `state:"nosave"`

	// spuriousRecovery indicates whether the sender entered recovery
	// spuriously as described in RFC3522 Section 3.2.
	spuriousRecovery bool

	// retransmitTS is the timestamp at which the sender sends retransmitted
	// segment after entering an RTO for the first time as described in
	// RFC3522 Section 3.2.
	retransmitTS uint32

	// startCork is set while segments are being held back because the
	// TCP_CORK option is enabled (see maybeSendSegment).
	startCork bool

	// corkTimer is used to drain the segments which are held when TCP_CORK
	// option is enabled.
	corkTimer timer `state:"nosave"`
}

// rtt is a synchronization wrapper used to appease stateify. See the comment
// in sender, where it is used.
165 // 166 // +stateify savable 167 type rtt struct { 168 sync.Mutex `state:"nosave"` 169 170 stack.TCPRTTState 171 } 172 173 // +checklocks:ep.mu 174 func newSender(ep *Endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { 175 // The sender MUST reduce the TCP data length to account for any IP or 176 // TCP options that it is including in the packets that it sends. 177 // See: https://tools.ietf.org/html/rfc6691#section-2 178 maxPayloadSize := int(mss) - ep.maxOptionSize() 179 180 s := &sender{ 181 ep: ep, 182 TCPSenderState: stack.TCPSenderState{ 183 SndWnd: sndWnd, 184 SndUna: iss + 1, 185 SndNxt: iss + 1, 186 RTTMeasureSeqNum: iss + 1, 187 LastSendTime: ep.stack.Clock().NowMonotonic(), 188 MaxPayloadSize: maxPayloadSize, 189 MaxSentAck: irs + 1, 190 FastRecovery: stack.TCPFastRecoveryState{ 191 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. 192 Last: iss, 193 HighRxt: iss, 194 RescueRxt: iss, 195 }, 196 RTO: 1 * time.Second, 197 }, 198 gso: ep.gso.Type != stack.GSONone, 199 } 200 201 if s.gso { 202 s.ep.gso.MSS = uint16(maxPayloadSize) 203 } 204 205 s.cc = s.initCongestionControl(ep.cc) 206 s.lr = s.initLossRecovery() 207 s.rc.init(s, iss) 208 209 // A negative sndWndScale means that no scaling is in use, otherwise we 210 // store the scaling value. 211 if sndWndScale > 0 { 212 s.SndWndScale = uint8(sndWndScale) 213 } 214 215 s.resendTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.retransmitTimerExpired)) 216 s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired)) 217 s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired)) 218 s.corkTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.corkTimerExpired)) 219 220 s.ep.AssertLockHeld(ep) 221 s.updateMaxPayloadSize(int(ep.route.MTU()), 0) 222 // Initialize SACK Scoreboard after updating max payload size as we use 223 // the maxPayloadSize as the smss when determining if a segment is lost 224 // etc. 
225 s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss) 226 227 // Get Stack wide config. 228 var minRTO tcpip.TCPMinRTOOption 229 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { 230 panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) 231 } 232 s.minRTO = time.Duration(minRTO) 233 234 var maxRTO tcpip.TCPMaxRTOOption 235 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { 236 panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) 237 } 238 s.maxRTO = time.Duration(maxRTO) 239 240 var maxRetries tcpip.TCPMaxRetriesOption 241 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { 242 panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) 243 } 244 s.maxRetries = uint32(maxRetries) 245 246 return s 247 } 248 249 // initCongestionControl initializes the specified congestion control module and 250 // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to 251 // their initial values. 252 func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { 253 s.SndCwnd = InitialCwnd 254 // Set sndSsthresh to the maximum int value, which depends on the 255 // platform. 256 s.Ssthresh = int(^uint(0) >> 1) 257 258 switch congestionControlName { 259 case ccCubic: 260 return newCubicCC(s) 261 case ccReno: 262 fallthrough 263 default: 264 return newRenoCC(s) 265 } 266 } 267 268 // initLossRecovery initiates the loss recovery algorithm for the sender. 269 func (s *sender) initLossRecovery() lossRecovery { 270 if s.ep.SACKPermitted { 271 return newSACKRecovery(s) 272 } 273 return newRenoRecovery(s) 274 } 275 276 // updateMaxPayloadSize updates the maximum payload size based on the given 277 // MTU. 
If this is in response to "packet too big" control packets (indicated 278 // by the count argument), it also reduces the number of outstanding packets and 279 // attempts to retransmit the first packet above the MTU size. 280 // +checklocks:s.ep.mu 281 func (s *sender) updateMaxPayloadSize(mtu, count int) { 282 m := mtu - header.TCPMinimumSize 283 284 m -= s.ep.maxOptionSize() 285 286 // We don't adjust up for now. 287 if m >= s.MaxPayloadSize { 288 return 289 } 290 291 // Make sure we can transmit at least one byte. 292 if m <= 0 { 293 m = 1 294 } 295 296 oldMSS := s.MaxPayloadSize 297 s.MaxPayloadSize = m 298 if s.gso { 299 s.ep.gso.MSS = uint16(m) 300 } 301 302 if count == 0 { 303 // updateMaxPayloadSize is also called when the sender is created. 304 // and there is no data to send in such cases. Return immediately. 305 return 306 } 307 308 // Update the scoreboard's smss to reflect the new lowered 309 // maxPayloadSize. 310 s.ep.scoreboard.smss = uint16(m) 311 312 s.Outstanding -= count 313 if s.Outstanding < 0 { 314 s.Outstanding = 0 315 } 316 317 // Rewind writeNext to the first segment exceeding the MTU. Do nothing 318 // if it is already before such a packet. 319 nextSeg := s.writeNext 320 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 321 if seg == s.writeNext { 322 // We got to writeNext before we could find a segment 323 // exceeding the MTU. 324 break 325 } 326 327 if nextSeg == s.writeNext && seg.payloadSize() > m { 328 // We found a segment exceeding the MTU. Rewind 329 // writeNext and try to retransmit it. 330 nextSeg = seg 331 } 332 333 if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 334 // Update sackedOut for new maximum payload size. 335 s.SackedOut -= s.pCount(seg, oldMSS) 336 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 337 } 338 } 339 340 // Since we likely reduced the number of outstanding packets, we may be 341 // ready to send some more. 
342 s.updateWriteNext(nextSeg) 343 s.sendData() 344 } 345 346 // sendAck sends an ACK segment. 347 // +checklocks:s.ep.mu 348 func (s *sender) sendAck() { 349 s.sendEmptySegment(header.TCPFlagAck, s.SndNxt) 350 } 351 352 // updateRTO updates the retransmit timeout when a new roud-trip time is 353 // available. This is done in accordance with section 2 of RFC 6298. 354 func (s *sender) updateRTO(rtt time.Duration) { 355 s.rtt.Lock() 356 if !s.rtt.TCPRTTState.SRTTInited { 357 s.rtt.TCPRTTState.RTTVar = rtt / 2 358 s.rtt.TCPRTTState.SRTT = rtt 359 s.rtt.TCPRTTState.SRTTInited = true 360 } else { 361 diff := s.rtt.TCPRTTState.SRTT - rtt 362 if diff < 0 { 363 diff = -diff 364 } 365 // Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when 366 // no timestamps are available. 367 if !s.ep.SendTSOk { 368 s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4 369 s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8 370 } else { 371 // When we are taking RTT measurements of every ACK then 372 // we need to use a modified method as specified in 373 // https://tools.ietf.org/html/rfc7323#appendix-G 374 if s.Outstanding == 0 { 375 s.rtt.Unlock() 376 return 377 } 378 // Netstack measures congestion window/inflight all in 379 // terms of packets and not bytes. This is similar to 380 // how linux also does cwnd and inflight. In practice 381 // this approximation works as expected. 382 expectedSamples := math.Ceil(float64(s.Outstanding) / 2) 383 384 // alpha & beta values are the original values as recommended in 385 // https://tools.ietf.org/html/rfc6298#section-2.3. 
386 const alpha = 0.125 387 const beta = 0.25 388 389 alphaPrime := alpha / expectedSamples 390 betaPrime := beta / expectedSamples 391 rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds() 392 srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds() 393 s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second)) 394 s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second)) 395 } 396 } 397 398 if s.rtt.TCPRTTState.SRTT < MinSRTT { 399 s.rtt.TCPRTTState.SRTT = MinSRTT 400 } 401 402 s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar 403 s.rtt.Unlock() 404 if s.RTO < s.minRTO { 405 s.RTO = s.minRTO 406 } 407 if s.RTO > s.maxRTO { 408 s.RTO = s.maxRTO 409 } 410 } 411 412 // resendSegment resends the first unacknowledged segment. 413 // +checklocks:s.ep.mu 414 func (s *sender) resendSegment() { 415 // Don't use any segments we already sent to measure RTT as they may 416 // have been affected by packets being lost. 417 s.RTTMeasureSeqNum = s.SndNxt 418 419 // Resend the segment. 420 if seg := s.writeList.Front(); seg != nil { 421 if seg.payloadSize() > s.MaxPayloadSize { 422 s.splitSeg(seg, s.MaxPayloadSize) 423 } 424 425 // See: RFC 6675 section 5 Step 4.3 426 // 427 // To prevent retransmission, set both the HighRXT and RescueRXT 428 // to the highest sequence number in the retransmitted segment. 429 s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 430 s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 431 s.sendSegment(seg) 432 s.ep.stack.Stats().TCP.FastRetransmit.Increment() 433 s.ep.stats.SendErrors.FastRetransmit.Increment() 434 435 // Run SetPipe() as per RFC 6675 section 5 Step 4.4 436 s.SetPipe() 437 } 438 } 439 440 // retransmitTimerExpired is called when the retransmit timer expires, and 441 // unacknowledged segments are assumed lost, and thus need to be resent. 
// It returns nil while the connection is still usable, or *tcpip.ErrTimeout
// once the connection is deemed lost.
// +checklocks:s.ep.mu
func (s *sender) retransmitTimerExpired() tcpip.Error {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if s.resendTimer.isUninitialized() || !s.resendTimer.checkExpiration() {
		return nil
	}

	// Initialize the variables used to detect spurious recovery after
	// entering RTO.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
	// when writeList is empty. Remove this once we have a proper fix for this
	// issue.
	if s.writeList.Front() == nil {
		return nil
	}

	s.ep.stack.Stats().TCP.Timeouts.Increment()
	s.ep.stats.SendErrors.Timeouts.Increment()

	// Set TLPRxtOut to false according to
	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
	s.rc.tlpRxtOut = false

	// Give up if a user timeout is set and it has been exceeded since the
	// first retransmission, or if too many zero window probes have gone
	// unacknowledged (both checked below).
	uto := s.ep.userTimeout

	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		// We store the original xmitTime of the segment that we are
		// about to retransmit as the retransmission time. This is
		// required as by the time the retransmitTimer has expired the
		// segment has already been sent and unacked for the RTO at the
		// time the segment was sent.
		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
	}

	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
	// Without a user timeout the budget is simply maxRTO, which only serves
	// as an upper bound for the new RTO further down.
	remaining := s.maxRTO
	if uto != 0 {
		// Cap to the user specified timeout if one is specified.
		remaining = uto - elapsed
	}

	// Always honor the user-timeout irrespective of whether the zero
	// window probes were acknowledged.
	// net/ipv4/tcp_timer.c::tcp_probe_timer()
	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	// Set new timeout. The timer will be restarted by the call to sendData
	// below.
	s.RTO *= 2
	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
	if s.RTO > s.maxRTO {
		s.RTO = s.maxRTO
	}

	// Cap RTO to remaining time.
	if s.RTO > remaining {
		s.RTO = remaining
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
	//
	// Retransmit timeouts:
	// After a retransmit timeout, record the highest sequence number
	// transmitted in the variable recover, and exit the fast recovery
	// procedure if applicable.
	s.FastRecovery.Last = s.SndNxt - 1

	if s.FastRecovery.Active {
		// We were attempting fast recovery but were not successful.
		// Leave the state. We don't need to update ssthresh because it
		// has already been updated when entered fast-recovery.
		s.leaveRecovery()
	}

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	s.recordRetransmitTS()

	s.state = tcpip.RTORecovery
	s.cc.HandleRTOExpired()

	// Mark the next segment to be sent as the first unacknowledged one and
	// start sending again. Set the number of outstanding packets to 0 so
	// that we'll be able to retransmit.
	//
	// We'll keep on transmitting (or retransmitting) as we get acks for
	// the data we transmit.
	s.Outstanding = 0

	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
	//
	// In order to avoid memory deadlocks, the TCP receiver is allowed to
	// discard data that has already been selectively acknowledged. As a
	// result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
	// information gathered from a receiver upon a retransmission timeout
	// (RTO) "since the timeout might indicate that the data receiver has
	// reneged." Additionally, a TCP sender MUST "ignore prior SACK
	// information in determining which data to retransmit."
	//
	// NOTE: We take the stricter interpretation and just expunge all
	// information as we lack more rigorous checks to validate if the SACK
	// information is usable after an RTO.
	s.ep.scoreboard.Reset()
	s.updateWriteNext(s.writeList.Front())

	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
	// zero receive window after retransmission interval and we have data to
	// send.
	if s.zeroWindowProbing {
		s.sendZeroWindowProbe()
		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
		// indefinitely. As long as the receiving TCP continues to send
		// acknowledgments in response to the probe segments, the sending TCP
		// MUST allow the connection to stay open.
		return nil
	}

	seg := s.writeNext
	// RFC 1122 4.2.3.5: Close the connection when the number of
	// retransmissions for this segment is beyond a limit.
	if seg != nil && seg.xmitCount > s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	s.sendData()

	return nil
}

// pCount returns the number of packets in the segment. Due to GSO, a segment
// can be composed of multiple packets.
588 func (s *sender) pCount(seg *segment, maxPayloadSize int) int { 589 size := seg.payloadSize() 590 if size == 0 { 591 return 1 592 } 593 594 return (size-1)/maxPayloadSize + 1 595 } 596 597 // splitSeg splits a given segment at the size specified and inserts the 598 // remainder as a new segment after the current one in the write list. 599 func (s *sender) splitSeg(seg *segment, size int) { 600 if seg.payloadSize() <= size { 601 return 602 } 603 // Split this segment up. 604 nSeg := seg.clone() 605 nSeg.pkt.Data().TrimFront(size) 606 nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) 607 s.writeList.InsertAfter(seg, nSeg) 608 609 // The segment being split does not carry PUSH flag because it is 610 // followed by the newly split segment. 611 // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered 612 // segment (i.e., when there is no more queued data to be sent). 613 // Linux removes PSH flag only when the segment is being split over MSS 614 // and retains it when we are splitting the segment over lack of sender 615 // window space. 616 // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() 617 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() 618 if seg.payloadSize() > s.MaxPayloadSize { 619 seg.flags ^= header.TCPFlagPsh 620 } 621 seg.pkt.Data().CapLength(size) 622 } 623 624 // NextSeg implements the RFC6675 NextSeg() operation. 625 // 626 // NextSeg starts scanning the writeList starting from nextSegHint and returns 627 // the hint to be passed on the next call to NextSeg. This is required to avoid 628 // iterating the write list repeatedly when NextSeg is invoked in a loop during 629 // recovery. The returned hint will be nil if there are no more segments that 630 // can match rules defined by NextSeg operation in RFC6675. 631 // 632 // rescueRtx will be true only if nextSeg is a rescue retransmission as 633 // described by Step 4) of the NextSeg algorithm. 
func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
	// s3 and s4 remember the candidates for rules (3) and (4) found while
	// scanning for a rule (1) match.
	var s3 *segment
	var s4 *segment
	// Step 1.
	for seg := nextSegHint; seg != nil; seg = seg.Next() {
		// Stop iteration if we hit a segment that has never been
		// transmitted (i.e. either it has no assigned sequence number
		// or if it does have one, it's >= the next sequence number
		// to be sent [i.e. >= s.sndNxt]).
		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
			hint = nil
			break
		}
		segSeq := seg.sequenceNumber
		// Never hand back more than SMSS octets: split oversized segments
		// so the remainder is examined on the next iteration.
		if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) {
			s.splitSeg(seg, int(smss))
		}

		// See RFC 6675 Section 4
		//
		//     1. If there exists a smallest unSACKED sequence number
		//     'S2' that meets the following 3 criteria for determining
		//     loss, the sequence range of one segment of up to SMSS
		//     octets starting with S2 MUST be returned.
		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
			// NextSeg():
			//
			//    (1.a) S2 is greater than HighRxt
			//    (1.b) S2 is less than highest octet covered by
			//    any received SACK.
			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
				// NextSeg():
				//     (1.c) IsLost(S2) returns true.
				if s.ep.scoreboard.IsLost(segSeq) {
					return seg, seg.Next(), false
				}

				// NextSeg():
				//
				// (3): If the conditions for rules (1) and (2)
				// fail, but there exists an unSACKed sequence
				// number S3 that meets the criteria for
				// detecting loss given in steps 1.a and 1.b
				// above (specifically excluding (1.c)) then one
				// segment of up to SMSS octets starting with S3
				// SHOULD be returned.
				if s3 == nil {
					s3 = seg
					hint = seg.Next()
				}
			}
			// NextSeg():
			//
			//     (4) If the conditions for (1), (2) and (3) fail,
			//     but there exists outstanding unSACKED data, we
			//     provide the opportunity for a single "rescue"
			//     retransmission per entry into loss recovery. If
			//     HighACK is greater than RescueRxt (or RescueRxt
			//     is undefined), then one segment of up to SMSS
			//     octets that MUST include the highest outstanding
			//     unSACKed sequence number SHOULD be returned, and
			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
			//     be updated.
			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
				// Keep the unSACKed segment with the highest sequence
				// number seen so far as the rescue candidate.
				if s4 != nil {
					if s4.sequenceNumber.LessThan(segSeq) {
						s4 = seg
					}
				} else {
					s4 = seg
				}
			}
		}
	}

	// If we got here then no segment matched step (1).
	// Step (2): "If no sequence number 'S2' per rule (1)
	// exists but there exists available unsent data and the
	// receiver's advertised window allows, the sequence
	// range of one segment of up to SMSS octets of
	// previously unsent data starting with sequence number
	// HighData+1 MUST be returned."
	for seg := s.writeNext; seg != nil; seg = seg.Next() {
		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
			continue
		}
		// We do not split the segment here to <= smss as it has
		// potentially not been assigned a sequence number yet.
		return seg, nil, false
	}

	// Rule (3): an unSACKed segment that is not yet known to be lost.
	if s3 != nil {
		return s3, hint, false
	}

	// Rule (4). Note that rescueRtx is reported as true here even when no
	// rescue candidate was found and s4 is nil.
	return s4, nil, true
}

// maybeSendSegment tries to send the specified segment and either coalesces
// other segments into this one or splits the specified segment based on the
// lower of the specified limit value or the receivers window size specified by
// end.
736 // +checklocks:s.ep.mu 737 func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) { 738 // We abuse the flags field to determine if we have already 739 // assigned a sequence number to this segment. 740 if !s.isAssignedSequenceNumber(seg) { 741 // Merge segments if allowed. 742 if seg.payloadSize() != 0 { 743 available := int(s.SndNxt.Size(end)) 744 if available > limit { 745 available = limit 746 } 747 748 // nextTooBig indicates that the next segment was too 749 // large to entirely fit in the current segment. It 750 // would be possible to split the next segment and merge 751 // the portion that fits, but unexpectedly splitting 752 // segments can have user visible side-effects which can 753 // break applications. For example, RFC 7766 section 8 754 // says that the length and data of a DNS response 755 // should be sent in the same TCP segment to avoid 756 // triggering bugs in poorly written DNS 757 // implementations. 758 var nextTooBig bool 759 for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() { 760 if seg.payloadSize()+nSeg.payloadSize() > available { 761 nextTooBig = true 762 break 763 } 764 seg.merge(nSeg) 765 s.writeList.Remove(nSeg) 766 nSeg.DecRef() 767 } 768 if !nextTooBig && seg.payloadSize() < available { 769 // Segment is not full. 770 if s.Outstanding > 0 && s.ep.ops.GetDelayOption() { 771 // Nagle's algorithm. From Wikipedia: 772 // Nagle's algorithm works by 773 // combining a number of small 774 // outgoing messages and sending them 775 // all at once. Specifically, as long 776 // as there is a sent packet for which 777 // the sender has received no 778 // acknowledgment, the sender should 779 // keep buffering its output until it 780 // has a full packet's worth of 781 // output, thus allowing output to be 782 // sent all at once. 783 return false 784 } 785 // With TCP_CORK, hold back until minimum of the available 786 // send space and MSS. 
787 if s.ep.ops.GetCorkOption() { 788 if seg.payloadSize() < s.MaxPayloadSize { 789 if !s.startCork { 790 s.startCork = true 791 // Enable the timer for 792 // 200ms, after which 793 // the segments are drained. 794 s.corkTimer.enable(MinRTO) 795 } 796 return false 797 } 798 // Disable the TCP_CORK timer. 799 s.startCork = false 800 s.corkTimer.disable() 801 } 802 } 803 } 804 805 // Assign flags. We don't do it above so that we can merge 806 // additional data if Nagle holds the segment. 807 seg.sequenceNumber = s.SndNxt 808 seg.flags = header.TCPFlagAck | header.TCPFlagPsh 809 } 810 811 var segEnd seqnum.Value 812 if seg.payloadSize() == 0 { 813 if s.writeList.Back() != seg { 814 panic("FIN segments must be the final segment in the write list.") 815 } 816 seg.flags = header.TCPFlagAck | header.TCPFlagFin 817 segEnd = seg.sequenceNumber.Add(1) 818 // Update the state to reflect that we have now 819 // queued a FIN. 820 s.ep.updateConnDirectionState(connDirectionStateSndClosed) 821 switch s.ep.EndpointState() { 822 case StateCloseWait: 823 s.ep.setEndpointState(StateLastAck) 824 default: 825 s.ep.setEndpointState(StateFinWait1) 826 } 827 } else { 828 // We're sending a non-FIN segment. 829 if seg.flags&header.TCPFlagFin != 0 { 830 panic("Netstack queues FIN segments without data.") 831 } 832 833 if !seg.sequenceNumber.LessThan(end) { 834 return false 835 } 836 837 available := int(seg.sequenceNumber.Size(end)) 838 if available == 0 { 839 return false 840 } 841 842 // If the whole segment or at least 1MSS sized segment cannot 843 // be accommodated in the receiver advertised window, skip 844 // splitting and sending of the segment. ref: 845 // net/ipv4/tcp_output.c::tcp_snd_wnd_test() 846 // 847 // Linux checks this for all segment transmits not triggered by 848 // a probe timer. On this condition, it defers the segment split 849 // and transmit to a short probe timer. 
850 // 851 // ref: include/net/tcp.h::tcp_check_probe_timer() 852 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() 853 // 854 // Instead of defining a new transmit timer, we attempt to split 855 // the segment right here if there are no pending segments. If 856 // there are pending segments, segment transmits are deferred to 857 // the retransmit timer handler. 858 if s.SndUna != s.SndNxt { 859 switch { 860 case available >= seg.payloadSize(): 861 // OK to send, the whole segments fits in the 862 // receiver's advertised window. 863 case available >= s.MaxPayloadSize: 864 // OK to send, at least 1 MSS sized segment fits 865 // in the receiver's advertised window. 866 default: 867 return false 868 } 869 } 870 871 // The segment size limit is computed as a function of sender 872 // congestion window and MSS. When sender congestion window is > 873 // 1, this limit can be larger than MSS. Ensure that the 874 // currently available send space is not greater than minimum of 875 // this limit and MSS. 876 if available > limit { 877 available = limit 878 } 879 880 // If GSO is not in use then cap available to 881 // maxPayloadSize. When GSO is in use the gVisor GSO logic or 882 // the host GSO logic will cap the segment to the correct size. 883 if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize { 884 available = s.MaxPayloadSize 885 } 886 887 if seg.payloadSize() > available { 888 // A negative value causes splitSeg to panic anyways, so just panic 889 // earlier to get more information about the cause. 890 s.splitSeg(seg, available) 891 } 892 893 segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) 894 } 895 896 s.sendSegment(seg) 897 898 // Update sndNxt if we actually sent new data (as opposed to 899 // retransmitting some previously sent data). 
900 if s.SndNxt.LessThan(segEnd) { 901 s.SndNxt = segEnd 902 } 903 904 return true 905 }

// sendZeroWindowProbe counts one more unacknowledged zero window probe, sends
// an empty ACK segment whose sequence number is SndUna-1 and re-arms the
// resend timer with the current RTO.
//
// +checklocks:s.ep.mu
func (s *sender) sendZeroWindowProbe() {
	s.unackZeroWindowProbes++
	// Send a zero window probe with sequence number pointing to
	// the last acknowledged byte.
	s.sendEmptySegment(header.TCPFlagAck, s.SndUna-1)
	// Rearm the timer to continue probing.
	s.resendTimer.enable(s.RTO)
}

// enableZeroWindowProbing marks the sender as probing for a zero receive
// window and arms the resend timer with the current RTO. If
// firstRetransmittedSegXmitTime is still the zero value it is set to the
// current monotonic time.
func (s *sender) enableZeroWindowProbing() {
	s.zeroWindowProbing = true
	// We piggyback the probing on the retransmit timer with the
	// current retransmission interval, as we may start probing while
	// segments are being retransmitted.
	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
	}
	s.resendTimer.enable(s.RTO)
}

// disableZeroWindowProbing clears all zero window probing state: the probing
// flag, the count of unacknowledged probes and firstRetransmittedSegXmitTime.
// It also disables the resend timer.
func (s *sender) disableZeroWindowProbing() {
	s.zeroWindowProbing = false
	s.unackZeroWindowProbes = 0
	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
	s.resendTimer.disable()
}

// postXmit adjusts the keepalive, probe and resend timers after a transmit
// attempt.
//
// dataSent reports whether the caller transmitted any data; when it is true
// the keepalive timer is disabled. shouldScheduleProbe permits scheduling a
// PTO; when it is false (or shouldSchedulePTO reports false) the resend timer
// is used instead.
func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
	if dataSent {
		// We sent data, so we should stop the keepalive timer to ensure
		// that no keepalives are sent while there is pending data.
		s.ep.disableKeepaliveTimer()
	}

	// If the send window is zero and we have data to be sent out, start
	// zero window probing to query the remote for its receive window
	// size.
	if s.writeNext != nil && s.SndWnd == 0 {
		s.enableZeroWindowProbing()
	}

	// If we have no more pending data, start the keepalive timer.
	if s.SndUna == s.SndNxt {
		s.ep.resetKeepaliveTimer(false)
	} else {
		// Enable timers if we have pending data.
		if shouldScheduleProbe && s.shouldSchedulePTO() {
			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
			s.schedulePTO()
		} else if !s.resendTimer.enabled() {
			s.probeTimer.disable()
			if s.Outstanding > 0 {
				// Enable the resend timer if it's not enabled yet and there is
				// outstanding data.
				s.resendTimer.enable(s.RTO)
			}
		}
	}
}

// sendData sends new data segments. It is called when data becomes available or
// when the send window opens up.
//
// Segments are taken from writeNext onwards while Outstanding is below
// SndCwnd. The loop stops at the first segment that maybeSendSegment declines
// to send, and postXmit is always called at the end.
// +checklocks:s.ep.mu
func (s *sender) sendData() {
	// limit is the largest payload handed to maybeSendSegment: one MSS
	// without GSO, otherwise derived from the GSO maximum size.
	limit := s.MaxPayloadSize
	if s.gso {
		limit = int(s.ep.gso.MaxSize - header.TCPTotalHeaderMaximumSize - 1)
	}
	// end is the right edge of the send window.
	end := s.SndUna.Add(s.SndWnd)

	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
	// "A TCP SHOULD set cwnd to no more than RW before beginning
	// transmission if the TCP has not sent data in the interval exceeding
	// the retransmission timeout."
	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
		if s.SndCwnd > InitialCwnd {
			s.SndCwnd = InitialCwnd
		}
	}

	var dataSent bool
	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
		// Shrink limit to what the congestion window still allows. Note
		// that limit is never raised again within this call.
		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
		if cwndLimit > 0 && cwndLimit < limit {
			limit = cwndLimit
		}
		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
			// Move writeNext along so that we don't try and scan data that
			// has already been SACKED.
			s.updateWriteNext(seg.Next())
			continue
		}
		if sent := s.maybeSendSegment(seg, limit, end); !sent {
			break
		}
		dataSent = true
		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
		s.updateWriteNext(seg.Next())
	}

	s.postXmit(dataSent, true /* shouldScheduleProbe */)
}

// enterRecovery switches the sender into fast recovery: it resets the
// spurious-recovery detection state, inflates the congestion window from
// Ssthresh, records the sequence range covered by this recovery episode and
// sets s.state to SACKRecovery or FastRecovery depending on whether SACK is
// permitted, incrementing the matching stack statistic.
func (s *sender) enterRecovery() {
	// Initialize the variables used to detect spurious recovery after
	// entering recovery.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	s.FastRecovery.Active = true
	// Save state to reflect we're now in fast recovery.
	//
	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
	// We inflate the cwnd by 3 to account for the 3 packets which triggered
	// the 3 duplicate ACKs and are now not in flight.
	s.SndCwnd = s.Ssthresh + 3
	s.SackedOut = 0
	s.DupAckCount = 0
	s.FastRecovery.First = s.SndUna
	s.FastRecovery.Last = s.SndNxt - 1
	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
	s.FastRecovery.HighRxt = s.SndUna
	s.FastRecovery.RescueRxt = s.SndUna

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	//
	// This must run before s.state is updated below: recordRetransmitTS
	// returns early when inRecovery() is true, and inRecovery() looks at
	// s.state.
	s.recordRetransmitTS()

	if s.ep.SACKPermitted {
		s.state = tcpip.SACKRecovery
		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
		// Set TLPRxtOut to false according to
		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
		if s.rc.tlpRxtOut {
			// The tail loss probe triggered recovery.
			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
		}
		s.rc.tlpRxtOut = false
		return
	}
	s.state = tcpip.FastRecovery
	s.ep.stack.Stats().TCP.FastRecovery.Increment()
}

// leaveRecovery ends fast recovery: it clears the FastRecovery.Active flag,
// MaxCwnd and the duplicate ACK count, sets the congestion window back to
// Ssthresh and notifies the congestion control algorithm via PostRecovery.
// It does not modify s.state.
func (s *sender) leaveRecovery() {
	s.FastRecovery.Active = false
	s.FastRecovery.MaxCwnd = 0
	s.DupAckCount = 0

	// Deflate cwnd. It had been artificially inflated when new dups arrived.
	s.SndCwnd = s.Ssthresh
	s.cc.PostRecovery()
}

// isAssignedSequenceNumber relies on the fact that we only set flags once a
// sequence number is assigned and that is only done right before we send the
// segment. As a result any segment that has a non-zero flag has a valid
// sequence number assigned to it.
1068 func (s *sender) isAssignedSequenceNumber(seg *segment) bool { 1069 return seg.flags != 0 1070 } 1071 1072 // SetPipe implements the SetPipe() function described in RFC6675. Netstack 1073 // maintains the congestion window in number of packets and not bytes, so 1074 // SetPipe() here measures number of outstanding packets rather than actual 1075 // outstanding bytes in the network. 1076 func (s *sender) SetPipe() { 1077 // If SACK isn't permitted or it is permitted but recovery is not active 1078 // then ignore pipe calculations. 1079 if !s.ep.SACKPermitted || !s.FastRecovery.Active { 1080 return 1081 } 1082 pipe := 0 1083 smss := seqnum.Size(s.ep.scoreboard.SMSS()) 1084 for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { 1085 // With GSO each segment can be much larger than SMSS. So check the segment 1086 // in SMSS sized ranges. 1087 segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize())) 1088 for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { 1089 endSeq := startSeq.Add(smss) 1090 if segEnd.LessThan(endSeq) { 1091 endSeq = segEnd 1092 } 1093 sb := header.SACKBlock{Start: startSeq, End: endSeq} 1094 // SetPipe(): 1095 // 1096 // After initializing pipe to zero, the following steps are 1097 // taken for each octet 'S1' in the sequence space between 1098 // HighACK and HighData that has not been SACKed: 1099 if !s1.sequenceNumber.LessThan(s.SndNxt) { 1100 break 1101 } 1102 if s.ep.scoreboard.IsSACKED(sb) { 1103 continue 1104 } 1105 1106 // SetPipe(): 1107 // 1108 // (a) If IsLost(S1) returns false, Pipe is incremened by 1. 1109 // 1110 // NOTE: here we mark the whole segment as lost. We do not try 1111 // and test every byte in our write buffer as we maintain our 1112 // pipe in terms of outstanding packets and not bytes. 
1113 if !s.ep.scoreboard.IsRangeLost(sb) { 1114 pipe++ 1115 } 1116 // SetPipe(): 1117 // (b) If S1 <= HighRxt, Pipe is incremented by 1. 1118 if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) { 1119 pipe++ 1120 } 1121 } 1122 } 1123 s.Outstanding = pipe 1124 } 1125 1126 // shouldEnterRecovery returns true if the sender should enter fast recovery 1127 // based on dupAck count and sack scoreboard. 1128 // See RFC 6675 section 5. 1129 func (s *sender) shouldEnterRecovery() bool { 1130 return s.DupAckCount >= nDupAckThreshold || 1131 (s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna)) 1132 } 1133 1134 // detectLoss is called when an ack is received and returns whether a loss is 1135 // detected. It manages the state related to duplicate acks and determines if 1136 // a retransmit is needed according to the rules in RFC 6582 (NewReno). 1137 func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { 1138 // We're not in fast recovery yet. 1139 1140 // If RACK is enabled and there is no reordering we should honor the 1141 // three duplicate ACK rule to enter recovery. 1142 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4 1143 if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1144 if s.rc.Reord { 1145 return false 1146 } 1147 } 1148 1149 if !s.isDupAck(seg) { 1150 s.DupAckCount = 0 1151 return false 1152 } 1153 1154 s.DupAckCount++ 1155 1156 // Do not enter fast recovery until we reach nDupAckThreshold or the 1157 // first unacknowledged byte is considered lost as per SACK scoreboard. 1158 if !s.shouldEnterRecovery() { 1159 // RFC 6675 Step 3. 1160 s.FastRecovery.HighRxt = s.SndUna - 1 1161 // Do run SetPipe() to calculate the outstanding segments. 
1162 s.SetPipe() 1163 s.state = tcpip.Disorder 1164 return false 1165 } 1166 1167 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 1168 // 1169 // We only do the check here, the incrementing of last to the highest 1170 // sequence number transmitted till now is done when enterRecovery 1171 // is invoked. 1172 // 1173 // Note that we only enter recovery when at least one more byte of data 1174 // beyond s.fr.last (the highest byte that was outstanding when fast 1175 // retransmit was last entered) is acked. 1176 if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) { 1177 s.DupAckCount = 0 1178 return false 1179 } 1180 s.cc.HandleLossDetected() 1181 s.enterRecovery() 1182 return true 1183 } 1184 1185 // isDupAck determines if seg is a duplicate ack as defined in 1186 // https://tools.ietf.org/html/rfc5681#section-2. 1187 func (s *sender) isDupAck(seg *segment) bool { 1188 // A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883] 1189 // can leverage the SACK information to determine when an incoming ACK is a 1190 // "duplicate" (e.g., if the ACK contains previously unknown SACK 1191 // information). 1192 if s.ep.SACKPermitted && !seg.hasNewSACKInfo { 1193 return false 1194 } 1195 1196 // (a) The receiver of the ACK has outstanding data. 1197 return s.SndUna != s.SndNxt && 1198 // (b) The incoming acknowledgment carries no data. 1199 seg.logicalLen() == 0 && 1200 // (c) The SYN and FIN bits are both off. 1201 !seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) && 1202 // (d) the ACK number is equal to the greatest acknowledgment received on 1203 // the given connection (TCP.UNA from RFC793). 1204 seg.ackNumber == s.SndUna && 1205 // (e) the advertised window in the incoming acknowledgment equals the 1206 // advertised window in the last incoming acknowledgment. 1207 s.SndWnd == seg.window 1208 } 1209 1210 // Iterate the writeList and update RACK for each segment which is newly acked 1211 // either cumulatively or selectively. 
Loop through the segments which are 1212 // sacked, and update the RACK related variables and check for reordering. 1213 // Returns true when the DSACK block has been detected in the received ACK. 1214 // 1215 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 1216 // steps 2 and 3. 1217 func (s *sender) walkSACK(rcvdSeg *segment) bool { 1218 s.rc.setDSACKSeen(false) 1219 1220 // Look for DSACK block. 1221 hasDSACK := false 1222 idx := 0 1223 n := len(rcvdSeg.parsedOptions.SACKBlocks) 1224 if checkDSACK(rcvdSeg) { 1225 dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0] 1226 numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize) 1227 // numDSACK can be zero when DSACK is sent for subsegments. 1228 if numDSACK < 1 { 1229 numDSACK = 1 1230 } 1231 s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK) 1232 s.rc.setDSACKSeen(true) 1233 idx = 1 1234 n-- 1235 hasDSACK = true 1236 } 1237 1238 if n == 0 { 1239 return hasDSACK 1240 } 1241 1242 // Sort the SACK blocks. The first block is the most recent unacked 1243 // block. The following blocks can be in arbitrary order. 1244 sackBlocks := make([]header.SACKBlock, n) 1245 copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:]) 1246 sort.Slice(sackBlocks, func(i, j int) bool { 1247 return sackBlocks[j].Start.LessThan(sackBlocks[i].Start) 1248 }) 1249 1250 seg := s.writeList.Front() 1251 for _, sb := range sackBlocks { 1252 for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 { 1253 if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked { 1254 s.rc.update(seg, rcvdSeg) 1255 s.rc.detectReorder(seg) 1256 seg.acked = true 1257 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 1258 } 1259 seg = seg.Next() 1260 } 1261 } 1262 return hasDSACK 1263 } 1264 1265 // checkDSACK checks if a DSACK is reported. 
func checkDSACK(rcvdSeg *segment) bool {
	n := len(rcvdSeg.parsedOptions.SACKBlocks)
	if n == 0 {
		return false
	}

	// Only the first SACK block can be a DSACK block.
	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
	// Check if SACK block is invalid.
	if sb.End.LessThan(sb.Start) {
		return false
	}

	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
	// at most one SACK block. DSACK is detected in the below two cases:
	// * If the SACK sequence space is less than this cumulative ACK, it is
	//   an indication that the segment identified by the SACK block has
	//   been received more than once by the receiver.
	// * If the sequence space in the first SACK block is greater than the
	//   cumulative ACK, then the sender next compares the sequence space
	//   in the first SACK block with the sequence space in the second SACK
	//   block, if there is one. This comparison can determine if the first
	//   SACK block is reporting duplicate data that lies above the
	//   cumulative ACK.
	if sb.Start.LessThan(rcvdSeg.ackNumber) {
		return true
	}

	if n > 1 {
		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
		if sb1.End.LessThan(sb1.Start) {
			return false
		}

		// If the first SACK block is fully covered by second SACK
		// block, then the first block is a DSACK block.
		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
			return true
		}
	}

	return false
}

// recordRetransmitTS stores the endpoint's current timestamp value
// (s.ep.tsValNow()) in s.retransmitTS, unless the sender is already in
// recovery, in which case the previously recorded value is left untouched.
func (s *sender) recordRetransmitTS() {
	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
	//
	// The Eifel detection algorithm is used, only upon initiation of loss
	// recovery, i.e., when either the timeout-based retransmit or the fast
	// retransmit is sent. The Eifel detection algorithm MUST NOT be
	// reinitiated after loss recovery has already started. In particular,
	// it must not be reinitiated upon subsequent timeouts for the same
	// segment, and not upon retransmitting segments other than the oldest
	// outstanding segment, e.g., during selective loss recovery.
	if s.inRecovery() {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	//
	// Set a "RetransmitTS" variable to the value of the Timestamp Value
	// field of the Timestamps option included in the retransmit sent when
	// loss recovery is initiated. A TCP sender must ensure that
	// RetransmitTS does not get overwritten as loss recovery progresses,
	// e.g., in case of a second timeout and subsequent second retransmit of
	// the same octet.
	s.retransmitTS = s.ep.tsValNow()
}

// detectSpuriousRecovery runs the checks below, which follow RFC 3522
// Section 3.2 steps 4-6, and when they all pass marks the current recovery as
// spurious by setting s.spuriousRecovery and incrementing the
// SpuriousRecovery statistic (and SpuriousRTORecovery when s.state is
// RTORecovery).
//
// hasDSACK reports whether the ACK being processed carried a DSACK block and
// tsEchoReply is the Timestamp Echo Reply value of that ACK.
func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
	// Return if the sender has already detected spurious recovery.
	if s.spuriousRecovery {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
	//
	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
	// Timestamps option is smaller than the value of RetransmitTS, then
	// proceed to next step, else return.
	//
	// NOTE(review): this is a plain integer comparison; confirm whether
	// timestamp wrap-around needs to be considered here.
	if tsEchoReply >= s.retransmitTS {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
	if hasDSACK {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If during the lifetime of the TCP connection the TCP sender has
	// previously received an ACK with a DSACK option, or the acceptable ACK
	// does not acknowledge all outstanding data, then proceed to next step,
	// else return.
	//
	// NOTE(review): the DSACK count is read from the stack's statistics,
	// which looks stack-wide rather than specific to this connection.
	// Confirm that this is intended.
	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
	if numDSACK == 0 && s.SndUna == s.SndNxt {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
	//
	// If the loss recovery has been initiated with a timeout-based
	// retransmit, then set
	//    SpuriousRecovery <- SPUR_TO (equal 1),
	// else set
	//    SpuriousRecovery <- dupacks+1
	// Set the spurious recovery variable to true as we do not differentiate
	// between fast, SACK or RTO recovery.
	s.spuriousRecovery = true
	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()

	// RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and
	// timeout). Increment the metric for RTO only as we want to track the
	// number of timeout recoveries.
	if s.state == tcpip.RTORecovery {
		s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment()
	}
}

// inRecovery reports whether the sender is in the RTORecovery, FastRecovery or
// SACKRecovery state.
func (s *sender) inRecovery() bool {
	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
		return true
	}
	return false
}

// handleRcvdSegment is called when a segment is received; it is responsible for
// updating the send-related state.
//
// In order, it: samples the RTT, records SACK blocks in the scoreboard,
// leaves or detects the need for fast recovery, updates the send window and
// zero window probing state, removes newly acknowledged data from the write
// list, runs RACK loss detection and the loss recovery algorithm, and finally
// tries to send more data.
// +checklocks:s.ep.mu
// +checklocksalias:s.rc.snd.ep.mu=s.ep.mu
func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
	// Check if we can extract an RTT measurement from this ack. This path
	// is only used when the segment carries no timestamp option.
	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
		s.RTTMeasureSeqNum = s.SndNxt
	}

	// Update Timestamp if required. See RFC7323, section-4.3.
	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
	}

	// Insert SACKBlock information into our scoreboard.
	hasDSACK := false
	if s.ep.SACKPermitted {
		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
			// Only insert the SACK block if the following holds
			// true:
			//  * SACK block acks data after the ack number in the
			//    current segment.
			//  * SACK block represents a sequence
			//    between sndUna and sndNxt (i.e. data that is
			//    currently unacked and in-flight).
			//  * SACK block that has not been SACKed already.
			//
			// NOTE: This check specifically excludes DSACK blocks
			// which have start/end before sndUna and are used to
			// indicate spurious retransmissions.
			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
				s.ep.scoreboard.Insert(sb)
				rcvdSeg.hasNewSACKInfo = true
			}
		}

		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
		// section-7.2
		// * Step 2: Update RACK stats.
		//   If the ACK is not ignored as invalid, update the RACK.rtt
		//   to be the RTT sample calculated using this ACK, and
		//   continue.  If this ACK or SACK was for the most recently
		//   sent packet, then record the RACK.xmit_ts timestamp and
		//   RACK.end_seq sequence implied by this ACK.
		// * Step 3: Detect packet reordering.
		//   If the ACK selectively or cumulatively acknowledges an
		//   unacknowledged and also never retransmitted sequence below
		//   RACK.fack, then the corresponding packet has been
		//   reordered and RACK.reord is set to TRUE.
		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
			hasDSACK = s.walkSACK(rcvdSeg)
		}
		s.SetPipe()
	}

	ack := rcvdSeg.ackNumber
	fastRetransmit := false
	// Do not leave fast recovery, if the ACK is out of range.
	if s.FastRecovery.Active {
		// Leave fast recovery if it acknowledges all the data covered by
		// this fast recovery session.
		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
			s.leaveRecovery()
		}
	} else {
		// Detect loss by counting the duplicates and enter recovery.
		fastRetransmit = s.detectLoss(rcvdSeg)
	}

	// See if TLP based recovery was successful.
	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		s.detectTLPRecovery(ack, rcvdSeg)
	}

	// Stash away the current window size.
	s.SndWnd = rcvdSeg.window

	// Disable zero window probing if remote advertises a non-zero receive
	// window. This can be with an ACK to the zero window probe (where the
	// acknumber refers to the already acknowledged byte) OR to any previously
	// unacknowledged segment.
	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
		s.disableZeroWindowProbing()
	}

	// On receiving the ACK for the zero window probe, account for it and
	// skip trying to send any segment as we are still probing for
	// receive window to become non-zero.
	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
		s.unackZeroWindowProbes--
		return
	}

	// Only process the ACK further if it acknowledges new data, i.e. if
	// ack-1 lies in [SndUna, SndNxt).
	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
		s.DupAckCount = 0

		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
		// Specifically we should only update the RTO using TSEcr if the
		// following condition holds:
		//
		//    A TSecr value received in a segment is used to update the
		//    averaged RTT measurement only if the segment acknowledges
		//    some new data, i.e., only if it advances the left edge of
		//    the send window.
		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
			s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr))
		}

		if s.shouldSchedulePTO() {
			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
			s.schedulePTO()
		} else {
			// When an ack is received we must rearm the timer.
			// RFC 6298 5.3
			s.probeTimer.disable()
			s.resendTimer.enable(s.RTO)
		}

		// Remove all acknowledged data from the write list.
		acked := s.SndUna.Size(ack)
		s.SndUna = ack
		ackLeft := acked
		// Remember Outstanding so that the number of newly acknowledged
		// packets can be handed to the congestion control below.
		originalOutstanding := s.Outstanding
		for ackLeft > 0 {
			// We use logicalLen here because we can have FIN
			// segments (which are always at the end of list) that
			// have no data, but do consume a sequence number.
			//
			// NOTE(review): Front() is dereferenced without a nil
			// check; this presumably relies on the write list always
			// covering the range [SndUna, ack). Confirm.
			seg := s.writeList.Front()
			datalen := seg.logicalLen()

			if datalen > ackLeft {
				// The segment is only partially acknowledged: trim
				// the acknowledged prefix and keep the rest queued.
				prevCount := s.pCount(seg, s.MaxPayloadSize)
				seg.TrimFront(ackLeft)
				seg.sequenceNumber.UpdateForward(ackLeft)
				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
				break
			}

			if s.writeNext == seg {
				s.updateWriteNext(seg.Next())
			}

			// Update the RACK fields if SACK is enabled.
			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
			}

			s.writeList.Remove(seg)

			// If SACK is enabled then only reduce outstanding if
			// the segment was not previously SACKED as these have
			// already been accounted for in SetPipe().
			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
			} else {
				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
			}
			seg.DecRef()
			ackLeft -= datalen
		}

		// Clear SACK information for all acked data.
		s.ep.scoreboard.Delete(s.SndUna)

		// Detect if the sender entered recovery spuriously.
		if s.inRecovery() {
			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
		}

		// If we are not in fast recovery then update the congestion
		// window based on the number of acknowledged packets.
		if !s.FastRecovery.Active {
			s.cc.Update(originalOutstanding - s.Outstanding)
			if s.FastRecovery.Last.LessThan(s.SndUna) {
				s.state = tcpip.Open
				// Update RACK when we are exiting fast or RTO
				// recovery as described in the RFC
				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
					s.rc.exitRecovery()
				}
				s.reorderTimer.disable()
			}
		}

		// Update the send buffer usage and notify potential waiters.
		s.ep.updateSndBufferUsage(int(acked))

		// It is possible for s.outstanding to drop below zero if we get
		// a retransmit timeout, reset outstanding to zero but later
		// get an ack that covers previously sent data.
		if s.Outstanding < 0 {
			s.Outstanding = 0
		}

		s.SetPipe()

		// If all outstanding data was acknowledged then disable the timer.
		// RFC 6298 Rule 5.3
		if s.SndUna == s.SndNxt {
			s.Outstanding = 0
			// Reset firstRetransmittedSegXmitTime to the zero value.
			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
			s.resendTimer.disable()
			s.probeTimer.disable()
		}
	}

	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		// Update RACK reorder window.
		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
		// * Upon receiving an ACK:
		// * Step 4: Update RACK reordering window
		s.rc.updateRACKReorderWindow()

		// After the reorder window is calculated, detect any loss by checking
		// if the time elapsed after the segments are sent is greater than the
		// reorder window.
		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
			// If any segment is marked as lost by
			// RACK, enter recovery and retransmit
			// the lost segments.
			s.cc.HandleLossDetected()
			s.enterRecovery()
			fastRetransmit = true
		}

		if s.FastRecovery.Active {
			s.rc.DoRecovery(nil, fastRetransmit)
		}
	}

	// Now that we've popped all acknowledged data from the retransmit
	// queue, retransmit if needed.
	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
		// When SACK is enabled data sending is governed by steps in
		// RFC 6675 Section 5 recovery steps A-C.
		// See: https://tools.ietf.org/html/rfc6675#section-5.
		if s.ep.SACKPermitted {
			return
		}
	}

	// Send more data now that some of the pending data has been ack'd, or
	// that the window opened up, or the congestion window was inflated due
	// to a duplicate ack during fast recovery. This will also re-enable
	// the retransmit timer if needed.
	s.sendData()
}

// sendSegment sends the specified segment.
1653 // +checklocks:s.ep.mu 1654 func (s *sender) sendSegment(seg *segment) tcpip.Error { 1655 if seg.xmitCount > 0 { 1656 s.ep.stack.Stats().TCP.Retransmits.Increment() 1657 s.ep.stats.SendErrors.Retransmits.Increment() 1658 if s.SndCwnd < s.Ssthresh { 1659 s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() 1660 } 1661 } 1662 seg.xmitTime = s.ep.stack.Clock().NowMonotonic() 1663 seg.xmitCount++ 1664 seg.lost = false 1665 1666 err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber) 1667 1668 // Every time a packet containing data is sent (including a 1669 // retransmission), if SACK is enabled and we are retransmitting data 1670 // then use the conservative timer described in RFC6675 Section 6.0, 1671 // otherwise follow the standard time described in RFC6298 Section 5.1. 1672 if err != nil && seg.payloadSize() != 0 { 1673 if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted { 1674 s.resendTimer.enable(s.RTO) 1675 } else { 1676 if !s.resendTimer.enabled() { 1677 s.resendTimer.enable(s.RTO) 1678 } 1679 } 1680 } 1681 1682 return err 1683 } 1684 1685 // sendSegmentFromPacketBuffer sends a new segment containing the given payload, 1686 // flags and sequence number. 1687 // +checklocks:s.ep.mu 1688 // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu 1689 func (s *sender) sendSegmentFromPacketBuffer(pkt *stack.PacketBuffer, flags header.TCPFlags, seq seqnum.Value) tcpip.Error { 1690 s.LastSendTime = s.ep.stack.Clock().NowMonotonic() 1691 if seq == s.RTTMeasureSeqNum { 1692 s.RTTMeasureTime = s.LastSendTime 1693 } 1694 1695 rcvNxt, rcvWnd := s.ep.rcv.getSendParams() 1696 1697 // Remember the max sent ack. 1698 s.MaxSentAck = rcvNxt 1699 1700 // We need to clone the packet because sendRaw takes ownership of pkt, 1701 // and pkt could be reprocessed later on (i.e retrasmission). 
1702 pkt = pkt.Clone() 1703 defer pkt.DecRef() 1704 1705 return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd) 1706 } 1707 1708 // sendEmptySegment sends a new empty segment, flags and sequence number. 1709 // +checklocks:s.ep.mu 1710 // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu 1711 func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error { 1712 s.LastSendTime = s.ep.stack.Clock().NowMonotonic() 1713 if seq == s.RTTMeasureSeqNum { 1714 s.RTTMeasureTime = s.LastSendTime 1715 } 1716 1717 rcvNxt, rcvWnd := s.ep.rcv.getSendParams() 1718 1719 // Remember the max sent ack. 1720 s.MaxSentAck = rcvNxt 1721 1722 return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd) 1723 } 1724 1725 // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited 1726 // currently. 1727 // +checklocks:s.ep.mu 1728 func (s *sender) maybeSendOutOfWindowAck(seg *segment) { 1729 // Data packets are unlikely to be part of an ACK loop. So always send 1730 // an ACK for a packet w/ data. 1731 if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { 1732 s.sendAck() 1733 } 1734 } 1735 1736 func (s *sender) updateWriteNext(seg *segment) { 1737 if s.writeNext != nil { 1738 s.writeNext.DecRef() 1739 } 1740 if seg != nil { 1741 seg.IncRef() 1742 } 1743 s.writeNext = seg 1744 } 1745 1746 // corkTimerExpired drains all the segments when TCP_CORK is enabled. 1747 // +checklocks:s.ep.mu 1748 func (s *sender) corkTimerExpired() tcpip.Error { 1749 // Check if the timer actually expired or if it's a spurious wake due 1750 // to a previously orphaned runtime timer. 1751 if s.corkTimer.isUninitialized() || !s.corkTimer.checkExpiration() { 1752 return nil 1753 } 1754 1755 // Assign sequence number and flags to the segment. 1756 seg := s.writeNext 1757 if seg == nil { 1758 return nil 1759 } 1760 seg.sequenceNumber = s.SndNxt 1761 seg.flags = header.TCPFlagAck | header.TCPFlagPsh 1762 // Drain all the segments. 1763 s.sendData() 1764 return nil 1765 }