github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/transport/tcp/snd.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "fmt" 19 "math" 20 "sort" 21 "time" 22 23 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 24 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip" 25 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header" 26 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/seqnum" 27 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack" 28 ) 29 30 const ( 31 // MinRTO is the minimum allowed value for the retransmit timeout. 32 MinRTO = 200 * time.Millisecond 33 34 // MaxRTO is the maximum allowed value for the retransmit timeout. 35 MaxRTO = 120 * time.Second 36 37 // MinSRTT is the minimum allowed value for smoothed RTT. 38 MinSRTT = 1 * time.Millisecond 39 40 // InitialCwnd is the initial congestion window. 41 InitialCwnd = 10 42 43 // nDupAckThreshold is the number of duplicate ACK's required 44 // before fast-retransmit is entered. 45 nDupAckThreshold = 3 46 47 // MaxRetries is the maximum number of probe retries sender does 48 // before timing out the connection. 49 // Linux default TCP_RETR2, net.ipv4.tcp_retries2. 50 MaxRetries = 15 51 ) 52 53 // congestionControl is an interface that must be implemented by any supported 54 // congestion control algorithm. 55 type congestionControl interface { 56 // HandleLossDetected is invoked when the loss is detected by RACK or 57 // sender.dupAckCount >= nDupAckThreshold just before entering fast 58 // retransmit. 59 HandleLossDetected() 60 61 // HandleRTOExpired is invoked when the retransmit timer expires. 62 HandleRTOExpired() 63 64 // Update is invoked when processing inbound acks. It's passed the 65 // number of packet's that were acked by the most recent cumulative 66 // acknowledgement. 67 Update(packetsAcked int) 68 69 // PostRecovery is invoked when the sender is exiting a fast retransmit/ 70 // recovery phase. This provides congestion control algorithms a way 71 // to adjust their state when exiting recovery. 72 PostRecovery() 73 } 74 75 // lossRecovery is an interface that must be implemented by any supported 76 // loss recovery algorithm. 77 type lossRecovery interface { 78 // DoRecovery is invoked when loss is detected and segments need 79 // to be retransmitted. The cumulative or selective ACK is passed along 80 // with the flag which identifies whether the connection entered fast 81 // retransmit with this ACK and to retransmit the first unacknowledged 82 // segment. 83 DoRecovery(rcvdSeg *segment, fastRetransmit bool) 84 } 85 86 // sender holds the state necessary to send TCP segments. 87 // 88 // +stateify savable 89 type sender struct { 90 stack.TCPSenderState 91 ep *endpoint 92 93 // lr is the loss recovery algorithm used by the sender. 94 lr lossRecovery 95 96 // firstRetransmittedSegXmitTime is the original transmit time of 97 // the first segment that was retransmitted due to RTO expiration. 98 firstRetransmittedSegXmitTime tcpip.MonotonicTime 99 100 // zeroWindowProbing is set if the sender is currently probing 101 // for zero receive window. 102 zeroWindowProbing bool `state:"nosave"` 103 104 // unackZeroWindowProbes is the number of unacknowledged zero 105 // window probes. 106 unackZeroWindowProbes uint32 `state:"nosave"` 107 108 writeNext *segment 109 writeList segmentList 110 resendTimer timer `state:"nosave"` 111 112 // rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed 113 // round-trip time", and "round-trip time variation", as defined in 114 // section 2 of RFC 6298. 115 rtt rtt 116 117 // minRTO is the minimum permitted value for sender.rto. 118 minRTO time.Duration 119 120 // maxRTO is the maximum permitted value for sender.rto. 121 maxRTO time.Duration 122 123 // maxRetries is the maximum permitted retransmissions. 124 maxRetries uint32 125 126 // gso is set if generic segmentation offload is enabled. 127 gso bool 128 129 // state is the current state of congestion control for this endpoint. 130 state tcpip.CongestionControlState 131 132 // cc is the congestion control algorithm in use for this sender. 133 cc congestionControl 134 135 // rc has the fields needed for implementing RACK loss detection 136 // algorithm. 137 rc rackControl 138 139 // reorderTimer is the timer used to retransmit the segments after RACK 140 // detects them as lost. 141 reorderTimer timer `state:"nosave"` 142 143 // probeTimer is used to schedule PTO for RACK TLP algorithm. 144 probeTimer timer `state:"nosave"` 145 146 // spuriousRecovery indicates whether the sender entered recovery 147 // spuriously as described in RFC3522 Section 3.2. 148 spuriousRecovery bool 149 150 // retransmitTS is the timestamp at which the sender sends retransmitted 151 // segment after entering an RTO for the first time as described in 152 // RFC3522 Section 3.2. 153 retransmitTS uint32 154 } 155 156 // rtt is a synchronization wrapper used to appease stateify. See the comment 157 // in sender, where it is used. 158 // 159 // +stateify savable 160 type rtt struct { 161 sync.Mutex `state:"nosave"` 162 163 stack.TCPRTTState 164 } 165 166 // +checklocks:ep.mu 167 func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { 168 // The sender MUST reduce the TCP data length to account for any IP or 169 // TCP options that it is including in the packets that it sends. 170 // See: https://tools.ietf.org/html/rfc6691#section-2 171 maxPayloadSize := int(mss) - ep.maxOptionSize() 172 173 s := &sender{ 174 ep: ep, 175 TCPSenderState: stack.TCPSenderState{ 176 SndWnd: sndWnd, 177 SndUna: iss + 1, 178 SndNxt: iss + 1, 179 RTTMeasureSeqNum: iss + 1, 180 LastSendTime: ep.stack.Clock().NowMonotonic(), 181 MaxPayloadSize: maxPayloadSize, 182 MaxSentAck: irs + 1, 183 FastRecovery: stack.TCPFastRecoveryState{ 184 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. 185 Last: iss, 186 HighRxt: iss, 187 RescueRxt: iss, 188 }, 189 RTO: 1 * time.Second, 190 }, 191 gso: ep.gso.Type != stack.GSONone, 192 } 193 194 if s.gso { 195 s.ep.gso.MSS = uint16(maxPayloadSize) 196 } 197 198 s.cc = s.initCongestionControl(ep.cc) 199 s.lr = s.initLossRecovery() 200 s.rc.init(s, iss) 201 202 // A negative sndWndScale means that no scaling is in use, otherwise we 203 // store the scaling value. 204 if sndWndScale > 0 { 205 s.SndWndScale = uint8(sndWndScale) 206 } 207 208 s.resendTimer.init(s.ep.stack.Clock(), maybeFailTimerHandler(s.ep, s.retransmitTimerExpired)) 209 s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired)) 210 s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired)) 211 212 s.ep.AssertLockHeld(ep) 213 s.updateMaxPayloadSize(int(ep.route.MTU()), 0) 214 // Initialize SACK Scoreboard after updating max payload size as we use 215 // the maxPayloadSize as the smss when determining if a segment is lost 216 // etc. 217 s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss) 218 219 // Get Stack wide config. 220 var minRTO tcpip.TCPMinRTOOption 221 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { 222 panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) 223 } 224 s.minRTO = time.Duration(minRTO) 225 226 var maxRTO tcpip.TCPMaxRTOOption 227 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { 228 panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) 229 } 230 s.maxRTO = time.Duration(maxRTO) 231 232 var maxRetries tcpip.TCPMaxRetriesOption 233 if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { 234 panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) 235 } 236 s.maxRetries = uint32(maxRetries) 237 238 return s 239 } 240 241 // initCongestionControl initializes the specified congestion control module and 242 // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to 243 // their initial values. 244 func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { 245 s.SndCwnd = InitialCwnd 246 // Set sndSsthresh to the maximum int value, which depends on the 247 // platform. 248 s.Ssthresh = int(^uint(0) >> 1) 249 250 switch congestionControlName { 251 case ccCubic: 252 return newCubicCC(s) 253 case ccReno: 254 fallthrough 255 default: 256 return newRenoCC(s) 257 } 258 } 259 260 // initLossRecovery initiates the loss recovery algorithm for the sender. 261 func (s *sender) initLossRecovery() lossRecovery { 262 if s.ep.SACKPermitted { 263 return newSACKRecovery(s) 264 } 265 return newRenoRecovery(s) 266 } 267 268 // updateMaxPayloadSize updates the maximum payload size based on the given 269 // MTU. If this is in response to "packet too big" control packets (indicated 270 // by the count argument), it also reduces the number of outstanding packets and 271 // attempts to retransmit the first packet above the MTU size. 272 // +checklocks:s.ep.mu 273 func (s *sender) updateMaxPayloadSize(mtu, count int) { 274 m := mtu - header.TCPMinimumSize 275 276 m -= s.ep.maxOptionSize() 277 278 // We don't adjust up for now. 279 if m >= s.MaxPayloadSize { 280 return 281 } 282 283 // Make sure we can transmit at least one byte. 284 if m <= 0 { 285 m = 1 286 } 287 288 oldMSS := s.MaxPayloadSize 289 s.MaxPayloadSize = m 290 if s.gso { 291 s.ep.gso.MSS = uint16(m) 292 } 293 294 if count == 0 { 295 // updateMaxPayloadSize is also called when the sender is created. 296 // and there is no data to send in such cases. Return immediately. 297 return 298 } 299 300 // Update the scoreboard's smss to reflect the new lowered 301 // maxPayloadSize. 302 s.ep.scoreboard.smss = uint16(m) 303 304 s.Outstanding -= count 305 if s.Outstanding < 0 { 306 s.Outstanding = 0 307 } 308 309 // Rewind writeNext to the first segment exceeding the MTU. Do nothing 310 // if it is already before such a packet. 311 nextSeg := s.writeNext 312 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 313 if seg == s.writeNext { 314 // We got to writeNext before we could find a segment 315 // exceeding the MTU. 316 break 317 } 318 319 if nextSeg == s.writeNext && seg.payloadSize() > m { 320 // We found a segment exceeding the MTU. Rewind 321 // writeNext and try to retransmit it. 322 nextSeg = seg 323 } 324 325 if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 326 // Update sackedOut for new maximum payload size. 327 s.SackedOut -= s.pCount(seg, oldMSS) 328 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 329 } 330 } 331 332 // Since we likely reduced the number of outstanding packets, we may be 333 // ready to send some more. 334 s.updateWriteNext(nextSeg) 335 s.sendData() 336 } 337 338 // sendAck sends an ACK segment. 339 // +checklocks:s.ep.mu 340 func (s *sender) sendAck() { 341 s.sendEmptySegment(header.TCPFlagAck, s.SndNxt) 342 } 343 344 // updateRTO updates the retransmit timeout when a new roud-trip time is 345 // available. This is done in accordance with section 2 of RFC 6298. 346 func (s *sender) updateRTO(rtt time.Duration) { 347 s.rtt.Lock() 348 if !s.rtt.TCPRTTState.SRTTInited { 349 s.rtt.TCPRTTState.RTTVar = rtt / 2 350 s.rtt.TCPRTTState.SRTT = rtt 351 s.rtt.TCPRTTState.SRTTInited = true 352 } else { 353 diff := s.rtt.TCPRTTState.SRTT - rtt 354 if diff < 0 { 355 diff = -diff 356 } 357 // Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when 358 // no timestamps are available. 359 if !s.ep.SendTSOk { 360 s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4 361 s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8 362 } else { 363 // When we are taking RTT measurements of every ACK then 364 // we need to use a modified method as specified in 365 // https://tools.ietf.org/html/rfc7323#appendix-G 366 if s.Outstanding == 0 { 367 s.rtt.Unlock() 368 return 369 } 370 // Netstack measures congestion window/inflight all in 371 // terms of packets and not bytes. This is similar to 372 // how linux also does cwnd and inflight. In practice 373 // this approximation works as expected. 374 expectedSamples := math.Ceil(float64(s.Outstanding) / 2) 375 376 // alpha & beta values are the original values as recommended in 377 // https://tools.ietf.org/html/rfc6298#section-2.3. 378 const alpha = 0.125 379 const beta = 0.25 380 381 alphaPrime := alpha / expectedSamples 382 betaPrime := beta / expectedSamples 383 rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds() 384 srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds() 385 s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second)) 386 s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second)) 387 } 388 } 389 390 if s.rtt.TCPRTTState.SRTT < MinSRTT { 391 s.rtt.TCPRTTState.SRTT = MinSRTT 392 } 393 394 s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar 395 s.rtt.Unlock() 396 if s.RTO < s.minRTO { 397 s.RTO = s.minRTO 398 } 399 if s.RTO > s.maxRTO { 400 s.RTO = s.maxRTO 401 } 402 } 403 404 // resendSegment resends the first unacknowledged segment. 405 // +checklocks:s.ep.mu 406 func (s *sender) resendSegment() { 407 // Don't use any segments we already sent to measure RTT as they may 408 // have been affected by packets being lost. 409 s.RTTMeasureSeqNum = s.SndNxt 410 411 // Resend the segment. 412 if seg := s.writeList.Front(); seg != nil { 413 if seg.payloadSize() > s.MaxPayloadSize { 414 s.splitSeg(seg, s.MaxPayloadSize) 415 } 416 417 // See: RFC 6675 section 5 Step 4.3 418 // 419 // To prevent retransmission, set both the HighRXT and RescueRXT 420 // to the highest sequence number in the retransmitted segment. 421 s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 422 s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 423 s.sendSegment(seg) 424 s.ep.stack.Stats().TCP.FastRetransmit.Increment() 425 s.ep.stats.SendErrors.FastRetransmit.Increment() 426 427 // Run SetPipe() as per RFC 6675 section 5 Step 4.4 428 s.SetPipe() 429 } 430 } 431 432 // retransmitTimerExpired is called when the retransmit timer expires, and 433 // unacknowledged segments are assumed lost, and thus need to be resent. 434 // Returns true if the connection is still usable, or false if the connection 435 // is deemed lost. 436 // +checklocks:s.ep.mu 437 func (s *sender) retransmitTimerExpired() tcpip.Error { 438 // Check if the timer actually expired or if it's a spurious wake due 439 // to a previously orphaned runtime timer. 440 if s.resendTimer.isZero() || !s.resendTimer.checkExpiration() { 441 return nil 442 } 443 444 // Initialize the variables used to detect spurious recovery after 445 // entering RTO. 446 // 447 // See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1. 448 s.spuriousRecovery = false 449 s.retransmitTS = 0 450 451 // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases 452 // when writeList is empty. Remove this once we have a proper fix for this 453 // issue. 454 if s.writeList.Front() == nil { 455 return nil 456 } 457 458 s.ep.stack.Stats().TCP.Timeouts.Increment() 459 s.ep.stats.SendErrors.Timeouts.Increment() 460 461 // Set TLPRxtOut to false according to 462 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. 463 s.rc.tlpRxtOut = false 464 465 // Give up if we've waited more than a minute since the last resend or 466 // if a user time out is set and we have exceeded the user specified 467 // timeout since the first retransmission. 468 uto := s.ep.userTimeout 469 470 if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { 471 // We store the original xmitTime of the segment that we are 472 // about to retransmit as the retransmission time. This is 473 // required as by the time the retransmitTimer has expired the 474 // segment has already been sent and unacked for the RTO at the 475 // time the segment was sent. 476 s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime 477 } 478 479 elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime) 480 remaining := s.maxRTO 481 if uto != 0 { 482 // Cap to the user specified timeout if one is specified. 483 remaining = uto - elapsed 484 } 485 486 // Always honor the user-timeout irrespective of whether the zero 487 // window probes were acknowledged. 488 // net/ipv4/tcp_timer.c::tcp_probe_timer() 489 if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { 490 s.ep.stack.Stats().TCP.EstablishedTimedout.Increment() 491 return &tcpip.ErrTimeout{} 492 } 493 494 // Set new timeout. The timer will be restarted by the call to sendData 495 // below. 496 s.RTO *= 2 497 // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 498 if s.RTO > s.maxRTO { 499 s.RTO = s.maxRTO 500 } 501 502 // Cap RTO to remaining time. 503 if s.RTO > remaining { 504 s.RTO = remaining 505 } 506 507 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. 508 // 509 // Retransmit timeouts: 510 // After a retransmit timeout, record the highest sequence number 511 // transmitted in the variable recover, and exit the fast recovery 512 // procedure if applicable. 513 s.FastRecovery.Last = s.SndNxt - 1 514 515 if s.FastRecovery.Active { 516 // We were attempting fast recovery but were not successful. 517 // Leave the state. We don't need to update ssthresh because it 518 // has already been updated when entered fast-recovery. 519 s.leaveRecovery() 520 } 521 522 // Record retransmitTS if the sender is not in recovery as per: 523 // https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 524 s.recordRetransmitTS() 525 526 s.state = tcpip.RTORecovery 527 s.cc.HandleRTOExpired() 528 529 // Mark the next segment to be sent as the first unacknowledged one and 530 // start sending again. Set the number of outstanding packets to 0 so 531 // that we'll be able to retransmit. 532 // 533 // We'll keep on transmitting (or retransmitting) as we get acks for 534 // the data we transmit. 535 s.Outstanding = 0 536 537 // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 538 // 539 // In order to avoid memory deadlocks, the TCP receiver is allowed to 540 // discard data that has already been selectively acknowledged. As a 541 // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK 542 // information gathered from a receiver upon a retransmission timeout 543 // (RTO) "since the timeout might indicate that the data receiver has 544 // reneged." Additionally, a TCP sender MUST "ignore prior SACK 545 // information in determining which data to retransmit." 546 // 547 // NOTE: We take the stricter interpretation and just expunge all 548 // information as we lack more rigorous checks to validate if the SACK 549 // information is usable after an RTO. 550 s.ep.scoreboard.Reset() 551 s.updateWriteNext(s.writeList.Front()) 552 553 // RFC 1122 4.2.2.17: Start sending zero window probes when we still see a 554 // zero receive window after retransmission interval and we have data to 555 // send. 556 if s.zeroWindowProbing { 557 s.sendZeroWindowProbe() 558 // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed 559 // indefinitely. As long as the receiving TCP continues to send 560 // acknowledgments in response to the probe segments, the sending TCP 561 // MUST allow the connection to stay open. 562 return nil 563 } 564 565 seg := s.writeNext 566 // RFC 1122 4.2.3.5: Close the connection when the number of 567 // retransmissions for this segment is beyond a limit. 568 if seg != nil && seg.xmitCount > s.maxRetries { 569 s.ep.stack.Stats().TCP.EstablishedTimedout.Increment() 570 return &tcpip.ErrTimeout{} 571 } 572 573 s.sendData() 574 575 return nil 576 } 577 578 // pCount returns the number of packets in the segment. Due to GSO, a segment 579 // can be composed of multiple packets. 580 func (s *sender) pCount(seg *segment, maxPayloadSize int) int { 581 size := seg.payloadSize() 582 if size == 0 { 583 return 1 584 } 585 586 return (size-1)/maxPayloadSize + 1 587 } 588 589 // splitSeg splits a given segment at the size specified and inserts the 590 // remainder as a new segment after the current one in the write list. 591 func (s *sender) splitSeg(seg *segment, size int) { 592 if seg.payloadSize() <= size { 593 return 594 } 595 // Split this segment up. 596 nSeg := seg.clone() 597 nSeg.pkt.Data().TrimFront(size) 598 nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) 599 s.writeList.InsertAfter(seg, nSeg) 600 601 // The segment being split does not carry PUSH flag because it is 602 // followed by the newly split segment. 603 // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered 604 // segment (i.e., when there is no more queued data to be sent). 605 // Linux removes PSH flag only when the segment is being split over MSS 606 // and retains it when we are splitting the segment over lack of sender 607 // window space. 608 // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() 609 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() 610 if seg.payloadSize() > s.MaxPayloadSize { 611 seg.flags ^= header.TCPFlagPsh 612 } 613 seg.pkt.Data().CapLength(size) 614 } 615 616 // NextSeg implements the RFC6675 NextSeg() operation. 617 // 618 // NextSeg starts scanning the writeList starting from nextSegHint and returns 619 // the hint to be passed on the next call to NextSeg. This is required to avoid 620 // iterating the write list repeatedly when NextSeg is invoked in a loop during 621 // recovery. The returned hint will be nil if there are no more segments that 622 // can match rules defined by NextSeg operation in RFC6675. 623 // 624 // rescueRtx will be true only if nextSeg is a rescue retransmission as 625 // described by Step 4) of the NextSeg algorithm. 626 func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) { 627 var s3 *segment 628 var s4 *segment 629 // Step 1. 630 for seg := nextSegHint; seg != nil; seg = seg.Next() { 631 // Stop iteration if we hit a segment that has never been 632 // transmitted (i.e. either it has no assigned sequence number 633 // or if it does have one, it's >= the next sequence number 634 // to be sent [i.e. >= s.sndNxt]). 635 if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) { 636 hint = nil 637 break 638 } 639 segSeq := seg.sequenceNumber 640 if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) { 641 s.splitSeg(seg, int(smss)) 642 } 643 644 // See RFC 6675 Section 4 645 // 646 // 1. If there exists a smallest unSACKED sequence number 647 // 'S2' that meets the following 3 criteria for determinig 648 // loss, the sequence range of one segment of up to SMSS 649 // octects starting with S2 MUST be returned. 650 if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) { 651 // NextSeg(): 652 // 653 // (1.a) S2 is greater than HighRxt 654 // (1.b) S2 is less than highest octect covered by 655 // any received SACK. 656 if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) { 657 // NextSeg(): 658 // (1.c) IsLost(S2) returns true. 659 if s.ep.scoreboard.IsLost(segSeq) { 660 return seg, seg.Next(), false 661 } 662 663 // NextSeg(): 664 // 665 // (3): If the conditions for rules (1) and (2) 666 // fail, but there exists an unSACKed sequence 667 // number S3 that meets the criteria for 668 // detecting loss given in steps 1.a and 1.b 669 // above (specifically excluding (1.c)) then one 670 // segment of upto SMSS octets starting with S3 671 // SHOULD be returned. 672 if s3 == nil { 673 s3 = seg 674 hint = seg.Next() 675 } 676 } 677 // NextSeg(): 678 // 679 // (4) If the conditions for (1), (2) and (3) fail, 680 // but there exists outstanding unSACKED data, we 681 // provide the opportunity for a single "rescue" 682 // retransmission per entry into loss recovery. If 683 // HighACK is greater than RescueRxt (or RescueRxt 684 // is undefined), then one segment of upto SMSS 685 // octects that MUST include the highest outstanding 686 // unSACKed sequence number SHOULD be returned, and 687 // RescueRxt set to RecoveryPoint. HighRxt MUST NOT 688 // be updated. 689 if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) { 690 if s4 != nil { 691 if s4.sequenceNumber.LessThan(segSeq) { 692 s4 = seg 693 } 694 } else { 695 s4 = seg 696 } 697 } 698 } 699 } 700 701 // If we got here then no segment matched step (1). 702 // Step (2): "If no sequence number 'S2' per rule (1) 703 // exists but there exists available unsent data and the 704 // receiver's advertised window allows, the sequence 705 // range of one segment of up to SMSS octets of 706 // previously unsent data starting with sequence number 707 // HighData+1 MUST be returned." 708 for seg := s.writeNext; seg != nil; seg = seg.Next() { 709 if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) { 710 continue 711 } 712 // We do not split the segment here to <= smss as it has 713 // potentially not been assigned a sequence number yet. 714 return seg, nil, false 715 } 716 717 if s3 != nil { 718 return s3, hint, false 719 } 720 721 return s4, nil, true 722 } 723 724 // maybeSendSegment tries to send the specified segment and either coalesces 725 // other segments into this one or splits the specified segment based on the 726 // lower of the specified limit value or the receivers window size specified by 727 // end. 728 // +checklocks:s.ep.mu 729 func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) { 730 // We abuse the flags field to determine if we have already 731 // assigned a sequence number to this segment. 732 if !s.isAssignedSequenceNumber(seg) { 733 // Merge segments if allowed. 734 if seg.payloadSize() != 0 { 735 available := int(s.SndNxt.Size(end)) 736 if available > limit { 737 available = limit 738 } 739 740 // nextTooBig indicates that the next segment was too 741 // large to entirely fit in the current segment. It 742 // would be possible to split the next segment and merge 743 // the portion that fits, but unexpectedly splitting 744 // segments can have user visible side-effects which can 745 // break applications. For example, RFC 7766 section 8 746 // says that the length and data of a DNS response 747 // should be sent in the same TCP segment to avoid 748 // triggering bugs in poorly written DNS 749 // implementations. 750 var nextTooBig bool 751 for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() { 752 if seg.payloadSize()+nSeg.payloadSize() > available { 753 nextTooBig = true 754 break 755 } 756 seg.merge(nSeg) 757 s.writeList.Remove(nSeg) 758 nSeg.DecRef() 759 } 760 if !nextTooBig && seg.payloadSize() < available { 761 // Segment is not full. 762 if s.Outstanding > 0 && s.ep.ops.GetDelayOption() { 763 // Nagle's algorithm. From Wikipedia: 764 // Nagle's algorithm works by 765 // combining a number of small 766 // outgoing messages and sending them 767 // all at once. Specifically, as long 768 // as there is a sent packet for which 769 // the sender has received no 770 // acknowledgment, the sender should 771 // keep buffering its output until it 772 // has a full packet's worth of 773 // output, thus allowing output to be 774 // sent all at once. 775 return false 776 } 777 // With TCP_CORK, hold back until minimum of the available 778 // send space and MSS. 779 // TODO(gvisor.dev/issue/2833): Drain the held segments after a 780 // timeout. 781 if seg.payloadSize() < s.MaxPayloadSize && s.ep.ops.GetCorkOption() { 782 return false 783 } 784 } 785 } 786 787 // Assign flags. We don't do it above so that we can merge 788 // additional data if Nagle holds the segment. 789 seg.sequenceNumber = s.SndNxt 790 seg.flags = header.TCPFlagAck | header.TCPFlagPsh 791 } 792 793 var segEnd seqnum.Value 794 if seg.payloadSize() == 0 { 795 if s.writeList.Back() != seg { 796 panic("FIN segments must be the final segment in the write list.") 797 } 798 seg.flags = header.TCPFlagAck | header.TCPFlagFin 799 segEnd = seg.sequenceNumber.Add(1) 800 // Update the state to reflect that we have now 801 // queued a FIN. 802 s.ep.updateConnDirectionState(connDirectionStateSndClosed) 803 switch s.ep.EndpointState() { 804 case StateCloseWait: 805 s.ep.setEndpointState(StateLastAck) 806 default: 807 s.ep.setEndpointState(StateFinWait1) 808 } 809 } else { 810 // We're sending a non-FIN segment. 811 if seg.flags&header.TCPFlagFin != 0 { 812 panic("Netstack queues FIN segments without data.") 813 } 814 815 if !seg.sequenceNumber.LessThan(end) { 816 return false 817 } 818 819 available := int(seg.sequenceNumber.Size(end)) 820 if available == 0 { 821 return false 822 } 823 824 // If the whole segment or at least 1MSS sized segment cannot 825 // be accomodated in the receiver advertized window, skip 826 // splitting and sending of the segment. ref: 827 // net/ipv4/tcp_output.c::tcp_snd_wnd_test() 828 // 829 // Linux checks this for all segment transmits not triggered by 830 // a probe timer. On this condition, it defers the segment split 831 // and transmit to a short probe timer. 832 // 833 // ref: include/net/tcp.h::tcp_check_probe_timer() 834 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() 835 // 836 // Instead of defining a new transmit timer, we attempt to split 837 // the segment right here if there are no pending segments. If 838 // there are pending segments, segment transmits are deferred to 839 // the retransmit timer handler. 840 if s.SndUna != s.SndNxt { 841 switch { 842 case available >= seg.payloadSize(): 843 // OK to send, the whole segments fits in the 844 // receiver's advertised window. 845 case available >= s.MaxPayloadSize: 846 // OK to send, at least 1 MSS sized segment fits 847 // in the receiver's advertised window. 848 default: 849 return false 850 } 851 } 852 853 // The segment size limit is computed as a function of sender 854 // congestion window and MSS. When sender congestion window is > 855 // 1, this limit can be larger than MSS. Ensure that the 856 // currently available send space is not greater than minimum of 857 // this limit and MSS. 858 if available > limit { 859 available = limit 860 } 861 862 // If GSO is not in use then cap available to 863 // maxPayloadSize. When GSO is in use the gVisor GSO logic or 864 // the host GSO logic will cap the segment to the correct size. 865 if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize { 866 available = s.MaxPayloadSize 867 } 868 869 if seg.payloadSize() > available { 870 // A negative value causes splitSeg to panic anyways, so just panic 871 // earlier to get more information about the cause. 872 s.splitSeg(seg, available) 873 } 874 875 segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) 876 } 877 878 s.sendSegment(seg) 879 880 // Update sndNxt if we actually sent new data (as opposed to 881 // retransmitting some previously sent data). 882 if s.SndNxt.LessThan(segEnd) { 883 s.SndNxt = segEnd 884 } 885 886 return true 887 } 888 889 // +checklocks:s.ep.mu 890 func (s *sender) sendZeroWindowProbe() { 891 s.unackZeroWindowProbes++ 892 // Send a zero window probe with sequence number pointing to 893 // the last acknowledged byte. 894 s.sendEmptySegment(header.TCPFlagAck, s.SndUna-1) 895 // Rearm the timer to continue probing. 896 s.resendTimer.enable(s.RTO) 897 } 898 899 func (s *sender) enableZeroWindowProbing() { 900 s.zeroWindowProbing = true 901 // We piggyback the probing on the retransmit timer with the 902 // current retranmission interval, as we may start probing while 903 // segment retransmissions. 904 if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { 905 s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic() 906 } 907 s.resendTimer.enable(s.RTO) 908 } 909 910 func (s *sender) disableZeroWindowProbing() { 911 s.zeroWindowProbing = false 912 s.unackZeroWindowProbes = 0 913 s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{} 914 s.resendTimer.disable() 915 } 916 917 func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) { 918 if dataSent { 919 // We sent data, so we should stop the keepalive timer to ensure 920 // that no keepalives are sent while there is pending data. 921 s.ep.disableKeepaliveTimer() 922 } 923 924 // If the sender has advertized zero receive window and we have 925 // data to be sent out, start zero window probing to query the 926 // the remote for it's receive window size. 927 if s.writeNext != nil && s.SndWnd == 0 { 928 s.enableZeroWindowProbing() 929 } 930 931 // If we have no more pending data, start the keepalive timer. 932 if s.SndUna == s.SndNxt { 933 s.ep.resetKeepaliveTimer(false) 934 } else { 935 // Enable timers if we have pending data. 936 if shouldScheduleProbe && s.shouldSchedulePTO() { 937 // Schedule PTO after transmitting new data that wasn't itself a TLP probe. 938 s.schedulePTO() 939 } else if !s.resendTimer.enabled() { 940 s.probeTimer.disable() 941 if s.Outstanding > 0 { 942 // Enable the resend timer if it's not enabled yet and there is 943 // outstanding data. 944 s.resendTimer.enable(s.RTO) 945 } 946 } 947 } 948 } 949 950 // sendData sends new data segments. It is called when data becomes available or 951 // when the send window opens up. 952 // +checklocks:s.ep.mu 953 func (s *sender) sendData() { 954 limit := s.MaxPayloadSize 955 if s.gso { 956 limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize) 957 } 958 end := s.SndUna.Add(s.SndWnd) 959 960 // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. 961 // "A TCP SHOULD set cwnd to no more than RW before beginning 962 // transmission if the TCP has not sent data in the interval exceeding 963 // the retrasmission timeout." 964 if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO { 965 if s.SndCwnd > InitialCwnd { 966 s.SndCwnd = InitialCwnd 967 } 968 } 969 970 var dataSent bool 971 for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() { 972 cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize 973 if cwndLimit < limit { 974 limit = cwndLimit 975 } 976 if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 977 // Move writeNext along so that we don't try and scan data that 978 // has already been SACKED. 979 s.updateWriteNext(seg.Next()) 980 continue 981 } 982 if sent := s.maybeSendSegment(seg, limit, end); !sent { 983 break 984 } 985 dataSent = true 986 s.Outstanding += s.pCount(seg, s.MaxPayloadSize) 987 s.updateWriteNext(seg.Next()) 988 } 989 990 s.postXmit(dataSent, true /* shouldScheduleProbe */) 991 } 992 993 func (s *sender) enterRecovery() { 994 // Initialize the variables used to detect spurious recovery after 995 // entering recovery. 996 // 997 // See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1. 998 s.spuriousRecovery = false 999 s.retransmitTS = 0 1000 1001 s.FastRecovery.Active = true 1002 // Save state to reflect we're now in fast recovery. 1003 // 1004 // See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3. 1005 // We inflate the cwnd by 3 to account for the 3 packets which triggered 1006 // the 3 duplicate ACKs and are now not in flight. 1007 s.SndCwnd = s.Ssthresh + 3 1008 s.SackedOut = 0 1009 s.DupAckCount = 0 1010 s.FastRecovery.First = s.SndUna 1011 s.FastRecovery.Last = s.SndNxt - 1 1012 s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding 1013 s.FastRecovery.HighRxt = s.SndUna 1014 s.FastRecovery.RescueRxt = s.SndUna 1015 1016 // Record retransmitTS if the sender is not in recovery as per: 1017 // https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 1018 s.recordRetransmitTS() 1019 1020 if s.ep.SACKPermitted { 1021 s.state = tcpip.SACKRecovery 1022 s.ep.stack.Stats().TCP.SACKRecovery.Increment() 1023 // Set TLPRxtOut to false according to 1024 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. 1025 if s.rc.tlpRxtOut { 1026 // The tail loss probe triggered recovery. 1027 s.ep.stack.Stats().TCP.TLPRecovery.Increment() 1028 } 1029 s.rc.tlpRxtOut = false 1030 return 1031 } 1032 s.state = tcpip.FastRecovery 1033 s.ep.stack.Stats().TCP.FastRecovery.Increment() 1034 } 1035 1036 func (s *sender) leaveRecovery() { 1037 s.FastRecovery.Active = false 1038 s.FastRecovery.MaxCwnd = 0 1039 s.DupAckCount = 0 1040 1041 // Deflate cwnd. It had been artificially inflated when new dups arrived. 1042 s.SndCwnd = s.Ssthresh 1043 s.cc.PostRecovery() 1044 } 1045 1046 // isAssignedSequenceNumber relies on the fact that we only set flags once a 1047 // sequencenumber is assigned and that is only done right before we send the 1048 // segment. As a result any segment that has a non-zero flag has a valid 1049 // sequence number assigned to it. 1050 func (s *sender) isAssignedSequenceNumber(seg *segment) bool { 1051 return seg.flags != 0 1052 } 1053 1054 // SetPipe implements the SetPipe() function described in RFC6675. Netstack 1055 // maintains the congestion window in number of packets and not bytes, so 1056 // SetPipe() here measures number of outstanding packets rather than actual 1057 // outstanding bytes in the network. 1058 func (s *sender) SetPipe() { 1059 // If SACK isn't permitted or it is permitted but recovery is not active 1060 // then ignore pipe calculations. 1061 if !s.ep.SACKPermitted || !s.FastRecovery.Active { 1062 return 1063 } 1064 pipe := 0 1065 smss := seqnum.Size(s.ep.scoreboard.SMSS()) 1066 for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { 1067 // With GSO each segment can be much larger than SMSS. So check the segment 1068 // in SMSS sized ranges. 1069 segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize())) 1070 for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { 1071 endSeq := startSeq.Add(smss) 1072 if segEnd.LessThan(endSeq) { 1073 endSeq = segEnd 1074 } 1075 sb := header.SACKBlock{Start: startSeq, End: endSeq} 1076 // SetPipe(): 1077 // 1078 // After initializing pipe to zero, the following steps are 1079 // taken for each octet 'S1' in the sequence space between 1080 // HighACK and HighData that has not been SACKed: 1081 if !s1.sequenceNumber.LessThan(s.SndNxt) { 1082 break 1083 } 1084 if s.ep.scoreboard.IsSACKED(sb) { 1085 continue 1086 } 1087 1088 // SetPipe(): 1089 // 1090 // (a) If IsLost(S1) returns false, Pipe is incremened by 1. 1091 // 1092 // NOTE: here we mark the whole segment as lost. We do not try 1093 // and test every byte in our write buffer as we maintain our 1094 // pipe in terms of oustanding packets and not bytes. 1095 if !s.ep.scoreboard.IsRangeLost(sb) { 1096 pipe++ 1097 } 1098 // SetPipe(): 1099 // (b) If S1 <= HighRxt, Pipe is incremented by 1. 1100 if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) { 1101 pipe++ 1102 } 1103 } 1104 } 1105 s.Outstanding = pipe 1106 } 1107 1108 // shouldEnterRecovery returns true if the sender should enter fast recovery 1109 // based on dupAck count and sack scoreboard. 1110 // See RFC 6675 section 5. 1111 func (s *sender) shouldEnterRecovery() bool { 1112 return s.DupAckCount >= nDupAckThreshold || 1113 (s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna)) 1114 } 1115 1116 // detectLoss is called when an ack is received and returns whether a loss is 1117 // detected. It manages the state related to duplicate acks and determines if 1118 // a retransmit is needed according to the rules in RFC 6582 (NewReno). 1119 func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { 1120 // We're not in fast recovery yet. 1121 1122 // If RACK is enabled and there is no reordering we should honor the 1123 // three duplicate ACK rule to enter recovery. 1124 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4 1125 if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1126 if s.rc.Reord { 1127 return false 1128 } 1129 } 1130 1131 if !s.isDupAck(seg) { 1132 s.DupAckCount = 0 1133 return false 1134 } 1135 1136 s.DupAckCount++ 1137 1138 // Do not enter fast recovery until we reach nDupAckThreshold or the 1139 // first unacknowledged byte is considered lost as per SACK scoreboard. 1140 if !s.shouldEnterRecovery() { 1141 // RFC 6675 Step 3. 1142 s.FastRecovery.HighRxt = s.SndUna - 1 1143 // Do run SetPipe() to calculate the outstanding segments. 1144 s.SetPipe() 1145 s.state = tcpip.Disorder 1146 return false 1147 } 1148 1149 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 1150 // 1151 // We only do the check here, the incrementing of last to the highest 1152 // sequence number transmitted till now is done when enterRecovery 1153 // is invoked. 1154 // 1155 // Note that we only enter recovery when at least one more byte of data 1156 // beyond s.fr.last (the highest byte that was outstanding when fast 1157 // retransmit was last entered) is acked. 1158 if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) { 1159 s.DupAckCount = 0 1160 return false 1161 } 1162 s.cc.HandleLossDetected() 1163 s.enterRecovery() 1164 return true 1165 } 1166 1167 // isDupAck determines if seg is a duplicate ack as defined in 1168 // https://tools.ietf.org/html/rfc5681#section-2. 1169 func (s *sender) isDupAck(seg *segment) bool { 1170 // A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883] 1171 // can leverage the SACK information to determine when an incoming ACK is a 1172 // "duplicate" (e.g., if the ACK contains previously unknown SACK 1173 // information). 1174 if s.ep.SACKPermitted && !seg.hasNewSACKInfo { 1175 return false 1176 } 1177 1178 // (a) The receiver of the ACK has outstanding data. 1179 return s.SndUna != s.SndNxt && 1180 // (b) The incoming acknowledgment carries no data. 1181 seg.logicalLen() == 0 && 1182 // (c) The SYN and FIN bits are both off. 1183 !seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) && 1184 // (d) the ACK number is equal to the greatest acknowledgment received on 1185 // the given connection (TCP.UNA from RFC793). 1186 seg.ackNumber == s.SndUna && 1187 // (e) the advertised window in the incoming acknowledgment equals the 1188 // advertised window in the last incoming acknowledgment. 1189 s.SndWnd == seg.window 1190 } 1191 1192 // Iterate the writeList and update RACK for each segment which is newly acked 1193 // either cumulatively or selectively. Loop through the segments which are 1194 // sacked, and update the RACK related variables and check for reordering. 1195 // Returns true when the DSACK block has been detected in the received ACK. 1196 // 1197 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 1198 // steps 2 and 3. 1199 func (s *sender) walkSACK(rcvdSeg *segment) bool { 1200 s.rc.setDSACKSeen(false) 1201 1202 // Look for DSACK block. 1203 hasDSACK := false 1204 idx := 0 1205 n := len(rcvdSeg.parsedOptions.SACKBlocks) 1206 if checkDSACK(rcvdSeg) { 1207 dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0] 1208 numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize) 1209 // numDSACK can be zero when DSACK is sent for subsegments. 1210 if numDSACK < 1 { 1211 numDSACK = 1 1212 } 1213 s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK) 1214 s.rc.setDSACKSeen(true) 1215 idx = 1 1216 n-- 1217 hasDSACK = true 1218 } 1219 1220 if n == 0 { 1221 return hasDSACK 1222 } 1223 1224 // Sort the SACK blocks. The first block is the most recent unacked 1225 // block. The following blocks can be in arbitrary order. 1226 sackBlocks := make([]header.SACKBlock, n) 1227 copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:]) 1228 sort.Slice(sackBlocks, func(i, j int) bool { 1229 return sackBlocks[j].Start.LessThan(sackBlocks[i].Start) 1230 }) 1231 1232 seg := s.writeList.Front() 1233 for _, sb := range sackBlocks { 1234 for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 { 1235 if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked { 1236 s.rc.update(seg, rcvdSeg) 1237 s.rc.detectReorder(seg) 1238 seg.acked = true 1239 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 1240 } 1241 seg = seg.Next() 1242 } 1243 } 1244 return hasDSACK 1245 } 1246 1247 // checkDSACK checks if a DSACK is reported. 1248 func checkDSACK(rcvdSeg *segment) bool { 1249 n := len(rcvdSeg.parsedOptions.SACKBlocks) 1250 if n == 0 { 1251 return false 1252 } 1253 1254 sb := rcvdSeg.parsedOptions.SACKBlocks[0] 1255 // Check if SACK block is invalid. 1256 if sb.End.LessThan(sb.Start) { 1257 return false 1258 } 1259 1260 // See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in 1261 // at most one SACK block. DSACK is detected in the below two cases: 1262 // * If the SACK sequence space is less than this cumulative ACK, it is 1263 // an indication that the segment identified by the SACK block has 1264 // been received more than once by the receiver. 1265 // * If the sequence space in the first SACK block is greater than the 1266 // cumulative ACK, then the sender next compares the sequence space 1267 // in the first SACK block with the sequence space in the second SACK 1268 // block, if there is one. This comparison can determine if the first 1269 // SACK block is reporting duplicate data that lies above the 1270 // cumulative ACK. 1271 if sb.Start.LessThan(rcvdSeg.ackNumber) { 1272 return true 1273 } 1274 1275 if n > 1 { 1276 sb1 := rcvdSeg.parsedOptions.SACKBlocks[1] 1277 if sb1.End.LessThan(sb1.Start) { 1278 return false 1279 } 1280 1281 // If the first SACK block is fully covered by second SACK 1282 // block, then the first block is a DSACK block. 1283 if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) { 1284 return true 1285 } 1286 } 1287 1288 return false 1289 } 1290 1291 func (s *sender) recordRetransmitTS() { 1292 // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 1293 // 1294 // The Eifel detection algorithm is used, only upon initiation of loss 1295 // recovery, i.e., when either the timeout-based retransmit or the fast 1296 // retransmit is sent. The Eifel detection algorithm MUST NOT be 1297 // reinitiated after loss recovery has already started. In particular, 1298 // it must not be reinitiated upon subsequent timeouts for the same 1299 // segment, and not upon retransmitting segments other than the oldest 1300 // outstanding segment, e.g., during selective loss recovery. 1301 if s.inRecovery() { 1302 return 1303 } 1304 1305 // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 1306 // 1307 // Set a "RetransmitTS" variable to the value of the Timestamp Value 1308 // field of the Timestamps option included in the retransmit sent when 1309 // loss recovery is initiated. A TCP sender must ensure that 1310 // RetransmitTS does not get overwritten as loss recovery progresses, 1311 // e.g., in case of a second timeout and subsequent second retransmit of 1312 // the same octet. 1313 s.retransmitTS = s.ep.tsValNow() 1314 } 1315 1316 func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) { 1317 // Return if the sender has already detected spurious recovery. 1318 if s.spuriousRecovery { 1319 return 1320 } 1321 1322 // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4 1323 // 1324 // If the value of the Timestamp Echo Reply field of the acceptable ACK's 1325 // Timestamps option is smaller than the value of RetransmitTS, then 1326 // proceed to next step, else return. 1327 if tsEchoReply >= s.retransmitTS { 1328 return 1329 } 1330 1331 // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5 1332 // 1333 // If the acceptable ACK carries a DSACK option [RFC2883], then return. 1334 if hasDSACK { 1335 return 1336 } 1337 1338 // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5 1339 // 1340 // If during the lifetime of the TCP connection the TCP sender has 1341 // previously received an ACK with a DSACK option, or the acceptable ACK 1342 // does not acknowledge all outstanding data, then proceed to next step, 1343 // else return. 1344 numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value() 1345 if numDSACK == 0 && s.SndUna == s.SndNxt { 1346 return 1347 } 1348 1349 // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6 1350 // 1351 // If the loss recovery has been initiated with a timeout-based 1352 // retransmit, then set 1353 // SpuriousRecovery <- SPUR_TO (equal 1), 1354 // else set 1355 // SpuriousRecovery <- dupacks+1 1356 // Set the spurious recovery variable to true as we do not differentiate 1357 // between fast, SACK or RTO recovery. 1358 s.spuriousRecovery = true 1359 s.ep.stack.Stats().TCP.SpuriousRecovery.Increment() 1360 1361 // RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and 1362 // timeout). Increment the metric for RTO only as we want to track the 1363 // number of timeout recoveries. 1364 if s.state == tcpip.RTORecovery { 1365 s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment() 1366 } 1367 } 1368 1369 // Check if the sender is in RTORecovery, FastRecovery or SACKRecovery state. 1370 func (s *sender) inRecovery() bool { 1371 if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery { 1372 return true 1373 } 1374 return false 1375 } 1376 1377 // handleRcvdSegment is called when a segment is received; it is responsible for 1378 // updating the send-related state. 1379 // +checklocks:s.ep.mu 1380 // +checklocksalias:s.rc.snd.ep.mu=s.ep.mu 1381 func (s *sender) handleRcvdSegment(rcvdSeg *segment) { 1382 // Check if we can extract an RTT measurement from this ack. 1383 if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) { 1384 s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime)) 1385 s.RTTMeasureSeqNum = s.SndNxt 1386 } 1387 1388 // Update Timestamp if required. See RFC7323, section-4.3. 1389 if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS { 1390 s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber) 1391 } 1392 1393 // Insert SACKBlock information into our scoreboard. 1394 hasDSACK := false 1395 if s.ep.SACKPermitted { 1396 for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { 1397 // Only insert the SACK block if the following holds 1398 // true: 1399 // * SACK block acks data after the ack number in the 1400 // current segment. 1401 // * SACK block represents a sequence 1402 // between sndUna and sndNxt (i.e. data that is 1403 // currently unacked and in-flight). 1404 // * SACK block that has not been SACKed already. 1405 // 1406 // NOTE: This check specifically excludes DSACK blocks 1407 // which have start/end before sndUna and are used to 1408 // indicate spurious retransmissions. 1409 if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) { 1410 s.ep.scoreboard.Insert(sb) 1411 rcvdSeg.hasNewSACKInfo = true 1412 } 1413 } 1414 1415 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08 1416 // section-7.2 1417 // * Step 2: Update RACK stats. 1418 // If the ACK is not ignored as invalid, update the RACK.rtt 1419 // to be the RTT sample calculated using this ACK, and 1420 // continue. If this ACK or SACK was for the most recently 1421 // sent packet, then record the RACK.xmit_ts timestamp and 1422 // RACK.end_seq sequence implied by this ACK. 1423 // * Step 3: Detect packet reordering. 1424 // If the ACK selectively or cumulatively acknowledges an 1425 // unacknowledged and also never retransmitted sequence below 1426 // RACK.fack, then the corresponding packet has been 1427 // reordered and RACK.reord is set to TRUE. 1428 if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1429 hasDSACK = s.walkSACK(rcvdSeg) 1430 } 1431 s.SetPipe() 1432 } 1433 1434 ack := rcvdSeg.ackNumber 1435 fastRetransmit := false 1436 // Do not leave fast recovery, if the ACK is out of range. 1437 if s.FastRecovery.Active { 1438 // Leave fast recovery if it acknowledges all the data covered by 1439 // this fast recovery session. 1440 if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) { 1441 s.leaveRecovery() 1442 } 1443 } else { 1444 // Detect loss by counting the duplicates and enter recovery. 1445 fastRetransmit = s.detectLoss(rcvdSeg) 1446 } 1447 1448 // See if TLP based recovery was successful. 1449 if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1450 s.detectTLPRecovery(ack, rcvdSeg) 1451 } 1452 1453 // Stash away the current window size. 1454 s.SndWnd = rcvdSeg.window 1455 1456 // Disable zero window probing if remote advertizes a non-zero receive 1457 // window. This can be with an ACK to the zero window probe (where the 1458 // acknumber refers to the already acknowledged byte) OR to any previously 1459 // unacknowledged segment. 1460 if s.zeroWindowProbing && rcvdSeg.window > 0 && 1461 (ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) { 1462 s.disableZeroWindowProbing() 1463 } 1464 1465 // On receiving the ACK for the zero window probe, account for it and 1466 // skip trying to send any segment as we are still probing for 1467 // receive window to become non-zero. 1468 if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna { 1469 s.unackZeroWindowProbes-- 1470 return 1471 } 1472 1473 // Ignore ack if it doesn't acknowledge any new data. 1474 if (ack - 1).InRange(s.SndUna, s.SndNxt) { 1475 s.DupAckCount = 0 1476 1477 // See : https://tools.ietf.org/html/rfc1323#section-3.3. 1478 // Specifically we should only update the RTO using TSEcr if the 1479 // following condition holds: 1480 // 1481 // A TSecr value received in a segment is used to update the 1482 // averaged RTT measurement only if the segment acknowledges 1483 // some new data, i.e., only if it advances the left edge of 1484 // the send window. 1485 if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { 1486 s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr)) 1487 } 1488 1489 if s.shouldSchedulePTO() { 1490 // Schedule PTO upon receiving an ACK that cumulatively acknowledges data. 1491 // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. 1492 s.schedulePTO() 1493 } else { 1494 // When an ack is received we must rearm the timer. 1495 // RFC 6298 5.3 1496 s.probeTimer.disable() 1497 s.resendTimer.enable(s.RTO) 1498 } 1499 1500 // Remove all acknowledged data from the write list. 1501 acked := s.SndUna.Size(ack) 1502 s.SndUna = ack 1503 ackLeft := acked 1504 originalOutstanding := s.Outstanding 1505 for ackLeft > 0 { 1506 // We use logicalLen here because we can have FIN 1507 // segments (which are always at the end of list) that 1508 // have no data, but do consume a sequence number. 1509 seg := s.writeList.Front() 1510 datalen := seg.logicalLen() 1511 1512 if datalen > ackLeft { 1513 prevCount := s.pCount(seg, s.MaxPayloadSize) 1514 seg.TrimFront(ackLeft) 1515 seg.sequenceNumber.UpdateForward(ackLeft) 1516 s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize) 1517 break 1518 } 1519 1520 if s.writeNext == seg { 1521 s.updateWriteNext(seg.Next()) 1522 } 1523 1524 // Update the RACK fields if SACK is enabled. 1525 if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1526 s.rc.update(seg, rcvdSeg) 1527 s.rc.detectReorder(seg) 1528 } 1529 1530 s.writeList.Remove(seg) 1531 1532 // If SACK is enabled then only reduce outstanding if 1533 // the segment was not previously SACKED as these have 1534 // already been accounted for in SetPipe(). 1535 if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 1536 s.Outstanding -= s.pCount(seg, s.MaxPayloadSize) 1537 } else { 1538 s.SackedOut -= s.pCount(seg, s.MaxPayloadSize) 1539 } 1540 seg.DecRef() 1541 ackLeft -= datalen 1542 } 1543 1544 // Clear SACK information for all acked data. 1545 s.ep.scoreboard.Delete(s.SndUna) 1546 1547 // Detect if the sender entered recovery spuriously. 1548 if s.inRecovery() { 1549 s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr) 1550 } 1551 1552 // If we are not in fast recovery then update the congestion 1553 // window based on the number of acknowledged packets. 1554 if !s.FastRecovery.Active { 1555 s.cc.Update(originalOutstanding - s.Outstanding) 1556 if s.FastRecovery.Last.LessThan(s.SndUna) { 1557 s.state = tcpip.Open 1558 // Update RACK when we are exiting fast or RTO 1559 // recovery as described in the RFC 1560 // draft-ietf-tcpm-rack-08 Section-7.2 Step 4. 1561 if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1562 s.rc.exitRecovery() 1563 } 1564 s.reorderTimer.disable() 1565 } 1566 } 1567 1568 // Update the send buffer usage and notify potential waiters. 1569 s.ep.updateSndBufferUsage(int(acked)) 1570 1571 // It is possible for s.outstanding to drop below zero if we get 1572 // a retransmit timeout, reset outstanding to zero but later 1573 // get an ack that cover previously sent data. 1574 if s.Outstanding < 0 { 1575 s.Outstanding = 0 1576 } 1577 1578 s.SetPipe() 1579 1580 // If all outstanding data was acknowledged the disable the timer. 1581 // RFC 6298 Rule 5.3 1582 if s.SndUna == s.SndNxt { 1583 s.Outstanding = 0 1584 // Reset firstRetransmittedSegXmitTime to the zero value. 1585 s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{} 1586 s.resendTimer.disable() 1587 s.probeTimer.disable() 1588 } 1589 } 1590 1591 if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { 1592 // Update RACK reorder window. 1593 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 1594 // * Upon receiving an ACK: 1595 // * Step 4: Update RACK reordering window 1596 s.rc.updateRACKReorderWindow() 1597 1598 // After the reorder window is calculated, detect any loss by checking 1599 // if the time elapsed after the segments are sent is greater than the 1600 // reorder window. 1601 if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active { 1602 // If any segment is marked as lost by 1603 // RACK, enter recovery and retransmit 1604 // the lost segments. 1605 s.cc.HandleLossDetected() 1606 s.enterRecovery() 1607 fastRetransmit = true 1608 } 1609 1610 if s.FastRecovery.Active { 1611 s.rc.DoRecovery(nil, fastRetransmit) 1612 } 1613 } 1614 1615 // Now that we've popped all acknowledged data from the retransmit 1616 // queue, retransmit if needed. 1617 if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 { 1618 s.lr.DoRecovery(rcvdSeg, fastRetransmit) 1619 // When SACK is enabled data sending is governed by steps in 1620 // RFC 6675 Section 5 recovery steps A-C. 1621 // See: https://tools.ietf.org/html/rfc6675#section-5. 1622 if s.ep.SACKPermitted { 1623 return 1624 } 1625 } 1626 1627 // Send more data now that some of the pending data has been ack'd, or 1628 // that the window opened up, or the congestion window was inflated due 1629 // to a duplicate ack during fast recovery. This will also re-enable 1630 // the retransmit timer if needed. 1631 s.sendData() 1632 } 1633 1634 // sendSegment sends the specified segment. 1635 // +checklocks:s.ep.mu 1636 func (s *sender) sendSegment(seg *segment) tcpip.Error { 1637 if seg.xmitCount > 0 { 1638 s.ep.stack.Stats().TCP.Retransmits.Increment() 1639 s.ep.stats.SendErrors.Retransmits.Increment() 1640 if s.SndCwnd < s.Ssthresh { 1641 s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() 1642 } 1643 } 1644 seg.xmitTime = s.ep.stack.Clock().NowMonotonic() 1645 seg.xmitCount++ 1646 seg.lost = false 1647 1648 err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber) 1649 1650 // Every time a packet containing data is sent (including a 1651 // retransmission), if SACK is enabled and we are retransmitting data 1652 // then use the conservative timer described in RFC6675 Section 6.0, 1653 // otherwise follow the standard time described in RFC6298 Section 5.1. 1654 if err != nil && seg.payloadSize() != 0 { 1655 if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted { 1656 s.resendTimer.enable(s.RTO) 1657 } else { 1658 if !s.resendTimer.enabled() { 1659 s.resendTimer.enable(s.RTO) 1660 } 1661 } 1662 } 1663 1664 return err 1665 } 1666 1667 // sendSegmentFromPacketBuffer sends a new segment containing the given payload, 1668 // flags and sequence number. 1669 // +checklocks:s.ep.mu 1670 // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu 1671 func (s *sender) sendSegmentFromPacketBuffer(pkt stack.PacketBufferPtr, flags header.TCPFlags, seq seqnum.Value) tcpip.Error { 1672 s.LastSendTime = s.ep.stack.Clock().NowMonotonic() 1673 if seq == s.RTTMeasureSeqNum { 1674 s.RTTMeasureTime = s.LastSendTime 1675 } 1676 1677 rcvNxt, rcvWnd := s.ep.rcv.getSendParams() 1678 1679 // Remember the max sent ack. 1680 s.MaxSentAck = rcvNxt 1681 1682 // We need to clone the packet because sendRaw takes ownership of pkt, 1683 // and pkt could be reprocessed later on (i.e retrasmission). 1684 pkt = pkt.Clone() 1685 defer pkt.DecRef() 1686 1687 return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd) 1688 } 1689 1690 // sendEmptySegment sends a new empty segment, flags and sequence number. 1691 // +checklocks:s.ep.mu 1692 // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu 1693 func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error { 1694 s.LastSendTime = s.ep.stack.Clock().NowMonotonic() 1695 if seq == s.RTTMeasureSeqNum { 1696 s.RTTMeasureTime = s.LastSendTime 1697 } 1698 1699 rcvNxt, rcvWnd := s.ep.rcv.getSendParams() 1700 1701 // Remember the max sent ack. 1702 s.MaxSentAck = rcvNxt 1703 1704 return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd) 1705 } 1706 1707 // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited 1708 // currently. 1709 // +checklocks:s.ep.mu 1710 func (s *sender) maybeSendOutOfWindowAck(seg *segment) { 1711 // Data packets are unlikely to be part of an ACK loop. So always send 1712 // an ACK for a packet w/ data. 1713 if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { 1714 s.sendAck() 1715 } 1716 } 1717 1718 func (s *sender) updateWriteNext(seg *segment) { 1719 if s.writeNext != nil { 1720 s.writeNext.DecRef() 1721 } 1722 if seg != nil { 1723 seg.IncRef() 1724 } 1725 s.writeNext = seg 1726 }