// gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/snd.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"fmt"
	"math"
	"sort"
	"time"

	"gvisor.dev/gvisor/pkg/buffer"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// MinRTO is the minimum allowed value for the retransmit timeout.
	MinRTO = 200 * time.Millisecond

	// MaxRTO is the maximum allowed value for the retransmit timeout.
	MaxRTO = 120 * time.Second

	// MinSRTT is the minimum allowed value for smoothed RTT.
	MinSRTT = 1 * time.Millisecond

	// InitialCwnd is the initial congestion window, counted in packets.
	InitialCwnd = 10

	// nDupAckThreshold is the number of duplicate ACKs required
	// before fast-retransmit is entered.
	nDupAckThreshold = 3

	// MaxRetries is the maximum number of probe retries sender does
	// before timing out the connection.
	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
	MaxRetries = 15

	// InitialSsthresh is the maximum int value, which depends on the
	// platform.
	InitialSsthresh = math.MaxInt

	// unknownRTT is used to indicate to congestion control algorithms that we
	// were unable to measure the round-trip time when processing ACKs.
	// Algorithms (such as HyStart) that use the round-trip time should ignore
	// such Updates.
	unknownRTT = time.Duration(-1)
)

// congestionControl is an interface that must be implemented by any supported
// congestion control algorithm.
type congestionControl interface {
	// HandleLossDetected is invoked when the loss is detected by RACK or
	// sender.dupAckCount >= nDupAckThreshold just before entering fast
	// retransmit.
	HandleLossDetected()

	// HandleRTOExpired is invoked when the retransmit timer expires.
	HandleRTOExpired()

	// Update is invoked when processing inbound acks. It's passed the
	// number of packets that were acked by the most recent cumulative
	// acknowledgement. rtt is the round-trip time, or is set to unknownRTT
	// (above) to indicate the time is unknown.
	Update(packetsAcked int, rtt time.Duration)

	// PostRecovery is invoked when the sender is exiting a fast retransmit/
	// recovery phase. This provides congestion control algorithms a way
	// to adjust their state when exiting recovery.
	PostRecovery()
}

// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted. The cumulative or selective ACK is passed along
	// with the flag which identifies whether the connection entered fast
	// retransmit with this ACK and to retransmit the first unacknowledged
	// segment.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}

// sender holds the state necessary to send TCP segments.
//
// +stateify savable
type sender struct {
	// TCPSenderState carries the exported sender fields (SndUna, SndNxt,
	// SndWnd, RTO, MaxPayloadSize, FastRecovery, ...) used throughout this
	// file.
	stack.TCPSenderState

	// ep is the endpoint this sender belongs to.
	ep *Endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeNext is the next segment to write that hasn't already been
	// written, i.e. the first payload starting at SND.NXT.
	writeNext *segment

	// writeList holds all writable data: both unsent data and
	// sent-but-unacknowledged data. Alternatively: it holds all bytes
	// starting from SND.UNA.
	writeList segmentList

	// resendTimer is used for RTOs.
	resendTimer timer `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer `state:"nosave"`

	// probeTimer is used to schedule PTO for RACK TLP algorithm.
	probeTimer timer `state:"nosave"`

	// spuriousRecovery indicates whether the sender entered recovery
	// spuriously as described in RFC3522 Section 3.2.
	spuriousRecovery bool

	// retransmitTS is the timestamp at which the sender sends retransmitted
	// segment after entering an RTO for the first time as described in
	// RFC3522 Section 3.2.
	retransmitTS uint32

	// startCork is set while segments are being held back because of
	// TCP_CORK; it is cleared once a corked segment is released.
	startCork bool

	// corkTimer is used to drain the segments which are held when TCP_CORK
	// option is enabled.
	corkTimer timer `state:"nosave"`
}

// rtt is a synchronization wrapper used to appease stateify. See the comment
// in sender, where it is used.
//
// +stateify savable
type rtt struct {
	// Mutex guards the embedded TCPRTTState; updateRTO holds it while
	// recomputing SRTT and RTTVar.
	sync.Mutex `state:"nosave"`

	stack.TCPRTTState
}

// newSender creates the sender for ep.
//
// iss and irs are the initial send and receive sequence numbers: SND.UNA,
// SND.NXT and RTTMeasureSeqNum start at iss+1 and MaxSentAck at irs+1. sndWnd
// is the initial send window. mss is the maximum segment size, from which the
// endpoint's maximum option size is subtracted to obtain the payload size.
// sndWndScale is the send window scale and is stored only when positive.
//
// newSender panics if the stack-wide minRTO, maxRTO or maxRetries options
// cannot be read.
//
// +checklocks:ep.mu
func newSender(ep *Endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
	// The sender MUST reduce the TCP data length to account for any IP or
	// TCP options that it is including in the packets that it sends.
	// See: https://tools.ietf.org/html/rfc6691#section-2
	maxPayloadSize := int(mss) - ep.maxOptionSize()

	s := &sender{
		ep: ep,
		TCPSenderState: stack.TCPSenderState{
			SndWnd:           sndWnd,
			SndUna:           iss + 1,
			SndNxt:           iss + 1,
			RTTMeasureSeqNum: iss + 1,
			LastSendTime:     ep.stack.Clock().NowMonotonic(),
			MaxPayloadSize:   maxPayloadSize,
			MaxSentAck:       irs + 1,
			FastRecovery: stack.TCPFastRecoveryState{
				// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
				Last:      iss,
				HighRxt:   iss,
				RescueRxt: iss,
			},
			// RTO starts at one second; updateRTO replaces it once a
			// round-trip measurement is available.
			RTO: 1 * time.Second,
		},
		gso: ep.gso.Type != stack.GSONone,
	}

	if s.gso {
		s.ep.gso.MSS = uint16(maxPayloadSize)
	}

	// initCongestionControl also resets SndCwnd and Ssthresh.
	s.cc = s.initCongestionControl(ep.cc)
	s.lr = s.initLossRecovery()
	s.rc.init(s, iss)

	// A negative sndWndScale means that no scaling is in use, otherwise we
	// store the scaling value. A scale of zero leaves SndWndScale at its
	// zero value as well.
	if sndWndScale > 0 {
		s.SndWndScale = uint8(sndWndScale)
	}

	s.resendTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.retransmitTimerExpired))
	s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired))
	s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired))
	s.corkTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.corkTimerExpired))

	// NOTE(review): presumably this tells the lock checker that the lock
	// held on ep is the one updateMaxPayloadSize expects on s.ep — confirm.
	s.ep.AssertLockHeld(ep)
	// A count of zero only lowers MaxPayloadSize to fit the route MTU; no
	// retransmission is attempted.
	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
	// Initialize SACK Scoreboard after updating max payload size as we use
	// the maxPayloadSize as the smss when determining if a segment is lost
	// etc.
	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)

	// Get Stack wide config.
	var minRTO tcpip.TCPMinRTOOption
	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
	}
	s.minRTO = time.Duration(minRTO)

	var maxRTO tcpip.TCPMaxRTOOption
	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
	}
	s.maxRTO = time.Duration(maxRTO)

	var maxRetries tcpip.TCPMaxRetriesOption
	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
	}
	s.maxRetries = uint32(maxRetries)

	return s
}

// initCongestionControl initializes the specified congestion control module and
// returns a handle to it. It also initializes the sndCwnd and sndSsThresh to
// their initial values.
//
// Any name other than ccCubic, including ccReno, selects Reno.
func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
	s.SndCwnd = InitialCwnd
	s.Ssthresh = InitialSsthresh

	switch congestionControlName {
	case ccCubic:
		return newCubicCC(s)
	case ccReno:
		fallthrough
	default:
		return newRenoCC(s)
	}
}

// initLossRecovery initiates the loss recovery algorithm for the sender: SACK
// based recovery when the endpoint has SACKPermitted set, Reno recovery
// otherwise.
func (s *sender) initLossRecovery() lossRecovery {
	if s.ep.SACKPermitted {
		return newSACKRecovery(s)
	}
	return newRenoRecovery(s)
}

// updateMaxPayloadSize updates the maximum payload size based on the given
// MTU. If this is in response to "packet too big" control packets (indicated
// by the count argument), it also reduces the number of outstanding packets and
// attempts to retransmit the first packet above the MTU size.
298 // +checklocks:s.ep.mu 299 func (s *sender) updateMaxPayloadSize(mtu, count int) { 300 m := mtu - header.TCPMinimumSize 301 302 m -= s.ep.maxOptionSize() 303 304 // We don't adjust up for now. 305 if m >= s.MaxPayloadSize { 306 return 307 } 308 309 // Make sure we can transmit at least one byte. 310 if m <= 0 { 311 m = 1 312 } 313 314 oldMSS := s.MaxPayloadSize 315 s.MaxPayloadSize = m 316 if s.gso { 317 s.ep.gso.MSS = uint16(m) 318 } 319 320 if count == 0 { 321 // updateMaxPayloadSize is also called when the sender is created. 322 // and there is no data to send in such cases. Return immediately. 323 return 324 } 325 326 // Update the scoreboard's smss to reflect the new lowered 327 // maxPayloadSize. 328 s.ep.scoreboard.smss = uint16(m) 329 330 s.Outstanding -= count 331 if s.Outstanding < 0 { 332 s.Outstanding = 0 333 } 334 335 // Rewind writeNext to the first segment exceeding the MTU. Do nothing 336 // if it is already before such a packet. 337 nextSeg := s.writeNext 338 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 339 if seg == s.writeNext { 340 // We got to writeNext before we could find a segment 341 // exceeding the MTU. 342 break 343 } 344 345 if seg.payloadSize() > m { 346 // xmitCount is used for loss detection, but 347 // retransmission doesn't indicate congestion here, 348 // it's just PMTUD. 349 seg.xmitCount = 0 350 if nextSeg == s.writeNext { 351 // We found a segment exceeding the MTU. Rewind 352 // writeNext and try to retransmit it. 353 nextSeg = seg 354 } 355 } 356 357 if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 358 // Update sackedOut for new maximum payload size. 359 s.SackedOut -= s.pCount(seg, oldMSS) 360 s.SackedOut += s.pCount(seg, s.MaxPayloadSize) 361 } 362 } 363 364 // Since we likely reduced the number of outstanding packets, we may be 365 // ready to send some more. 366 s.updateWriteNext(nextSeg) 367 s.sendData() 368 } 369 370 // sendAck sends an ACK segment. 
371 // +checklocks:s.ep.mu 372 func (s *sender) sendAck() { 373 s.sendEmptySegment(header.TCPFlagAck, s.SndNxt) 374 } 375 376 // updateRTO updates the retransmit timeout when a new roud-trip time is 377 // available. This is done in accordance with section 2 of RFC 6298. 378 func (s *sender) updateRTO(rtt time.Duration) { 379 s.rtt.Lock() 380 if !s.rtt.TCPRTTState.SRTTInited { 381 s.rtt.TCPRTTState.RTTVar = rtt / 2 382 s.rtt.TCPRTTState.SRTT = rtt 383 s.rtt.TCPRTTState.SRTTInited = true 384 } else { 385 diff := s.rtt.TCPRTTState.SRTT - rtt 386 if diff < 0 { 387 diff = -diff 388 } 389 // Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when 390 // no timestamps are available. 391 if !s.ep.SendTSOk { 392 s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4 393 s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8 394 } else { 395 // When we are taking RTT measurements of every ACK then 396 // we need to use a modified method as specified in 397 // https://tools.ietf.org/html/rfc7323#appendix-G 398 if s.Outstanding == 0 { 399 s.rtt.Unlock() 400 return 401 } 402 // Netstack measures congestion window/inflight all in 403 // terms of packets and not bytes. This is similar to 404 // how linux also does cwnd and inflight. In practice 405 // this approximation works as expected. 406 expectedSamples := math.Ceil(float64(s.Outstanding) / 2) 407 408 // alpha & beta values are the original values as recommended in 409 // https://tools.ietf.org/html/rfc6298#section-2.3. 
410 const alpha = 0.125 411 const beta = 0.25 412 413 alphaPrime := alpha / expectedSamples 414 betaPrime := beta / expectedSamples 415 rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds() 416 srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds() 417 s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second)) 418 s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second)) 419 } 420 } 421 422 if s.rtt.TCPRTTState.SRTT < MinSRTT { 423 s.rtt.TCPRTTState.SRTT = MinSRTT 424 } 425 426 s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar 427 s.rtt.Unlock() 428 if s.RTO < s.minRTO { 429 s.RTO = s.minRTO 430 } 431 if s.RTO > s.maxRTO { 432 s.RTO = s.maxRTO 433 } 434 } 435 436 // resendSegment resends the first unacknowledged segment. 437 // +checklocks:s.ep.mu 438 func (s *sender) resendSegment() { 439 // Don't use any segments we already sent to measure RTT as they may 440 // have been affected by packets being lost. 441 s.RTTMeasureSeqNum = s.SndNxt 442 443 // Resend the segment. 444 if seg := s.writeList.Front(); seg != nil { 445 if seg.payloadSize() > s.MaxPayloadSize { 446 s.splitSeg(seg, s.MaxPayloadSize) 447 } 448 449 // See: RFC 6675 section 5 Step 4.3 450 // 451 // To prevent retransmission, set both the HighRXT and RescueRXT 452 // to the highest sequence number in the retransmitted segment. 453 s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 454 s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 455 s.sendSegment(seg) 456 s.ep.stack.Stats().TCP.FastRetransmit.Increment() 457 s.ep.stats.SendErrors.FastRetransmit.Increment() 458 459 // Run SetPipe() as per RFC 6675 section 5 Step 4.4 460 s.SetPipe() 461 } 462 } 463 464 // retransmitTimerExpired is called when the retransmit timer expires, and 465 // unacknowledged segments are assumed lost, and thus need to be resent. 
// It returns nil if the connection is still usable (including when the timer
// wake-up turns out to be spurious or there is nothing to resend), or a
// *tcpip.ErrTimeout if the connection is deemed lost.
// +checklocks:s.ep.mu
func (s *sender) retransmitTimerExpired() tcpip.Error {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if s.resendTimer.isUninitialized() || !s.resendTimer.checkExpiration() {
		return nil
	}

	// Initialize the variables used to detect spurious recovery after
	// entering RTO.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
	// when writeList is empty. Remove this once we have a proper fix for this
	// issue.
	if s.writeList.Front() == nil {
		return nil
	}

	s.ep.stack.Stats().TCP.Timeouts.Increment()
	s.ep.stats.SendErrors.Timeouts.Increment()

	// Set TLPRxtOut to false according to
	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
	s.rc.tlpRxtOut = false

	// Give up if a user timeout is set and it has been exceeded since the
	// first retransmission, or if too many zero window probes have gone
	// unacknowledged (checked below). Without a user timeout the time
	// budget is simply maxRTO.
	uto := s.ep.userTimeout

	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		// We store the original xmitTime of the segment that we are
		// about to retransmit as the retransmission time. This is
		// required as by the time the retransmitTimer has expired the
		// segment has already been sent and unacked for the RTO at the
		// time the segment was sent.
		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
	}

	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
	remaining := s.maxRTO
	if uto != 0 {
		// Cap to the user specified timeout if one is specified.
		remaining = uto - elapsed
	}

	// Always honor the user-timeout irrespective of whether the zero
	// window probes were acknowledged.
	// net/ipv4/tcp_timer.c::tcp_probe_timer()
	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	// Set new timeout. The timer will be restarted by the call to sendData
	// below.
	s.RTO *= 2
	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
	if s.RTO > s.maxRTO {
		s.RTO = s.maxRTO
	}

	// Cap RTO to remaining time.
	if s.RTO > remaining {
		s.RTO = remaining
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
	//
	// Retransmit timeouts:
	// After a retransmit timeout, record the highest sequence number
	// transmitted in the variable recover, and exit the fast recovery
	// procedure if applicable.
	s.FastRecovery.Last = s.SndNxt - 1

	if s.FastRecovery.Active {
		// We were attempting fast recovery but were not successful.
		// Leave the state. We don't need to update ssthresh because it
		// has already been updated when entered fast-recovery.
		s.leaveRecovery()
	}

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	s.recordRetransmitTS()

	s.state = tcpip.RTORecovery
	s.cc.HandleRTOExpired()

	// Mark the next segment to be sent as the first unacknowledged one and
	// start sending again. Set the number of outstanding packets to 0 so
	// that we'll be able to retransmit.
	//
	// We'll keep on transmitting (or retransmitting) as we get acks for
	// the data we transmit.
	s.Outstanding = 0

	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
	//
	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
	//  discard data that has already been selectively acknowledged. As a
	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
	//  information gathered from a receiver upon a retransmission timeout
	//  (RTO) "since the timeout might indicate that the data receiver has
	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
	//  information in determining which data to retransmit."
	//
	// NOTE: We take the stricter interpretation and just expunge all
	// information as we lack more rigorous checks to validate if the SACK
	// information is usable after an RTO.
	s.ep.scoreboard.Reset()
	s.updateWriteNext(s.writeList.Front())

	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
	// zero receive window after retransmission interval and we have data to
	// send.
	if s.zeroWindowProbing {
		s.sendZeroWindowProbe()
		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
		// indefinitely.  As long as the receiving TCP continues to send
		// acknowledgments in response to the probe segments, the sending TCP
		// MUST allow the connection to stay open.
		return nil
	}

	// writeNext was just rewound to the front of the write list above.
	seg := s.writeNext
	// RFC 1122 4.2.3.5: Close the connection when the number of
	// retransmissions for this segment is beyond a limit.
	if seg != nil && seg.xmitCount > s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	s.sendData()

	return nil
}

// pCount returns the number of packets in the segment. Due to GSO, a segment
// can be composed of multiple packets.
612 func (s *sender) pCount(seg *segment, maxPayloadSize int) int { 613 size := seg.payloadSize() 614 if size == 0 { 615 return 1 616 } 617 618 return (size-1)/maxPayloadSize + 1 619 } 620 621 // splitSeg splits a given segment at the size specified and inserts the 622 // remainder as a new segment after the current one in the write list. 623 func (s *sender) splitSeg(seg *segment, size int) { 624 if seg.payloadSize() <= size { 625 return 626 } 627 // Split this segment up. 628 nSeg := seg.clone() 629 nSeg.pkt.Data().TrimFront(size) 630 nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) 631 s.writeList.InsertAfter(seg, nSeg) 632 633 // The segment being split does not carry PUSH flag because it is 634 // followed by the newly split segment. 635 // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered 636 // segment (i.e., when there is no more queued data to be sent). 637 // Linux removes PSH flag only when the segment is being split over MSS 638 // and retains it when we are splitting the segment over lack of sender 639 // window space. 640 // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() 641 // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() 642 if seg.payloadSize() > s.MaxPayloadSize { 643 seg.flags ^= header.TCPFlagPsh 644 } 645 seg.pkt.Data().CapLength(size) 646 } 647 648 // NextSeg implements the RFC6675 NextSeg() operation. 649 // 650 // NextSeg starts scanning the writeList starting from nextSegHint and returns 651 // the hint to be passed on the next call to NextSeg. This is required to avoid 652 // iterating the write list repeatedly when NextSeg is invoked in a loop during 653 // recovery. The returned hint will be nil if there are no more segments that 654 // can match rules defined by NextSeg operation in RFC6675. 655 // 656 // rescueRtx will be true only if nextSeg is a rescue retransmission as 657 // described by Step 4) of the NextSeg algorithm. 
func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
	// s3 and s4 remember the candidates for rules (3) and (4) found while
	// looking for a rule (1) match.
	var s3 *segment
	var s4 *segment
	// Step 1.
	for seg := nextSegHint; seg != nil; seg = seg.Next() {
		// Stop iteration if we hit a segment that has never been
		// transmitted (i.e. either it has no assigned sequence number
		// or if it does have one, it's >= the next sequence number
		// to be sent [i.e. >= s.sndNxt]).
		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
			hint = nil
			break
		}
		segSeq := seg.sequenceNumber
		// Keep each candidate to at most one SMSS so that a returned
		// segment never exceeds it.
		if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) {
			s.splitSeg(seg, int(smss))
		}

		// See RFC 6675 Section 4
		//
		//     1. If there exists a smallest unSACKED sequence number
		//     'S2' that meets the following 3 criteria for determining
		//     loss, the sequence range of one segment of up to SMSS
		//     octets starting with S2 MUST be returned.
		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
			// NextSeg():
			//
			//    (1.a) S2 is greater than HighRxt
			//    (1.b) S2 is less than highest octet covered by
			//    any received SACK.
			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
				// NextSeg():
				//     (1.c) IsLost(S2) returns true.
				if s.ep.scoreboard.IsLost(segSeq) {
					return seg, seg.Next(), false
				}

				// NextSeg():
				//
				// (3): If the conditions for rules (1) and (2)
				// fail, but there exists an unSACKed sequence
				// number S3 that meets the criteria for
				// detecting loss given in steps 1.a and 1.b
				// above (specifically excluding (1.c)) then one
				// segment of up to SMSS octets starting with S3
				// SHOULD be returned.
				if s3 == nil {
					s3 = seg
					hint = seg.Next()
				}
			}
			// NextSeg():
			//
			//     (4) If the conditions for (1), (2) and (3) fail,
			//     but there exists outstanding unSACKED data, we
			//     provide the opportunity for a single "rescue"
			//     retransmission per entry into loss recovery. If
			//     HighACK is greater than RescueRxt (or RescueRxt
			//     is undefined), then one segment of up to SMSS
			//     octets that MUST include the highest outstanding
			//     unSACKed sequence number SHOULD be returned, and
			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
			//     be updated.
			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
				// Keep the unSACKed segment with the highest
				// sequence number seen so far.
				if s4 != nil {
					if s4.sequenceNumber.LessThan(segSeq) {
						s4 = seg
					}
				} else {
					s4 = seg
				}
			}
		}
	}

	// If we got here then no segment matched step (1).
	// Step (2): "If no sequence number 'S2' per rule (1)
	// exists but there exists available unsent data and the
	// receiver's advertised window allows, the sequence
	// range of one segment of up to SMSS octets of
	// previously unsent data starting with sequence number
	// HighData+1 MUST be returned."
	for seg := s.writeNext; seg != nil; seg = seg.Next() {
		// Skip anything that has already been transmitted.
		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
			continue
		}
		// We do not split the segment here to <= smss as it has
		// potentially not been assigned a sequence number yet.
		return seg, nil, false
	}

	if s3 != nil {
		return s3, hint, false
	}

	// NOTE(review): rescueRtx is reported as true here even when s4 is nil
	// (no candidate at all), so callers need to check nextSeg for nil
	// before relying on rescueRtx.
	return s4, nil, true
}

// maybeSendSegment tries to send the specified segment and either coalesces
// other segments into this one or splits the specified segment based on the
// lower of the specified limit value or the receivers window size specified by
// end.
// It returns true if the segment was handed to sendSegment and false if it was
// held back (Nagle, TCP_CORK, or not enough receive window).
// +checklocks:s.ep.mu
func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
	// We abuse the flags field to determine if we have already
	// assigned a sequence number to this segment.
	if !s.isAssignedSequenceNumber(seg) {
		// Merge segments if allowed.
		if seg.payloadSize() != 0 {
			available := int(s.SndNxt.Size(end))
			if available > limit {
				available = limit
			}

			// nextTooBig indicates that the next segment was too
			// large to entirely fit in the current segment. It
			// would be possible to split the next segment and merge
			// the portion that fits, but unexpectedly splitting
			// segments can have user visible side-effects which can
			// break applications. For example, RFC 7766 section 8
			// says that the length and data of a DNS response
			// should be sent in the same TCP segment to avoid
			// triggering bugs in poorly written DNS
			// implementations.
			var nextTooBig bool
			// Each merged nSeg is removed from the list, so the next
			// candidate is always seg.Next().
			for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() {
				if seg.payloadSize()+nSeg.payloadSize() > available {
					nextTooBig = true
					break
				}
				seg.merge(nSeg)
				s.writeList.Remove(nSeg)
				nSeg.DecRef()
			}
			if !nextTooBig && seg.payloadSize() < available {
				// Segment is not full.
				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
					// Nagle's algorithm. From Wikipedia:
					//   Nagle's algorithm works by
					//   combining a number of small
					//   outgoing messages and sending them
					//   all at once. Specifically, as long
					//   as there is a sent packet for which
					//   the sender has received no
					//   acknowledgment, the sender should
					//   keep buffering its output until it
					//   has a full packet's worth of
					//   output, thus allowing output to be
					//   sent all at once.
					return false
				}
				// With TCP_CORK, hold back until minimum of the available
				// send space and MSS.
				if s.ep.ops.GetCorkOption() {
					if seg.payloadSize() < s.MaxPayloadSize {
						if !s.startCork {
							s.startCork = true
							// Enable the timer for
							// 200ms, after which
							// the segments are drained.
							s.corkTimer.enable(MinRTO)
						}
						return false
					}
					// Disable the TCP_CORK timer.
					s.startCork = false
					s.corkTimer.disable()
				}
			}
		}

		// Assign flags. We don't do it above so that we can merge
		// additional data if Nagle holds the segment.
		seg.sequenceNumber = s.SndNxt
		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
	}

	var segEnd seqnum.Value
	if seg.payloadSize() == 0 {
		// An empty segment in the write list is a FIN.
		if s.writeList.Back() != seg {
			panic("FIN segments must be the final segment in the write list.")
		}
		seg.flags = header.TCPFlagAck | header.TCPFlagFin
		// The FIN occupies one sequence number.
		segEnd = seg.sequenceNumber.Add(1)
		// Update the state to reflect that we have now
		// queued a FIN.
		s.ep.updateConnDirectionState(connDirectionStateSndClosed)
		switch s.ep.EndpointState() {
		case StateCloseWait:
			s.ep.setEndpointState(StateLastAck)
		default:
			s.ep.setEndpointState(StateFinWait1)
		}
	} else {
		// We're sending a non-FIN segment.
		if seg.flags&header.TCPFlagFin != 0 {
			panic("Netstack queues FIN segments without data.")
		}

		// The segment starts at or beyond the end of the window.
		if !seg.sequenceNumber.LessThan(end) {
			return false
		}

		available := int(seg.sequenceNumber.Size(end))
		if available == 0 {
			return false
		}

		// If the whole segment or at least 1MSS sized segment cannot
		// be accommodated in the receiver advertised window, skip
		// splitting and sending of the segment. ref:
		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
		//
		// Linux checks this for all segment transmits not triggered by
		// a probe timer. On this condition, it defers the segment split
		// and transmit to a short probe timer.
		//
		// ref: include/net/tcp.h::tcp_check_probe_timer()
		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
		//
		// Instead of defining a new transmit timer, we attempt to split
		// the segment right here if there are no pending segments. If
		// there are pending segments, segment transmits are deferred to
		// the retransmit timer handler.
		if s.SndUna != s.SndNxt {
			switch {
			case available >= seg.payloadSize():
				// OK to send, the whole segments fits in the
				// receiver's advertised window.
			case available >= s.MaxPayloadSize:
				// OK to send, at least 1 MSS sized segment fits
				// in the receiver's advertised window.
			default:
				return false
			}
		}

		// The segment size limit is computed as a function of sender
		// congestion window and MSS. When sender congestion window is >
		// 1, this limit can be larger than MSS. Ensure that the
		// currently available send space is not greater than minimum of
		// this limit and MSS.
		if available > limit {
			available = limit
		}

		// If GSO is not in use then cap available to
		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
		// the host GSO logic will cap the segment to the correct size.
		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
			available = s.MaxPayloadSize
		}

		if seg.payloadSize() > available {
			// Only the first available bytes fit; split the segment
			// so that they go out now and the rest stays queued.
			// NOTE(review): an earlier comment here spoke of
			// panicking early on a negative value, but no such
			// check exists in this function.
			s.splitSeg(seg, available)
		}

		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
	}

	s.sendSegment(seg)

	// Update sndNxt if we actually sent new data (as opposed to
	// retransmitting some previously sent data).
	if s.SndNxt.LessThan(segEnd) {
		s.SndNxt = segEnd
	}

	return true
}

// zeroProbeJunk is data sent during zero window probes.
// Its value is irrelevant; since the sequence number has already been
// acknowledged it will be discarded. It's only here to avoid allocating.
var zeroProbeJunk = []byte{0}

// sendZeroWindowProbe sends one probe segment carrying zeroProbeJunk at
// sequence number SndUna-1, counts it in unackZeroWindowProbes and rearms the
// resend timer with the current RTO.
// +checklocks:s.ep.mu
func (s *sender) sendZeroWindowProbe() {
	s.unackZeroWindowProbes++

	// Send a zero window probe with sequence number pointing to the last
	// acknowledged byte. Note that, like Linux, this isn't quite what RFC
	// 9293 3.8.6.1 describes: we don't send the next byte in the stream,
	// we re-send an ACKed byte to goad the receiver into responding.
	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		Payload: buffer.MakeWithData(zeroProbeJunk),
	})
	defer pkt.DecRef()
	// The error returned by the send is not checked here.
	s.sendSegmentFromPacketBuffer(pkt, header.TCPFlagAck, s.SndUna-1)

	// Rearm the timer to continue probing.
	s.resendTimer.enable(s.RTO)
}

// enableZeroWindowProbing marks the sender as zero window probing, records
// the current monotonic time in firstRetransmittedSegXmitTime if that field is
// still unset, and arms the resend timer with the current RTO.
func (s *sender) enableZeroWindowProbing() {
	s.zeroWindowProbing = true
	// We piggyback the probing on the retransmit timer with the
	// current retransmission interval, as we may start probing while
	// segment retransmissions.
	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
	}
	s.resendTimer.enable(s.RTO)
}

// disableZeroWindowProbing clears the probing flag and the count of
// unacknowledged probes, resets firstRetransmittedSegXmitTime to its zero
// value and disables the resend timer.
func (s *sender) disableZeroWindowProbing() {
	s.zeroWindowProbing = false
	s.unackZeroWindowProbes = 0
	s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
	s.resendTimer.disable()
}

// postXmit adjusts the keepalive, zero window probe, PTO and resend timers
// after a transmit attempt. dataSent reports whether any data was sent;
// shouldScheduleProbe allows a PTO to be scheduled when shouldSchedulePTO()
// also agrees.
func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
	if dataSent {
		// We sent data, so we should stop the keepalive timer to ensure
		// that no keepalives are sent while there is pending data.
		s.ep.disableKeepaliveTimer()
	}

	// If the send window (SndWnd) is zero and we have data to be sent
	// out, start zero window probing to query the remote for its receive
	// window size.
	if s.writeNext != nil && s.SndWnd == 0 {
		s.enableZeroWindowProbing()
	}

	// If we have no more pending data, start the keepalive timer.
	if s.SndUna == s.SndNxt {
		s.ep.resetKeepaliveTimer(false)
	} else {
		// Enable timers if we have pending data.
		if shouldScheduleProbe && s.shouldSchedulePTO() {
			// Schedule PTO after transmitting new data that wasn't itself a TLP probe.
			s.schedulePTO()
		} else if !s.resendTimer.enabled() {
			s.probeTimer.disable()
			if s.Outstanding > 0 {
				// Enable the resend timer if it's not enabled yet and there is
				// outstanding data.
				s.resendTimer.enable(s.RTO)
			}
		}
	}
}

// sendData sends new data segments. It is called when data becomes available or
// when the send window opens up.
//
// Segments are taken from writeNext onwards while Outstanding is below
// SndCwnd; the loop stops at the first segment maybeSendSegment declines to
// send. postXmit is always invoked afterwards to update the timers.
// +checklocks:s.ep.mu
func (s *sender) sendData() {
	limit := s.MaxPayloadSize
	if s.gso {
		limit = int(s.ep.gso.MaxSize - header.TCPTotalHeaderMaximumSize - 1)
	}
	// end is the right edge of the send window.
	end := s.SndUna.Add(s.SndWnd)

	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
	// "A TCP SHOULD set cwnd to no more than RW before beginning
	// transmission if the TCP has not sent data in the interval exceeding
	// the retransmission timeout."
	if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO {
		if s.SndCwnd > InitialCwnd {
			s.SndCwnd = InitialCwnd
		}
	}

	var dataSent bool
	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
		// limit only ever shrinks across iterations: it is capped by the
		// congestion window space (in bytes) still available.
		cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize
		if cwndLimit < limit {
			limit = cwndLimit
		}
		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
			// Move writeNext along so that we don't try and scan data that
			// has already been SACKED.
			s.updateWriteNext(seg.Next())
			continue
		}
		if sent := s.maybeSendSegment(seg, limit, end); !sent {
			break
		}
		dataSent = true
		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
		s.updateWriteNext(seg.Next())
	}

	s.postXmit(dataSent, true /* shouldScheduleProbe */)
}

// enterRecovery puts the sender into fast recovery. It resets the spurious
// recovery detection state, sets SndCwnd to Ssthresh+3, snapshots the recovery
// window in s.FastRecovery, records retransmitTS and finally sets s.state to
// SACKRecovery or FastRecovery depending on s.ep.SACKPermitted, incrementing
// the matching stack statistic.
func (s *sender) enterRecovery() {
	// Initialize the variables used to detect spurious recovery after
	// entering recovery.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	s.FastRecovery.Active = true
	// Save state to reflect we're now in fast recovery.
	//
	// See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
	// We inflate the cwnd by 3 to account for the 3 packets which triggered
	// the 3 duplicate ACKs and are now not in flight.
	s.SndCwnd = s.Ssthresh + 3
	s.SackedOut = 0
	s.DupAckCount = 0
	s.FastRecovery.First = s.SndUna
	s.FastRecovery.Last = s.SndNxt - 1
	s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding
	s.FastRecovery.HighRxt = s.SndUna
	s.FastRecovery.RescueRxt = s.SndUna

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	//
	// This runs before s.state is updated below, so the inRecovery() check
	// inside recordRetransmitTS sees the state from before this call.
	s.recordRetransmitTS()

	if s.ep.SACKPermitted {
		s.state = tcpip.SACKRecovery
		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
		// Set TLPRxtOut to false according to
		// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
		if s.rc.tlpRxtOut {
			// The tail loss probe triggered recovery.
			s.ep.stack.Stats().TCP.TLPRecovery.Increment()
		}
		s.rc.tlpRxtOut = false
		return
	}
	s.state = tcpip.FastRecovery
	s.ep.stack.Stats().TCP.FastRecovery.Increment()
}

// leaveRecovery marks fast recovery as inactive, clears FastRecovery.MaxCwnd
// and DupAckCount, sets SndCwnd to Ssthresh and notifies the congestion
// control algorithm through PostRecovery. It does not modify s.state.
func (s *sender) leaveRecovery() {
	s.FastRecovery.Active = false
	s.FastRecovery.MaxCwnd = 0
	s.DupAckCount = 0

	// Deflate cwnd. It had been artificially inflated when new dups arrived.
	s.SndCwnd = s.Ssthresh
	s.cc.PostRecovery()
}

// isAssignedSequenceNumber relies on the fact that we only set flags once a
// sequence number is assigned and that is only done right before we send the
// segment. As a result any segment that has a non-zero flag has a valid
// sequence number assigned to it.
func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
	return seg.flags != 0
}

// SetPipe implements the SetPipe() function described in RFC6675. Netstack
// maintains the congestion window in number of packets and not bytes, so
// SetPipe() here measures number of outstanding packets rather than actual
// outstanding bytes in the network.
//
// The result is stored in s.Outstanding. It is a no-op unless SACK is
// permitted and fast recovery is active.
func (s *sender) SetPipe() {
	// If SACK isn't permitted or it is permitted but recovery is not active
	// then ignore pipe calculations.
	if !s.ep.SACKPermitted || !s.FastRecovery.Active {
		return
	}
	pipe := 0
	smss := seqnum.Size(s.ep.scoreboard.SMSS())
	// Only walk segments that carry data and already have a sequence number
	// assigned; the walk stops at the first segment that does not.
	for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
		// With GSO each segment can be much larger than SMSS. So check the segment
		// in SMSS sized ranges.
		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize()))
		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
			endSeq := startSeq.Add(smss)
			if segEnd.LessThan(endSeq) {
				endSeq = segEnd
			}
			sb := header.SACKBlock{Start: startSeq, End: endSeq}
			// SetPipe():
			//
			// After initializing pipe to zero, the following steps are
			// taken for each octet 'S1' in the sequence space between
			// HighACK and HighData that has not been SACKed:
			if !s1.sequenceNumber.LessThan(s.SndNxt) {
				break
			}
			if s.ep.scoreboard.IsSACKED(sb) {
				continue
			}

			// SetPipe():
			//
			// (a) If IsLost(S1) returns false, Pipe is incremented by 1.
			//
			// NOTE: here we mark the whole segment as lost. We do not try
			// and test every byte in our write buffer as we maintain our
			// pipe in terms of outstanding packets and not bytes.
			if !s.ep.scoreboard.IsRangeLost(sb) {
				pipe++
			}
			// SetPipe():
			// (b) If S1 <= HighRxt, Pipe is incremented by 1.
			if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) {
				pipe++
			}
		}
	}
	s.Outstanding = pipe
}

// shouldEnterRecovery returns true if the sender should enter fast recovery
// based on dupAck count and sack scoreboard.
// See RFC 6675 section 5.
//
// The scoreboard check is only consulted when SACK is permitted and RACK loss
// detection is not enabled.
func (s *sender) shouldEnterRecovery() bool {
	return s.DupAckCount >= nDupAckThreshold ||
		(s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna))
}

// detectLoss is called when an ack is received and returns whether a loss is
// detected. It manages the state related to duplicate acks and determines if
// a retransmit is needed according to the rules in RFC 6582 (NewReno).
//
// When it returns true the congestion control algorithm has been notified
// through HandleLossDetected and enterRecovery has been called.
func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
	// We're not in fast recovery yet.

	// If RACK is enabled and there is no reordering we should honor the
	// three duplicate ACK rule to enter recovery.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		if s.rc.Reord {
			return false
		}
	}

	if !s.isDupAck(seg) {
		s.DupAckCount = 0
		return false
	}

	s.DupAckCount++

	// Do not enter fast recovery until we reach nDupAckThreshold or the
	// first unacknowledged byte is considered lost as per SACK scoreboard.
	if !s.shouldEnterRecovery() {
		// RFC 6675 Step 3.
		s.FastRecovery.HighRxt = s.SndUna - 1
		// Do run SetPipe() to calculate the outstanding segments.
		s.SetPipe()
		s.state = tcpip.Disorder
		return false
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
	//
	// We only do the check here, the incrementing of last to the highest
	// sequence number transmitted till now is done when enterRecovery
	// is invoked.
	//
	// Note that we only enter recovery when at least one more byte of data
	// beyond s.FastRecovery.Last (the highest byte that was outstanding
	// when fast retransmit was last entered) is acked.
	if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) {
		s.DupAckCount = 0
		return false
	}
	s.cc.HandleLossDetected()
	s.enterRecovery()
	return true
}

// isDupAck determines if seg is a duplicate ack as defined in
// https://tools.ietf.org/html/rfc5681#section-2.
func (s *sender) isDupAck(seg *segment) bool {
	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
	// can leverage the SACK information to determine when an incoming ACK is a
	// "duplicate" (e.g., if the ACK contains previously unknown SACK
	// information).
	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
		return false
	}

	// (a) The receiver of the ACK has outstanding data.
	return s.SndUna != s.SndNxt &&
		// (b) The incoming acknowledgment carries no data.
		seg.logicalLen() == 0 &&
		// (c) The SYN and FIN bits are both off.
		!seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) &&
		// (d) the ACK number is equal to the greatest acknowledgment received on
		// the given connection (TCP.UNA from RFC793).
		seg.ackNumber == s.SndUna &&
		// (e) the advertised window in the incoming acknowledgment equals the
		// advertised window in the last incoming acknowledgment.
		s.SndWnd == seg.window
}

// walkSACK iterates the writeList and updates RACK for each segment which is
// newly acked either cumulatively or selectively. It loops through the
// segments which are sacked, updates the RACK related variables and checks for
// reordering. It returns true when a DSACK block has been detected in the
// received ACK.
//
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
// steps 2 and 3.
func (s *sender) walkSACK(rcvdSeg *segment) bool {
	s.rc.setDSACKSeen(false)

	// Look for DSACK block.
	hasDSACK := false
	idx := 0
	n := len(rcvdSeg.parsedOptions.SACKBlocks)
	if checkDSACK(rcvdSeg) {
		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
		// numDSACK can be zero when DSACK is sent for subsegments.
		if numDSACK < 1 {
			numDSACK = 1
		}
		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
		s.rc.setDSACKSeen(true)
		// Skip the DSACK block (index 0) when processing the remaining
		// SACK blocks below.
		idx = 1
		n--
		hasDSACK = true
	}

	if n == 0 {
		return hasDSACK
	}

	// Sort the SACK blocks. The first block is the most recent unacked
	// block. The following blocks can be in arbitrary order.
	//
	// The blocks are copied first so that the received segment's parsed
	// options are not reordered.
	sackBlocks := make([]header.SACKBlock, n)
	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
	sort.Slice(sackBlocks, func(i, j int) bool {
		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
	})

	// seg is shared across SACK blocks: the write list is walked at most
	// once, resuming for each block where the previous one stopped.
	seg := s.writeList.Front()
	for _, sb := range sackBlocks {
		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
				seg.acked = true
				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
			}
			seg = seg.Next()
		}
	}
	return hasDSACK
}

// checkDSACK checks if a DSACK is reported. Only the first SACK block of
// rcvdSeg (and, for comparison, the second) is examined.
func checkDSACK(rcvdSeg *segment) bool {
	n := len(rcvdSeg.parsedOptions.SACKBlocks)
	if n == 0 {
		return false
	}

	sb := rcvdSeg.parsedOptions.SACKBlocks[0]
	// Check if SACK block is invalid.
	if sb.End.LessThan(sb.Start) {
		return false
	}

	// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in
	// at most one SACK block. DSACK is detected in the below two cases:
	// * If the SACK sequence space is less than this cumulative ACK, it is
	//   an indication that the segment identified by the SACK block has
	//   been received more than once by the receiver.
	// * If the sequence space in the first SACK block is greater than the
	//   cumulative ACK, then the sender next compares the sequence space
	//   in the first SACK block with the sequence space in the second SACK
	//   block, if there is one. This comparison can determine if the first
	//   SACK block is reporting duplicate data that lies above the
	//   cumulative ACK.
	if sb.Start.LessThan(rcvdSeg.ackNumber) {
		return true
	}

	if n > 1 {
		sb1 := rcvdSeg.parsedOptions.SACKBlocks[1]
		if sb1.End.LessThan(sb1.Start) {
			return false
		}

		// If the first SACK block is fully covered by second SACK
		// block, then the first block is a DSACK block.
		if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) {
			return true
		}
	}

	return false
}

// recordRetransmitTS stores the endpoint's current timestamp value in
// s.retransmitTS, unless the sender is already in recovery, in which case it
// leaves the stored value untouched.
func (s *sender) recordRetransmitTS() {
	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
	//
	// The Eifel detection algorithm is used, only upon initiation of loss
	// recovery, i.e., when either the timeout-based retransmit or the fast
	// retransmit is sent. The Eifel detection algorithm MUST NOT be
	// reinitiated after loss recovery has already started. In particular,
	// it must not be reinitiated upon subsequent timeouts for the same
	// segment, and not upon retransmitting segments other than the oldest
	// outstanding segment, e.g., during selective loss recovery.
	if s.inRecovery() {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	//
	// Set a "RetransmitTS" variable to the value of the Timestamp Value
	// field of the Timestamps option included in the retransmit sent when
	// loss recovery is initiated. A TCP sender must ensure that
	// RetransmitTS does not get overwritten as loss recovery progresses,
	// e.g., in case of a second timeout and subsequent second retransmit of
	// the same octet.
	s.retransmitTS = s.ep.tsValNow()
}

// detectSpuriousRecovery applies the checks of RFC 3522 section 3.2 (steps 4
// to 6) to an incoming ACK. hasDSACK reports whether the ACK carried a DSACK
// block and tsEchoReply is the ACK's Timestamp Echo Reply value. When every
// check passes, s.spuriousRecovery is set and the SpuriousRecovery statistic
// (plus SpuriousRTORecovery when in RTO recovery) is incremented.
func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
	// Return if the sender has already detected spurious recovery.
	if s.spuriousRecovery {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
	//
	// If the value of the Timestamp Echo Reply field of the acceptable ACK's
	// Timestamps option is smaller than the value of RetransmitTS, then
	// proceed to next step, else return.
	if tsEchoReply >= s.retransmitTS {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If the acceptable ACK carries a DSACK option [RFC2883], then return.
	if hasDSACK {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If during the lifetime of the TCP connection the TCP sender has
	// previously received an ACK with a DSACK option, or the acceptable ACK
	// does not acknowledge all outstanding data, then proceed to next step,
	// else return.
	//
	// NOTE(review): the counter read here is obtained from the stack's
	// statistics rather than from this sender; confirm it is meant to stand
	// in for "during the lifetime of the TCP connection".
	numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value()
	if numDSACK == 0 && s.SndUna == s.SndNxt {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
	//
	// If the loss recovery has been initiated with a timeout-based
	// retransmit, then set
	//    SpuriousRecovery <- SPUR_TO (equal 1),
	// else set
	//    SpuriousRecovery <- dupacks+1
	// Set the spurious recovery variable to true as we do not differentiate
	// between fast, SACK or RTO recovery.
	s.spuriousRecovery = true
	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()

	// RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and
	// timeout). Increment the metric for RTO only as we want to track the
	// number of timeout recoveries.
	if s.state == tcpip.RTORecovery {
		s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment()
	}
}

// inRecovery reports whether the sender is in the RTORecovery, FastRecovery
// or SACKRecovery state.
func (s *sender) inRecovery() bool {
	if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery {
		return true
	}
	return false
}

// handleRcvdSegment is called when a segment is received; it is responsible for
// updating the send-related state.
//
// In order, it: takes an RTT sample where possible, feeds SACK blocks into
// the scoreboard, enters or leaves fast recovery, updates the send window and
// zero window probing state, removes newly acknowledged data from the write
// list, runs RACK loss detection and recovery, and finally tries to send more
// data.
// +checklocks:s.ep.mu
// +checklocksalias:s.rc.snd.ep.mu=s.ep.mu
func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
	bestRTT := unknownRTT

	// Check if we can extract an RTT measurement from this ack.
	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
		bestRTT = s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime)
		s.updateRTO(bestRTT)
		s.RTTMeasureSeqNum = s.SndNxt
	}

	// Update Timestamp if required. See RFC7323, section-4.3.
	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
	}

	// Insert SACKBlock information into our scoreboard.
	hasDSACK := false
	if s.ep.SACKPermitted {
		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
			// Only insert the SACK block if the following holds
			// true:
			//  * SACK block acks data after the ack number in the
			//    current segment.
			//  * SACK block represents a sequence
			//    between sndUna and sndNxt (i.e. data that is
			//    currently unacked and in-flight).
			//  * SACK block that has not been SACKed already.
			//
			// NOTE: This check specifically excludes DSACK blocks
			// which have start/end before sndUna and are used to
			// indicate spurious retransmissions.
			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
				s.ep.scoreboard.Insert(sb)
				rcvdSeg.hasNewSACKInfo = true
			}
		}

		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
		// section-7.2
		// * Step 2: Update RACK stats.
		//   If the ACK is not ignored as invalid, update the RACK.rtt
		//   to be the RTT sample calculated using this ACK, and
		//   continue.  If this ACK or SACK was for the most recently
		//   sent packet, then record the RACK.xmit_ts timestamp and
		//   RACK.end_seq sequence implied by this ACK.
		// * Step 3: Detect packet reordering.
		//   If the ACK selectively or cumulatively acknowledges an
		//   unacknowledged and also never retransmitted sequence below
		//   RACK.fack, then the corresponding packet has been
		//   reordered and RACK.reord is set to TRUE.
		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
			hasDSACK = s.walkSACK(rcvdSeg)
		}
		s.SetPipe()
	}

	ack := rcvdSeg.ackNumber
	fastRetransmit := false
	// Do not leave fast recovery, if the ACK is out of range.
	if s.FastRecovery.Active {
		// Leave fast recovery if it acknowledges all the data covered by
		// this fast recovery session.
		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
			s.leaveRecovery()
		}
	} else {
		// Detect loss by counting the duplicates and enter recovery.
		fastRetransmit = s.detectLoss(rcvdSeg)
	}

	// See if TLP based recovery was successful.
	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		s.detectTLPRecovery(ack, rcvdSeg)
	}

	// Stash away the current window size.
	s.SndWnd = rcvdSeg.window

	// Disable zero window probing if remote advertises a non-zero receive
	// window. This can be with an ACK to the zero window probe (where the
	// acknumber refers to the already acknowledged byte) OR to any previously
	// unacknowledged segment.
	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
		s.disableZeroWindowProbing()
	}

	// On receiving the ACK for the zero window probe, account for it and
	// skip trying to send any segment as we are still probing for
	// receive window to become non-zero.
	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
		s.unackZeroWindowProbes--
		return
	}

	// Ignore ack if it doesn't acknowledge any new data.
	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
		s.DupAckCount = 0

		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
		// Specifically we should only update the RTO using TSEcr if the
		// following condition holds:
		//
		//    A TSecr value received in a segment is used to update the
		//    averaged RTT measurement only if the segment acknowledges
		//    some new data, i.e., only if it advances the left edge of
		//    the send window.
		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
			tsRTT := s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr)
			s.updateRTO(tsRTT)
			// Following Linux, prefer RTT computed from ACKs to TSEcr because,
			// "broken middle-boxes or peers may corrupt TS-ECR fields"
			// https://github.com/torvalds/linux/blob/39cd87c4eb2b893354f3b850f916353f2658ae6f/net/ipv4/tcp_input.c#L3141C1-L3144C24
			if bestRTT == unknownRTT {
				bestRTT = tsRTT
			}
		}

		if s.shouldSchedulePTO() {
			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
			s.schedulePTO()
		} else {
			// When an ack is received we must rearm the timer.
			// RFC 6298 5.3
			s.probeTimer.disable()
			s.resendTimer.enable(s.RTO)
		}

		// Remove all acknowledged data from the write list.
		acked := s.SndUna.Size(ack)
		s.SndUna = ack
		ackLeft := acked
		// Remember Outstanding so that the number of packets acked by
		// this segment can be handed to congestion control below.
		originalOutstanding := s.Outstanding
		for ackLeft > 0 {
			// We use logicalLen here because we can have FIN
			// segments (which are always at the end of list) that
			// have no data, but do consume a sequence number.
			seg := s.writeList.Front()
			datalen := seg.logicalLen()

			if datalen > ackLeft {
				// The ACK covers only the front part of this
				// segment: trim it in place and stop.
				prevCount := s.pCount(seg, s.MaxPayloadSize)
				seg.TrimFront(ackLeft)
				seg.sequenceNumber.UpdateForward(ackLeft)
				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
				break
			}

			if s.writeNext == seg {
				s.updateWriteNext(seg.Next())
			}

			// Update the RACK fields if SACK is enabled.
			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
			}

			s.writeList.Remove(seg)

			// If SACK is enabled then only reduce outstanding if
			// the segment was not previously SACKED as these have
			// already been accounted for in SetPipe().
			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
			} else {
				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
			}
			seg.DecRef()
			ackLeft -= datalen
		}

		// Clear SACK information for all acked data.
		s.ep.scoreboard.Delete(s.SndUna)

		// Detect if the sender entered recovery spuriously.
		if s.inRecovery() {
			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
		}

		// If we are not in fast recovery then update the congestion
		// window based on the number of acknowledged packets.
		if !s.FastRecovery.Active {
			s.cc.Update(originalOutstanding-s.Outstanding, bestRTT)
			if s.FastRecovery.Last.LessThan(s.SndUna) {
				s.state = tcpip.Open
				// Update RACK when we are exiting fast or RTO
				// recovery as described in the RFC
				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
					s.rc.exitRecovery()
				}
				s.reorderTimer.disable()
			}
		}

		// Update the send buffer usage and notify potential waiters.
		s.ep.updateSndBufferUsage(int(acked))

		// It is possible for s.outstanding to drop below zero if we get
		// a retransmit timeout, reset outstanding to zero but later
		// get an ack that cover previously sent data.
		if s.Outstanding < 0 {
			s.Outstanding = 0
		}

		s.SetPipe()

		// If all outstanding data was acknowledged then disable the timer.
		// RFC 6298 Rule 5.3
		if s.SndUna == s.SndNxt {
			s.Outstanding = 0
			// Reset firstRetransmittedSegXmitTime to the zero value.
			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
			s.resendTimer.disable()
			s.probeTimer.disable()
		}
	}

	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		// Update RACK reorder window.
		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
		// * Upon receiving an ACK:
		// * Step 4: Update RACK reordering window
		s.rc.updateRACKReorderWindow()

		// After the reorder window is calculated, detect any loss by checking
		// if the time elapsed after the segments are sent is greater than the
		// reorder window.
		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
			// If any segment is marked as lost by
			// RACK, enter recovery and retransmit
			// the lost segments.
			s.cc.HandleLossDetected()
			s.enterRecovery()
			fastRetransmit = true
		}

		if s.FastRecovery.Active {
			s.rc.DoRecovery(nil, fastRetransmit)
		}
	}

	// Now that we've popped all acknowledged data from the retransmit
	// queue, retransmit if needed.
	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
		// When SACK is enabled data sending is governed by steps in
		// RFC 6675 Section 5 recovery steps A-C.
		// See: https://tools.ietf.org/html/rfc6675#section-5.
		if s.ep.SACKPermitted {
			return
		}
	}

	// Send more data now that some of the pending data has been ack'd, or
	// that the window opened up, or the congestion window was inflated due
	// to a duplicate ack during fast recovery. This will also re-enable
	// the retransmit timer if needed.
	s.sendData()
}

// sendSegment sends the specified segment.
//
// It stamps seg.xmitTime, increments seg.xmitCount and clears seg.lost. If
// the segment had been transmitted before, the retransmit statistics are
// incremented first. The error from sendSegmentFromPacketBuffer is returned
// to the caller.
// +checklocks:s.ep.mu
func (s *sender) sendSegment(seg *segment) tcpip.Error {
	if seg.xmitCount > 0 {
		s.ep.stack.Stats().TCP.Retransmits.Increment()
		s.ep.stats.SendErrors.Retransmits.Increment()
		if s.SndCwnd < s.Ssthresh {
			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
		}
	}
	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
	seg.xmitCount++
	seg.lost = false

	err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber)

	// Every time a packet containing data is sent (including a
	// retransmission), if SACK is enabled and we are retransmitting data
	// then use the conservative timer described in RFC6675 Section 6.0,
	// otherwise follow the standard time described in RFC6298 Section 5.1.
	//
	// NOTE(review): as written, the timer is only touched here when the
	// send returned a non-nil error, which differs from the description
	// above ("every time a packet containing data is sent") - confirm this
	// is intended.
	if err != nil && seg.payloadSize() != 0 {
		if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted {
			s.resendTimer.enable(s.RTO)
		} else {
			if !s.resendTimer.enabled() {
				s.resendTimer.enable(s.RTO)
			}
		}
	}

	return err
}

// sendSegmentFromPacketBuffer sends a new segment containing the given payload,
// flags and sequence number.
//
// It also records LastSendTime, restarts the RTT measurement clock when seq
// equals RTTMeasureSeqNum and remembers the acknowledgement number being sent
// in MaxSentAck.
// +checklocks:s.ep.mu
// +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
func (s *sender) sendSegmentFromPacketBuffer(pkt *stack.PacketBuffer, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
	if seq == s.RTTMeasureSeqNum {
		s.RTTMeasureTime = s.LastSendTime
	}

	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()

	// Remember the max sent ack.
	s.MaxSentAck = rcvNxt

	// We need to clone the packet because sendRaw takes ownership of pkt,
	// and pkt could be reprocessed later on (i.e. retransmission).
	pkt = pkt.Clone()
	defer pkt.DecRef()

	return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd)
}

// sendEmptySegment sends a new empty segment, flags and sequence number.
//
// It performs the same LastSendTime, RTT measurement and MaxSentAck
// bookkeeping as sendSegmentFromPacketBuffer.
// +checklocks:s.ep.mu
// +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
	s.LastSendTime = s.ep.stack.Clock().NowMonotonic()
	if seq == s.RTTMeasureSeqNum {
		s.RTTMeasureTime = s.LastSendTime
	}

	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()

	// Remember the max sent ack.
	s.MaxSentAck = rcvNxt

	return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd)
}

// maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
// currently.
// +checklocks:s.ep.mu
func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
	// Data packets are unlikely to be part of an ACK loop. So always send
	// an ACK for a packet w/ data.
	if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
		s.sendAck()
	}
}

// updateWriteNext points s.writeNext at seg. The reference held on the
// previous writeNext (if any) is dropped and a reference is taken on seg when
// it is non-nil.
func (s *sender) updateWriteNext(seg *segment) {
	if s.writeNext != nil {
		s.writeNext.DecRef()
	}
	if seg != nil {
		seg.IncRef()
	}
	s.writeNext = seg
}

// corkTimerExpired drains all the segments when TCP_CORK is enabled.
//
// It always returns nil.
// +checklocks:s.ep.mu
func (s *sender) corkTimerExpired() tcpip.Error {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if s.corkTimer.isUninitialized() || !s.corkTimer.checkExpiration() {
		return nil
	}

	// Assign sequence number and flags to the segment.
	seg := s.writeNext
	if seg == nil {
		return nil
	}
	seg.sequenceNumber = s.SndNxt
	seg.flags = header.TCPFlagAck | header.TCPFlagPsh
	// Drain all the segments.
	s.sendData()
	return nil
}