github.com/vpnishe/netstack@v1.10.6/tcpip/transport/tcp/snd.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "math" 19 "sync" 20 "sync/atomic" 21 "time" 22 23 "github.com/vpnishe/netstack/sleep" 24 "github.com/vpnishe/netstack/tcpip" 25 "github.com/vpnishe/netstack/tcpip/buffer" 26 "github.com/vpnishe/netstack/tcpip/header" 27 "github.com/vpnishe/netstack/tcpip/seqnum" 28 ) 29 30 const ( 31 // minRTO is the minimum allowed value for the retransmit timeout. 32 minRTO = 200 * time.Millisecond 33 34 // InitialCwnd is the initial congestion window. 35 InitialCwnd = 10 36 37 // nDupAckThreshold is the number of duplicate ACK's required 38 // before fast-retransmit is entered. 39 nDupAckThreshold = 3 40 ) 41 42 // ccState indicates the current congestion control state for this sender. 43 type ccState int 44 45 const ( 46 // Open indicates that the sender is receiving acks in order and 47 // no loss or dupACK's etc have been detected. 48 Open ccState = iota 49 // RTORecovery indicates that an RTO has occurred and the sender 50 // has entered an RTO based recovery phase. 51 RTORecovery 52 // FastRecovery indicates that the sender has entered FastRecovery 53 // based on receiving nDupAck's. This state is entered only when 54 // SACK is not in use. 55 FastRecovery 56 // SACKRecovery indicates that the sender has entered SACK based 57 // recovery. 58 SACKRecovery 59 // Disorder indicates the sender either received some SACK blocks 60 // or dupACK's. 61 Disorder 62 ) 63 64 // congestionControl is an interface that must be implemented by any supported 65 // congestion control algorithm. 66 type congestionControl interface { 67 // HandleNDupAcks is invoked when sender.dupAckCount >= nDupAckThreshold 68 // just before entering fast retransmit. 69 HandleNDupAcks() 70 71 // HandleRTOExpired is invoked when the retransmit timer expires. 72 HandleRTOExpired() 73 74 // Update is invoked when processing inbound acks. It's passed the 75 // number of packet's that were acked by the most recent cumulative 76 // acknowledgement. 77 Update(packetsAcked int) 78 79 // PostRecovery is invoked when the sender is exiting a fast retransmit/ 80 // recovery phase. This provides congestion control algorithms a way 81 // to adjust their state when exiting recovery. 82 PostRecovery() 83 } 84 85 // sender holds the state necessary to send TCP segments. 86 // 87 // +stateify savable 88 type sender struct { 89 ep *endpoint 90 91 // lastSendTime is the timestamp when the last packet was sent. 92 lastSendTime time.Time 93 94 // dupAckCount is the number of duplicated acks received. It is used for 95 // fast retransmit. 96 dupAckCount int 97 98 // fr holds state related to fast recovery. 99 fr fastRecovery 100 101 // sndCwnd is the congestion window, in packets. 102 sndCwnd int 103 104 // sndSsthresh is the threshold between slow start and congestion 105 // avoidance. 106 sndSsthresh int 107 108 // sndCAAckCount is the number of packets acknowledged during congestion 109 // avoidance. When enough packets have been ack'd (typically cwnd 110 // packets), the congestion window is incremented by one. 111 sndCAAckCount int 112 113 // outstanding is the number of outstanding packets, that is, packets 114 // that have been sent but not yet acknowledged. 115 outstanding int 116 117 // sndWnd is the send window size. 118 sndWnd seqnum.Size 119 120 // sndUna is the next unacknowledged sequence number. 121 sndUna seqnum.Value 122 123 // sndNxt is the sequence number of the next segment to be sent. 124 sndNxt seqnum.Value 125 126 // sndNxtList is the sequence number of the next segment to be added to 127 // the send list. 128 sndNxtList seqnum.Value 129 130 // rttMeasureSeqNum is the sequence number being used for the latest RTT 131 // measurement. 132 rttMeasureSeqNum seqnum.Value 133 134 // rttMeasureTime is the time when the rttMeasureSeqNum was sent. 135 rttMeasureTime time.Time 136 137 closed bool 138 writeNext *segment 139 writeList segmentList 140 resendTimer timer 141 resendWaker sleep.Waker 142 143 // rtt.srtt, rtt.rttvar, and rto are the "smoothed round-trip time", 144 // "round-trip time variation" and "retransmit timeout", as defined in 145 // section 2 of RFC 6298. 146 rtt rtt 147 rto time.Duration 148 149 // maxPayloadSize is the maximum size of the payload of a given segment. 150 // It is initialized on demand. 151 maxPayloadSize int 152 153 // gso is set if generic segmentation offload is enabled. 154 gso bool 155 156 // sndWndScale is the number of bits to shift left when reading the send 157 // window size from a segment. 158 sndWndScale uint8 159 160 // maxSentAck is the maxium acknowledgement actually sent. 161 maxSentAck seqnum.Value 162 163 // state is the current state of congestion control for this endpoint. 164 state ccState 165 166 // cc is the congestion control algorithm in use for this sender. 167 cc congestionControl 168 } 169 170 // rtt is a synchronization wrapper used to appease stateify. See the comment 171 // in sender, where it is used. 172 // 173 // +stateify savable 174 type rtt struct { 175 sync.Mutex 176 177 srtt time.Duration 178 rttvar time.Duration 179 srttInited bool 180 } 181 182 // fastRecovery holds information related to fast recovery from a packet loss. 183 // 184 // +stateify savable 185 type fastRecovery struct { 186 // active whether the endpoint is in fast recovery. The following fields 187 // are only meaningful when active is true. 188 active bool 189 190 // first and last represent the inclusive sequence number range being 191 // recovered. 192 first seqnum.Value 193 last seqnum.Value 194 195 // maxCwnd is the maximum value the congestion window may be inflated to 196 // due to duplicate acks. This exists to avoid attacks where the 197 // receiver intentionally sends duplicate acks to artificially inflate 198 // the sender's cwnd. 199 maxCwnd int 200 201 // highRxt is the highest sequence number which has been retransmitted 202 // during the current loss recovery phase. 203 // See: RFC 6675 Section 2 for details. 204 highRxt seqnum.Value 205 206 // rescueRxt is the highest sequence number which has been 207 // optimistically retransmitted to prevent stalling of the ACK clock 208 // when there is loss at the end of the window and no new data is 209 // available for transmission. 210 // See: RFC 6675 Section 2 for details. 211 rescueRxt seqnum.Value 212 } 213 214 func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { 215 // The sender MUST reduce the TCP data length to account for any IP or 216 // TCP options that it is including in the packets that it sends. 217 // See: https://tools.ietf.org/html/rfc6691#section-2 218 maxPayloadSize := int(mss) - ep.maxOptionSize() 219 220 s := &sender{ 221 ep: ep, 222 sndWnd: sndWnd, 223 sndUna: iss + 1, 224 sndNxt: iss + 1, 225 sndNxtList: iss + 1, 226 rto: 1 * time.Second, 227 rttMeasureSeqNum: iss + 1, 228 lastSendTime: time.Now(), 229 maxPayloadSize: maxPayloadSize, 230 maxSentAck: irs + 1, 231 fr: fastRecovery{ 232 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. 233 last: iss, 234 highRxt: iss, 235 rescueRxt: iss, 236 }, 237 gso: ep.gso != nil, 238 } 239 240 if s.gso { 241 s.ep.gso.MSS = uint16(maxPayloadSize) 242 } 243 244 s.cc = s.initCongestionControl(ep.cc) 245 246 // A negative sndWndScale means that no scaling is in use, otherwise we 247 // store the scaling value. 248 if sndWndScale > 0 { 249 s.sndWndScale = uint8(sndWndScale) 250 } 251 252 s.resendTimer.init(&s.resendWaker) 253 254 s.updateMaxPayloadSize(int(ep.route.MTU()), 0) 255 256 // Initialize SACK Scoreboard after updating max payload size as we use 257 // the maxPayloadSize as the smss when determining if a segment is lost 258 // etc. 259 s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss) 260 261 return s 262 } 263 264 // initCongestionControl initializes the specified congestion control module and 265 // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to 266 // their initial values. 267 func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { 268 s.sndCwnd = InitialCwnd 269 s.sndSsthresh = math.MaxUint16 270 271 switch congestionControlName { 272 case ccCubic: 273 return newCubicCC(s) 274 case ccReno: 275 fallthrough 276 default: 277 return newRenoCC(s) 278 } 279 } 280 281 // updateMaxPayloadSize updates the maximum payload size based on the given 282 // MTU. If this is in response to "packet too big" control packets (indicated 283 // by the count argument), it also reduces the number of outstanding packets and 284 // attempts to retransmit the first packet above the MTU size. 285 func (s *sender) updateMaxPayloadSize(mtu, count int) { 286 m := mtu - header.TCPMinimumSize 287 288 m -= s.ep.maxOptionSize() 289 290 // We don't adjust up for now. 291 if m >= s.maxPayloadSize { 292 return 293 } 294 295 // Make sure we can transmit at least one byte. 296 if m <= 0 { 297 m = 1 298 } 299 300 s.maxPayloadSize = m 301 if s.gso { 302 s.ep.gso.MSS = uint16(m) 303 } 304 305 if count == 0 { 306 // updateMaxPayloadSize is also called when the sender is created. 307 // and there is no data to send in such cases. Return immediately. 308 return 309 } 310 311 // Update the scoreboard's smss to reflect the new lowered 312 // maxPayloadSize. 313 s.ep.scoreboard.smss = uint16(m) 314 315 s.outstanding -= count 316 if s.outstanding < 0 { 317 s.outstanding = 0 318 } 319 320 // Rewind writeNext to the first segment exceeding the MTU. Do nothing 321 // if it is already before such a packet. 322 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 323 if seg == s.writeNext { 324 // We got to writeNext before we could find a segment 325 // exceeding the MTU. 326 break 327 } 328 329 if seg.data.Size() > m { 330 // We found a segment exceeding the MTU. Rewind 331 // writeNext and try to retransmit it. 332 s.writeNext = seg 333 break 334 } 335 } 336 337 // Since we likely reduced the number of outstanding packets, we may be 338 // ready to send some more. 339 s.sendData() 340 } 341 342 // sendAck sends an ACK segment. 343 func (s *sender) sendAck() { 344 s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.sndNxt) 345 } 346 347 // updateRTO updates the retransmit timeout when a new roud-trip time is 348 // available. This is done in accordance with section 2 of RFC 6298. 349 func (s *sender) updateRTO(rtt time.Duration) { 350 s.rtt.Lock() 351 if !s.rtt.srttInited { 352 s.rtt.rttvar = rtt / 2 353 s.rtt.srtt = rtt 354 s.rtt.srttInited = true 355 } else { 356 diff := s.rtt.srtt - rtt 357 if diff < 0 { 358 diff = -diff 359 } 360 // Use RFC6298 standard algorithm to update rttvar and srtt when 361 // no timestamps are available. 362 if !s.ep.sendTSOk { 363 s.rtt.rttvar = (3*s.rtt.rttvar + diff) / 4 364 s.rtt.srtt = (7*s.rtt.srtt + rtt) / 8 365 } else { 366 // When we are taking RTT measurements of every ACK then 367 // we need to use a modified method as specified in 368 // https://tools.ietf.org/html/rfc7323#appendix-G 369 if s.outstanding == 0 { 370 s.rtt.Unlock() 371 return 372 } 373 // Netstack measures congestion window/inflight all in 374 // terms of packets and not bytes. This is similar to 375 // how linux also does cwnd and inflight. In practice 376 // this approximation works as expected. 377 expectedSamples := math.Ceil(float64(s.outstanding) / 2) 378 379 // alpha & beta values are the original values as recommended in 380 // https://tools.ietf.org/html/rfc6298#section-2.3. 381 const alpha = 0.125 382 const beta = 0.25 383 384 alphaPrime := alpha / expectedSamples 385 betaPrime := beta / expectedSamples 386 rttVar := (1-betaPrime)*s.rtt.rttvar.Seconds() + betaPrime*diff.Seconds() 387 srtt := (1-alphaPrime)*s.rtt.srtt.Seconds() + alphaPrime*rtt.Seconds() 388 s.rtt.rttvar = time.Duration(rttVar * float64(time.Second)) 389 s.rtt.srtt = time.Duration(srtt * float64(time.Second)) 390 } 391 } 392 393 s.rto = s.rtt.srtt + 4*s.rtt.rttvar 394 s.rtt.Unlock() 395 if s.rto < minRTO { 396 s.rto = minRTO 397 } 398 } 399 400 // resendSegment resends the first unacknowledged segment. 401 func (s *sender) resendSegment() { 402 // Don't use any segments we already sent to measure RTT as they may 403 // have been affected by packets being lost. 404 s.rttMeasureSeqNum = s.sndNxt 405 406 // Resend the segment. 407 if seg := s.writeList.Front(); seg != nil { 408 if seg.data.Size() > s.maxPayloadSize { 409 s.splitSeg(seg, s.maxPayloadSize) 410 } 411 412 // See: RFC 6675 section 5 Step 4.3 413 // 414 // To prevent retransmission, set both the HighRXT and RescueRXT 415 // to the highest sequence number in the retransmitted segment. 416 s.fr.highRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 417 s.fr.rescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 418 s.sendSegment(seg) 419 s.ep.stack.Stats().TCP.FastRetransmit.Increment() 420 s.ep.stats.SendErrors.FastRetransmit.Increment() 421 422 // Run SetPipe() as per RFC 6675 section 5 Step 4.4 423 s.SetPipe() 424 } 425 } 426 427 // retransmitTimerExpired is called when the retransmit timer expires, and 428 // unacknowledged segments are assumed lost, and thus need to be resent. 429 // Returns true if the connection is still usable, or false if the connection 430 // is deemed lost. 431 func (s *sender) retransmitTimerExpired() bool { 432 // Check if the timer actually expired or if it's a spurious wake due 433 // to a previously orphaned runtime timer. 434 if !s.resendTimer.checkExpiration() { 435 return true 436 } 437 438 s.ep.stack.Stats().TCP.Timeouts.Increment() 439 s.ep.stats.SendErrors.Timeouts.Increment() 440 441 // Give up if we've waited more than a minute since the last resend. 442 if s.rto >= 60*time.Second { 443 return false 444 } 445 446 // Set new timeout. The timer will be restarted by the call to sendData 447 // below. 448 s.rto *= 2 449 450 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. 451 // 452 // Retransmit timeouts: 453 // After a retransmit timeout, record the highest sequence number 454 // transmitted in the variable recover, and exit the fast recovery 455 // procedure if applicable. 456 s.fr.last = s.sndNxt - 1 457 458 if s.fr.active { 459 // We were attempting fast recovery but were not successful. 460 // Leave the state. We don't need to update ssthresh because it 461 // has already been updated when entered fast-recovery. 462 s.leaveFastRecovery() 463 } 464 465 s.state = RTORecovery 466 s.cc.HandleRTOExpired() 467 468 // Mark the next segment to be sent as the first unacknowledged one and 469 // start sending again. Set the number of outstanding packets to 0 so 470 // that we'll be able to retransmit. 471 // 472 // We'll keep on transmitting (or retransmitting) as we get acks for 473 // the data we transmit. 474 s.outstanding = 0 475 476 // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 477 // 478 // In order to avoid memory deadlocks, the TCP receiver is allowed to 479 // discard data that has already been selectively acknowledged. As a 480 // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK 481 // information gathered from a receiver upon a retransmission timeout 482 // (RTO) "since the timeout might indicate that the data receiver has 483 // reneged." Additionally, a TCP sender MUST "ignore prior SACK 484 // information in determining which data to retransmit." 485 // 486 // NOTE: We take the stricter interpretation and just expunge all 487 // information as we lack more rigorous checks to validate if the SACK 488 // information is usable after an RTO. 489 s.ep.scoreboard.Reset() 490 s.writeNext = s.writeList.Front() 491 s.sendData() 492 493 return true 494 } 495 496 // pCount returns the number of packets in the segment. Due to GSO, a segment 497 // can be composed of multiple packets. 498 func (s *sender) pCount(seg *segment) int { 499 size := seg.data.Size() 500 if size == 0 { 501 return 1 502 } 503 504 return (size-1)/s.maxPayloadSize + 1 505 } 506 507 // splitSeg splits a given segment at the size specified and inserts the 508 // remainder as a new segment after the current one in the write list. 509 func (s *sender) splitSeg(seg *segment, size int) { 510 if seg.data.Size() <= size { 511 return 512 } 513 // Split this segment up. 514 nSeg := seg.clone() 515 nSeg.data.TrimFront(size) 516 nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) 517 s.writeList.InsertAfter(seg, nSeg) 518 seg.data.CapLength(size) 519 } 520 521 // NextSeg implements the RFC6675 NextSeg() operation. It returns segments that 522 // match rule 1, 3 and 4 of the NextSeg() operation defined in RFC6675. Rule 2 523 // is handled by the normal send logic. 524 func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { 525 var s3 *segment 526 var s4 *segment 527 smss := s.ep.scoreboard.SMSS() 528 // Step 1. 529 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 530 if !s.isAssignedSequenceNumber(seg) { 531 break 532 } 533 segSeq := seg.sequenceNumber 534 if seg.data.Size() > int(smss) { 535 s.splitSeg(seg, int(smss)) 536 } 537 // See RFC 6675 Section 4 538 // 539 // 1. If there exists a smallest unSACKED sequence number 540 // 'S2' that meets the following 3 criteria for determinig 541 // loss, the sequence range of one segment of up to SMSS 542 // octects starting with S2 MUST be returned. 543 if !s.ep.scoreboard.IsSACKED(header.SACKBlock{segSeq, segSeq.Add(1)}) { 544 // NextSeg(): 545 // 546 // (1.a) S2 is greater than HighRxt 547 // (1.b) S2 is less than highest octect covered by 548 // any received SACK. 549 if s.fr.highRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) { 550 // NextSeg(): 551 // (1.c) IsLost(S2) returns true. 552 if s.ep.scoreboard.IsLost(segSeq) { 553 return seg, s3, s4 554 } 555 // NextSeg(): 556 // 557 // (3): If the conditions for rules (1) and (2) 558 // fail, but there exists an unSACKed sequence 559 // number S3 that meets the criteria for 560 // detecting loss given in steps 1.a and 1.b 561 // above (specifically excluding (1.c)) then one 562 // segment of upto SMSS octets starting with S3 563 // SHOULD be returned. 564 if s3 == nil { 565 s3 = seg 566 } 567 } 568 // NextSeg(): 569 // 570 // (4) If the conditions for (1), (2) and (3) fail, 571 // but there exists outstanding unSACKED data, we 572 // provide the opportunity for a single "rescue" 573 // retransmission per entry into loss recovery. If 574 // HighACK is greater than RescueRxt, the one 575 // segment of upto SMSS octects that MUST include 576 // the highest outstanding unSACKed sequence number 577 // SHOULD be returned. 578 if s.fr.rescueRxt.LessThan(s.sndUna - 1) { 579 if s4 != nil { 580 if s4.sequenceNumber.LessThan(segSeq) { 581 s4 = seg 582 } 583 } else { 584 s4 = seg 585 } 586 s.fr.rescueRxt = s.fr.last 587 } 588 } 589 } 590 591 return nil, s3, s4 592 } 593 594 // maybeSendSegment tries to send the specified segment and either coalesces 595 // other segments into this one or splits the specified segment based on the 596 // lower of the specified limit value or the receivers window size specified by 597 // end. 598 func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) { 599 // We abuse the flags field to determine if we have already 600 // assigned a sequence number to this segment. 601 if !s.isAssignedSequenceNumber(seg) { 602 // Merge segments if allowed. 603 if seg.data.Size() != 0 { 604 available := int(seg.sequenceNumber.Size(end)) 605 if available > limit { 606 available = limit 607 } 608 609 // nextTooBig indicates that the next segment was too 610 // large to entirely fit in the current segment. It 611 // would be possible to split the next segment and merge 612 // the portion that fits, but unexpectedly splitting 613 // segments can have user visible side-effects which can 614 // break applications. For example, RFC 7766 section 8 615 // says that the length and data of a DNS response 616 // should be sent in the same TCP segment to avoid 617 // triggering bugs in poorly written DNS 618 // implementations. 619 var nextTooBig bool 620 for seg.Next() != nil && seg.Next().data.Size() != 0 { 621 if seg.data.Size()+seg.Next().data.Size() > available { 622 nextTooBig = true 623 break 624 } 625 seg.data.Append(seg.Next().data) 626 627 // Consume the segment that we just merged in. 628 s.writeList.Remove(seg.Next()) 629 } 630 if !nextTooBig && seg.data.Size() < available { 631 // Segment is not full. 632 if s.outstanding > 0 && atomic.LoadUint32(&s.ep.delay) != 0 { 633 // Nagle's algorithm. From Wikipedia: 634 // Nagle's algorithm works by 635 // combining a number of small 636 // outgoing messages and sending them 637 // all at once. Specifically, as long 638 // as there is a sent packet for which 639 // the sender has received no 640 // acknowledgment, the sender should 641 // keep buffering its output until it 642 // has a full packet's worth of 643 // output, thus allowing output to be 644 // sent all at once. 645 return false 646 } 647 if atomic.LoadUint32(&s.ep.cork) != 0 { 648 // Hold back the segment until full. 649 return false 650 } 651 } 652 } 653 654 // Assign flags. We don't do it above so that we can merge 655 // additional data if Nagle holds the segment. 656 seg.sequenceNumber = s.sndNxt 657 seg.flags = header.TCPFlagAck | header.TCPFlagPsh 658 } 659 660 var segEnd seqnum.Value 661 if seg.data.Size() == 0 { 662 if s.writeList.Back() != seg { 663 panic("FIN segments must be the final segment in the write list.") 664 } 665 seg.flags = header.TCPFlagAck | header.TCPFlagFin 666 segEnd = seg.sequenceNumber.Add(1) 667 // Transition to FIN-WAIT1 state since we're initiating an active close. 668 s.ep.mu.Lock() 669 switch s.ep.state { 670 case StateCloseWait: 671 // We've already received a FIN and are now sending our own. The 672 // sender is now awaiting a final ACK for this FIN. 673 s.ep.state = StateLastAck 674 default: 675 s.ep.state = StateFinWait1 676 } 677 s.ep.stack.Stats().TCP.CurrentEstablished.Decrement() 678 s.ep.mu.Unlock() 679 } else { 680 // We're sending a non-FIN segment. 681 if seg.flags&header.TCPFlagFin != 0 { 682 panic("Netstack queues FIN segments without data.") 683 } 684 685 if !seg.sequenceNumber.LessThan(end) { 686 return false 687 } 688 689 available := int(seg.sequenceNumber.Size(end)) 690 if available == 0 { 691 return false 692 } 693 if available > limit { 694 available = limit 695 } 696 697 if seg.data.Size() > available { 698 s.splitSeg(seg, available) 699 } 700 701 segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) 702 } 703 704 s.sendSegment(seg) 705 706 // Update sndNxt if we actually sent new data (as opposed to 707 // retransmitting some previously sent data). 708 if s.sndNxt.LessThan(segEnd) { 709 s.sndNxt = segEnd 710 } 711 712 return true 713 } 714 715 // handleSACKRecovery implements the loss recovery phase as described in RFC6675 716 // section 5, step C. 717 func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { 718 s.SetPipe() 719 for s.outstanding < s.sndCwnd { 720 nextSeg, s3, s4 := s.NextSeg() 721 if nextSeg == nil { 722 // NextSeg(): 723 // 724 // Step (2): "If no sequence number 'S2' per rule (1) 725 // exists but there exists available unsent data and the 726 // receiver's advertised window allows, the sequence 727 // range of one segment of up to SMSS octets of 728 // previously unsent data starting with sequence number 729 // HighData+1 MUST be returned." 730 for seg := s.writeNext; seg != nil; seg = seg.Next() { 731 if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) { 732 continue 733 } 734 // Step C.3 described below is handled by 735 // maybeSendSegment which increments sndNxt when 736 // a segment is transmitted. 737 // 738 // Step C.3 "If any of the data octets sent in 739 // (C.1) are above HighData, HighData must be 740 // updated to reflect the transmission of 741 // previously unsent data." 742 if sent := s.maybeSendSegment(seg, limit, end); !sent { 743 break 744 } 745 dataSent = true 746 s.outstanding++ 747 s.writeNext = seg.Next() 748 nextSeg = seg 749 break 750 } 751 if nextSeg != nil { 752 continue 753 } 754 } 755 rescueRtx := false 756 if nextSeg == nil && s3 != nil { 757 nextSeg = s3 758 } 759 if nextSeg == nil && s4 != nil { 760 nextSeg = s4 761 rescueRtx = true 762 } 763 if nextSeg == nil { 764 break 765 } 766 segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) 767 if !rescueRtx && nextSeg.sequenceNumber.LessThan(s.sndNxt) { 768 // RFC 6675, Step C.2 769 // 770 // "If any of the data octets sent in (C.1) are below 771 // HighData, HighRxt MUST be set to the highest sequence 772 // number of the retransmitted segment unless NextSeg () 773 // rule (4) was invoked for this retransmission." 774 s.fr.highRxt = segEnd - 1 775 } 776 777 // RFC 6675, Step C.4. 778 // 779 // "The estimate of the amount of data outstanding in the network 780 // must be updated by incrementing pipe by the number of octets 781 // transmitted in (C.1)." 782 s.outstanding++ 783 dataSent = true 784 s.sendSegment(nextSeg) 785 } 786 return dataSent 787 } 788 789 // sendData sends new data segments. It is called when data becomes available or 790 // when the send window opens up. 791 func (s *sender) sendData() { 792 limit := s.maxPayloadSize 793 if s.gso { 794 limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize) 795 } 796 end := s.sndUna.Add(s.sndWnd) 797 798 // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. 799 // "A TCP SHOULD set cwnd to no more than RW before beginning 800 // transmission if the TCP has not sent data in the interval exceeding 801 // the retrasmission timeout." 802 if !s.fr.active && time.Now().Sub(s.lastSendTime) > s.rto { 803 if s.sndCwnd > InitialCwnd { 804 s.sndCwnd = InitialCwnd 805 } 806 } 807 808 var dataSent bool 809 810 // RFC 6675 recovery algorithm step C 1-5. 811 if s.fr.active && s.ep.sackPermitted { 812 dataSent = s.handleSACKRecovery(s.maxPayloadSize, end) 813 } else { 814 for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { 815 cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize 816 if cwndLimit < limit { 817 limit = cwndLimit 818 } 819 if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 820 continue 821 } 822 if sent := s.maybeSendSegment(seg, limit, end); !sent { 823 break 824 } 825 dataSent = true 826 s.outstanding += s.pCount(seg) 827 s.writeNext = seg.Next() 828 } 829 } 830 831 if dataSent { 832 // We sent data, so we should stop the keepalive timer to ensure 833 // that no keepalives are sent while there is pending data. 834 s.ep.disableKeepaliveTimer() 835 } 836 837 // Enable the timer if we have pending data and it's not enabled yet. 838 if !s.resendTimer.enabled() && s.sndUna != s.sndNxt { 839 s.resendTimer.enable(s.rto) 840 } 841 // If we have no more pending data, start the keepalive timer. 842 if s.sndUna == s.sndNxt { 843 s.ep.resetKeepaliveTimer(false) 844 } 845 } 846 847 func (s *sender) enterFastRecovery() { 848 s.fr.active = true 849 // Save state to reflect we're now in fast recovery. 850 // 851 // See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3. 852 // We inflate the cwnd by 3 to account for the 3 packets which triggered 853 // the 3 duplicate ACKs and are now not in flight. 854 s.sndCwnd = s.sndSsthresh + 3 855 s.fr.first = s.sndUna 856 s.fr.last = s.sndNxt - 1 857 s.fr.maxCwnd = s.sndCwnd + s.outstanding 858 if s.ep.sackPermitted { 859 s.state = SACKRecovery 860 s.ep.stack.Stats().TCP.SACKRecovery.Increment() 861 return 862 } 863 s.state = FastRecovery 864 s.ep.stack.Stats().TCP.FastRecovery.Increment() 865 } 866 867 func (s *sender) leaveFastRecovery() { 868 s.fr.active = false 869 s.fr.maxCwnd = 0 870 s.dupAckCount = 0 871 872 // Deflate cwnd. It had been artificially inflated when new dups arrived. 873 s.sndCwnd = s.sndSsthresh 874 875 s.cc.PostRecovery() 876 } 877 878 func (s *sender) handleFastRecovery(seg *segment) (rtx bool) { 879 ack := seg.ackNumber 880 // We are in fast recovery mode. Ignore the ack if it's out of 881 // range. 882 if !ack.InRange(s.sndUna, s.sndNxt+1) { 883 return false 884 } 885 886 // Leave fast recovery if it acknowledges all the data covered by 887 // this fast recovery session. 888 if s.fr.last.LessThan(ack) { 889 s.leaveFastRecovery() 890 return false 891 } 892 893 if s.ep.sackPermitted { 894 // When SACK is enabled we let retransmission be governed by 895 // the SACK logic. 896 return false 897 } 898 899 // Don't count this as a duplicate if it is carrying data or 900 // updating the window. 901 if seg.logicalLen() != 0 || s.sndWnd != seg.window { 902 return false 903 } 904 905 // Inflate the congestion window if we're getting duplicate acks 906 // for the packet we retransmitted. 907 if ack == s.fr.first { 908 // We received a dup, inflate the congestion window by 1 packet 909 // if we're not at the max yet. Only inflate the window if 910 // regular FastRecovery is in use, RFC6675 does not require 911 // inflating cwnd on duplicate ACKs. 912 if s.sndCwnd < s.fr.maxCwnd { 913 s.sndCwnd++ 914 } 915 return false 916 } 917 918 // A partial ack was received. Retransmit this packet and 919 // remember it so that we don't retransmit it again. We don't 920 // inflate the window because we're putting the same packet back 921 // onto the wire. 922 // 923 // N.B. The retransmit timer will be reset by the caller. 924 s.fr.first = ack 925 s.dupAckCount = 0 926 return true 927 } 928 929 // isAssignedSequenceNumber relies on the fact that we only set flags once a 930 // sequencenumber is assigned and that is only done right before we send the 931 // segment. As a result any segment that has a non-zero flag has a valid 932 // sequence number assigned to it. 933 func (s *sender) isAssignedSequenceNumber(seg *segment) bool { 934 return seg.flags != 0 935 } 936 937 // SetPipe implements the SetPipe() function described in RFC6675. Netstack 938 // maintains the congestion window in number of packets and not bytes, so 939 // SetPipe() here measures number of outstanding packets rather than actual 940 // outstanding bytes in the network. 941 func (s *sender) SetPipe() { 942 // If SACK isn't permitted or it is permitted but recovery is not active 943 // then ignore pipe calculations. 944 if !s.ep.sackPermitted || !s.fr.active { 945 return 946 } 947 pipe := 0 948 smss := seqnum.Size(s.ep.scoreboard.SMSS()) 949 for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { 950 // With GSO each segment can be much larger than SMSS. So check the segment 951 // in SMSS sized ranges. 952 segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size())) 953 for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { 954 endSeq := startSeq.Add(smss) 955 if segEnd.LessThan(endSeq) { 956 endSeq = segEnd 957 } 958 sb := header.SACKBlock{startSeq, endSeq} 959 // SetPipe(): 960 // 961 // After initializing pipe to zero, the following steps are 962 // taken for each octet 'S1' in the sequence space between 963 // HighACK and HighData that has not been SACKed: 964 if !s1.sequenceNumber.LessThan(s.sndNxt) { 965 break 966 } 967 if s.ep.scoreboard.IsSACKED(sb) { 968 continue 969 } 970 971 // SetPipe(): 972 // 973 // (a) If IsLost(S1) returns false, Pipe is incremened by 1. 974 // 975 // NOTE: here we mark the whole segment as lost. We do not try 976 // and test every byte in our write buffer as we maintain our 977 // pipe in terms of oustanding packets and not bytes. 978 if !s.ep.scoreboard.IsRangeLost(sb) { 979 pipe++ 980 } 981 // SetPipe(): 982 // (b) If S1 <= HighRxt, Pipe is incremented by 1. 983 if s1.sequenceNumber.LessThanEq(s.fr.highRxt) { 984 pipe++ 985 } 986 } 987 } 988 s.outstanding = pipe 989 } 990 991 // checkDuplicateAck is called when an ack is received. It manages the state 992 // related to duplicate acks and determines if a retransmit is needed according 993 // to the rules in RFC 6582 (NewReno). 994 func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { 995 ack := seg.ackNumber 996 if s.fr.active { 997 return s.handleFastRecovery(seg) 998 } 999 1000 // We're not in fast recovery yet. A segment is considered a duplicate 1001 // only if it doesn't carry any data and doesn't update the send window, 1002 // because if it does, it wasn't sent in response to an out-of-order 1003 // segment. If SACK is enabled then we have an additional check to see 1004 // if the segment carries new SACK information. If it does then it is 1005 // considered a duplicate ACK as per RFC6675. 1006 if ack != s.sndUna || seg.logicalLen() != 0 || s.sndWnd != seg.window || ack == s.sndNxt { 1007 if !s.ep.sackPermitted || !seg.hasNewSACKInfo { 1008 s.dupAckCount = 0 1009 return false 1010 } 1011 } 1012 1013 s.dupAckCount++ 1014 1015 // Do not enter fast recovery until we reach nDupAckThreshold or the 1016 // first unacknowledged byte is considered lost as per SACK scoreboard. 1017 if s.dupAckCount < nDupAckThreshold || (s.ep.sackPermitted && !s.ep.scoreboard.IsLost(s.sndUna)) { 1018 // RFC 6675 Step 3. 1019 s.fr.highRxt = s.sndUna - 1 1020 // Do run SetPipe() to calculate the outstanding segments. 1021 s.SetPipe() 1022 s.state = Disorder 1023 return false 1024 } 1025 1026 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 1027 // 1028 // We only do the check here, the incrementing of last to the highest 1029 // sequence number transmitted till now is done when enterFastRecovery 1030 // is invoked. 1031 if !s.fr.last.LessThan(seg.ackNumber) { 1032 s.dupAckCount = 0 1033 return false 1034 } 1035 s.cc.HandleNDupAcks() 1036 s.enterFastRecovery() 1037 s.dupAckCount = 0 1038 return true 1039 } 1040 1041 // handleRcvdSegment is called when a segment is received; it is responsible for 1042 // updating the send-related state. 1043 func (s *sender) handleRcvdSegment(seg *segment) { 1044 // Check if we can extract an RTT measurement from this ack. 1045 if !seg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(seg.ackNumber) { 1046 s.updateRTO(time.Now().Sub(s.rttMeasureTime)) 1047 s.rttMeasureSeqNum = s.sndNxt 1048 } 1049 1050 // Update Timestamp if required. See RFC7323, section-4.3. 1051 if s.ep.sendTSOk && seg.parsedOptions.TS { 1052 s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber) 1053 } 1054 1055 // Insert SACKBlock information into our scoreboard. 1056 if s.ep.sackPermitted { 1057 for _, sb := range seg.parsedOptions.SACKBlocks { 1058 // Only insert the SACK block if the following holds 1059 // true: 1060 // * SACK block acks data after the ack number in the 1061 // current segment. 1062 // * SACK block represents a sequence 1063 // between sndUna and sndNxt (i.e. data that is 1064 // currently unacked and in-flight). 1065 // * SACK block that has not been SACKed already. 1066 // 1067 // NOTE: This check specifically excludes DSACK blocks 1068 // which have start/end before sndUna and are used to 1069 // indicate spurious retransmissions. 1070 if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { 1071 s.ep.scoreboard.Insert(sb) 1072 seg.hasNewSACKInfo = true 1073 } 1074 } 1075 s.SetPipe() 1076 } 1077 1078 // Count the duplicates and do the fast retransmit if needed. 1079 rtx := s.checkDuplicateAck(seg) 1080 1081 // Stash away the current window size. 1082 s.sndWnd = seg.window 1083 1084 // Ignore ack if it doesn't acknowledge any new data. 1085 ack := seg.ackNumber 1086 if (ack - 1).InRange(s.sndUna, s.sndNxt) { 1087 s.dupAckCount = 0 1088 1089 // See : https://tools.ietf.org/html/rfc1323#section-3.3. 1090 // Specifically we should only update the RTO using TSEcr if the 1091 // following condition holds: 1092 // 1093 // A TSecr value received in a segment is used to update the 1094 // averaged RTT measurement only if the segment acknowledges 1095 // some new data, i.e., only if it advances the left edge of 1096 // the send window. 1097 if s.ep.sendTSOk && seg.parsedOptions.TSEcr != 0 { 1098 // TSVal/Ecr values sent by Netstack are at a millisecond 1099 // granularity. 1100 elapsed := time.Duration(s.ep.timestamp()-seg.parsedOptions.TSEcr) * time.Millisecond 1101 s.updateRTO(elapsed) 1102 } 1103 1104 // When an ack is received we must rearm the timer. 1105 // RFC 6298 5.2 1106 s.resendTimer.enable(s.rto) 1107 1108 // Remove all acknowledged data from the write list. 1109 acked := s.sndUna.Size(ack) 1110 s.sndUna = ack 1111 1112 ackLeft := acked 1113 originalOutstanding := s.outstanding 1114 for ackLeft > 0 { 1115 // We use logicalLen here because we can have FIN 1116 // segments (which are always at the end of list) that 1117 // have no data, but do consume a sequence number. 1118 seg := s.writeList.Front() 1119 datalen := seg.logicalLen() 1120 1121 if datalen > ackLeft { 1122 prevCount := s.pCount(seg) 1123 seg.data.TrimFront(int(ackLeft)) 1124 seg.sequenceNumber.UpdateForward(ackLeft) 1125 s.outstanding -= prevCount - s.pCount(seg) 1126 break 1127 } 1128 1129 if s.writeNext == seg { 1130 s.writeNext = seg.Next() 1131 } 1132 s.writeList.Remove(seg) 1133 1134 // if SACK is enabled then Only reduce outstanding if 1135 // the segment was not previously SACKED as these have 1136 // already been accounted for in SetPipe(). 1137 if !s.ep.sackPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 1138 s.outstanding -= s.pCount(seg) 1139 } 1140 seg.decRef() 1141 ackLeft -= datalen 1142 } 1143 1144 // Update the send buffer usage and notify potential waiters. 1145 s.ep.updateSndBufferUsage(int(acked)) 1146 1147 // Clear SACK information for all acked data. 1148 s.ep.scoreboard.Delete(s.sndUna) 1149 1150 // If we are not in fast recovery then update the congestion 1151 // window based on the number of acknowledged packets. 1152 if !s.fr.active { 1153 s.cc.Update(originalOutstanding - s.outstanding) 1154 if s.fr.last.LessThan(s.sndUna) { 1155 s.state = Open 1156 } 1157 } 1158 1159 // It is possible for s.outstanding to drop below zero if we get 1160 // a retransmit timeout, reset outstanding to zero but later 1161 // get an ack that cover previously sent data. 1162 if s.outstanding < 0 { 1163 s.outstanding = 0 1164 } 1165 1166 s.SetPipe() 1167 1168 // If all outstanding data was acknowledged the disable the timer. 1169 // RFC 6298 Rule 5.3 1170 if s.sndUna == s.sndNxt { 1171 s.outstanding = 0 1172 s.resendTimer.disable() 1173 } 1174 } 1175 // Now that we've popped all acknowledged data from the retransmit 1176 // queue, retransmit if needed. 1177 if rtx { 1178 s.resendSegment() 1179 } 1180 1181 // Send more data now that some of the pending data has been ack'd, or 1182 // that the window opened up, or the congestion window was inflated due 1183 // to a duplicate ack during fast recovery. This will also re-enable 1184 // the retransmit timer if needed. 1185 if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || seg.hasNewSACKInfo { 1186 s.sendData() 1187 } 1188 } 1189 1190 // sendSegment sends the specified segment. 1191 func (s *sender) sendSegment(seg *segment) *tcpip.Error { 1192 if !seg.xmitTime.IsZero() { 1193 s.ep.stack.Stats().TCP.Retransmits.Increment() 1194 s.ep.stats.SendErrors.Retransmits.Increment() 1195 if s.sndCwnd < s.sndSsthresh { 1196 s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() 1197 } 1198 } 1199 seg.xmitTime = time.Now() 1200 return s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) 1201 } 1202 1203 // sendSegmentFromView sends a new segment containing the given payload, flags 1204 // and sequence number. 1205 func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq seqnum.Value) *tcpip.Error { 1206 s.lastSendTime = time.Now() 1207 if seq == s.rttMeasureSeqNum { 1208 s.rttMeasureTime = s.lastSendTime 1209 } 1210 1211 rcvNxt, rcvWnd := s.ep.rcv.getSendParams() 1212 1213 // Remember the max sent ack. 1214 s.maxSentAck = rcvNxt 1215 1216 // Every time a packet containing data is sent (including a 1217 // retransmission), if SACK is enabled then use the conservative timer 1218 // described in RFC6675 Section 4.0, otherwise follow the standard time 1219 // described in RFC6298 Section 5.2. 1220 if data.Size() != 0 { 1221 if s.ep.sackPermitted { 1222 s.resendTimer.enable(s.rto) 1223 } else { 1224 if !s.resendTimer.enabled() { 1225 s.resendTimer.enable(s.rto) 1226 } 1227 } 1228 } 1229 1230 return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) 1231 }