github.com/flowerwrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/transport/tcp/snd.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "math" 19 "sync" 20 "sync/atomic" 21 "time" 22 23 "github.com/FlowerWrong/netstack/sleep" 24 "github.com/FlowerWrong/netstack/tcpip" 25 "github.com/FlowerWrong/netstack/tcpip/buffer" 26 "github.com/FlowerWrong/netstack/tcpip/header" 27 "github.com/FlowerWrong/netstack/tcpip/seqnum" 28 ) 29 30 const ( 31 // minRTO is the minimum allowed value for the retransmit timeout. 32 minRTO = 200 * time.Millisecond 33 34 // InitialCwnd is the initial congestion window. 35 InitialCwnd = 10 36 37 // nDupAckThreshold is the number of duplicate ACK's required 38 // before fast-retransmit is entered. 39 nDupAckThreshold = 3 40 ) 41 42 // ccState indicates the current congestion control state for this sender. 43 type ccState int 44 45 const ( 46 // Open indicates that the sender is receiving acks in order and 47 // no loss or dupACK's etc have been detected. 48 Open ccState = iota 49 // RTORecovery indicates that an RTO has occurred and the sender 50 // has entered an RTO based recovery phase. 51 RTORecovery 52 // FastRecovery indicates that the sender has entered FastRecovery 53 // based on receiving nDupAck's. This state is entered only when 54 // SACK is not in use. 55 FastRecovery 56 // SACKRecovery indicates that the sender has entered SACK based 57 // recovery. 58 SACKRecovery 59 // Disorder indicates the sender either received some SACK blocks 60 // or dupACK's. 61 Disorder 62 ) 63 64 // congestionControl is an interface that must be implemented by any supported 65 // congestion control algorithm. 66 type congestionControl interface { 67 // HandleNDupAcks is invoked when sender.dupAckCount >= nDupAckThreshold 68 // just before entering fast retransmit. 69 HandleNDupAcks() 70 71 // HandleRTOExpired is invoked when the retransmit timer expires. 72 HandleRTOExpired() 73 74 // Update is invoked when processing inbound acks. It's passed the 75 // number of packet's that were acked by the most recent cumulative 76 // acknowledgement. 77 Update(packetsAcked int) 78 79 // PostRecovery is invoked when the sender is exiting a fast retransmit/ 80 // recovery phase. This provides congestion control algorithms a way 81 // to adjust their state when exiting recovery. 82 PostRecovery() 83 } 84 85 // sender holds the state necessary to send TCP segments. 86 // 87 // +stateify savable 88 type sender struct { 89 ep *endpoint 90 91 // lastSendTime is the timestamp when the last packet was sent. 92 lastSendTime time.Time 93 94 // dupAckCount is the number of duplicated acks received. It is used for 95 // fast retransmit. 96 dupAckCount int 97 98 // fr holds state related to fast recovery. 99 fr fastRecovery 100 101 // sndCwnd is the congestion window, in packets. 102 sndCwnd int 103 104 // sndSsthresh is the threshold between slow start and congestion 105 // avoidance. 106 sndSsthresh int 107 108 // sndCAAckCount is the number of packets acknowledged during congestion 109 // avoidance. When enough packets have been ack'd (typically cwnd 110 // packets), the congestion window is incremented by one. 111 sndCAAckCount int 112 113 // outstanding is the number of outstanding packets, that is, packets 114 // that have been sent but not yet acknowledged. 115 outstanding int 116 117 // sndWnd is the send window size. 118 sndWnd seqnum.Size 119 120 // sndUna is the next unacknowledged sequence number. 121 sndUna seqnum.Value 122 123 // sndNxt is the sequence number of the next segment to be sent. 124 sndNxt seqnum.Value 125 126 // sndNxtList is the sequence number of the next segment to be added to 127 // the send list. 128 sndNxtList seqnum.Value 129 130 // rttMeasureSeqNum is the sequence number being used for the latest RTT 131 // measurement. 132 rttMeasureSeqNum seqnum.Value 133 134 // rttMeasureTime is the time when the rttMeasureSeqNum was sent. 135 rttMeasureTime time.Time 136 137 closed bool 138 writeNext *segment 139 writeList segmentList 140 resendTimer timer 141 resendWaker sleep.Waker 142 143 // rtt.srtt, rtt.rttvar, and rto are the "smoothed round-trip time", 144 // "round-trip time variation" and "retransmit timeout", as defined in 145 // section 2 of RFC 6298. 146 rtt rtt 147 rto time.Duration 148 149 // maxPayloadSize is the maximum size of the payload of a given segment. 150 // It is initialized on demand. 151 maxPayloadSize int 152 153 // gso is set if generic segmentation offload is enabled. 154 gso bool 155 156 // sndWndScale is the number of bits to shift left when reading the send 157 // window size from a segment. 158 sndWndScale uint8 159 160 // maxSentAck is the maxium acknowledgement actually sent. 161 maxSentAck seqnum.Value 162 163 // state is the current state of congestion control for this endpoint. 164 state ccState 165 166 // cc is the congestion control algorithm in use for this sender. 167 cc congestionControl 168 } 169 170 // rtt is a synchronization wrapper used to appease stateify. See the comment 171 // in sender, where it is used. 172 // 173 // +stateify savable 174 type rtt struct { 175 sync.Mutex 176 177 srtt time.Duration 178 rttvar time.Duration 179 srttInited bool 180 } 181 182 // fastRecovery holds information related to fast recovery from a packet loss. 183 // 184 // +stateify savable 185 type fastRecovery struct { 186 // active whether the endpoint is in fast recovery. The following fields 187 // are only meaningful when active is true. 188 active bool 189 190 // first and last represent the inclusive sequence number range being 191 // recovered. 192 first seqnum.Value 193 last seqnum.Value 194 195 // maxCwnd is the maximum value the congestion window may be inflated to 196 // due to duplicate acks. This exists to avoid attacks where the 197 // receiver intentionally sends duplicate acks to artificially inflate 198 // the sender's cwnd. 199 maxCwnd int 200 201 // highRxt is the highest sequence number which has been retransmitted 202 // during the current loss recovery phase. 203 // See: RFC 6675 Section 2 for details. 204 highRxt seqnum.Value 205 206 // rescueRxt is the highest sequence number which has been 207 // optimistically retransmitted to prevent stalling of the ACK clock 208 // when there is loss at the end of the window and no new data is 209 // available for transmission. 210 // See: RFC 6675 Section 2 for details. 211 rescueRxt seqnum.Value 212 } 213 214 func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { 215 // The sender MUST reduce the TCP data length to account for any IP or 216 // TCP options that it is including in the packets that it sends. 217 // See: https://tools.ietf.org/html/rfc6691#section-2 218 maxPayloadSize := int(mss) - ep.maxOptionSize() 219 220 s := &sender{ 221 ep: ep, 222 sndWnd: sndWnd, 223 sndUna: iss + 1, 224 sndNxt: iss + 1, 225 sndNxtList: iss + 1, 226 rto: 1 * time.Second, 227 rttMeasureSeqNum: iss + 1, 228 lastSendTime: time.Now(), 229 maxPayloadSize: maxPayloadSize, 230 maxSentAck: irs + 1, 231 fr: fastRecovery{ 232 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. 233 last: iss, 234 highRxt: iss, 235 rescueRxt: iss, 236 }, 237 gso: ep.gso != nil, 238 } 239 240 if s.gso { 241 s.ep.gso.MSS = uint16(maxPayloadSize) 242 } 243 244 s.cc = s.initCongestionControl(ep.cc) 245 246 // A negative sndWndScale means that no scaling is in use, otherwise we 247 // store the scaling value. 248 if sndWndScale > 0 { 249 s.sndWndScale = uint8(sndWndScale) 250 } 251 252 s.resendTimer.init(&s.resendWaker) 253 254 s.updateMaxPayloadSize(int(ep.route.MTU()), 0) 255 256 // Initialize SACK Scoreboard after updating max payload size as we use 257 // the maxPayloadSize as the smss when determining if a segment is lost 258 // etc. 259 s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss) 260 261 return s 262 } 263 264 // initCongestionControl initializes the specified congestion control module and 265 // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to 266 // their initial values. 267 func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { 268 s.sndCwnd = InitialCwnd 269 s.sndSsthresh = math.MaxInt64 270 271 switch congestionControlName { 272 case ccCubic: 273 return newCubicCC(s) 274 case ccReno: 275 fallthrough 276 default: 277 return newRenoCC(s) 278 } 279 } 280 281 // updateMaxPayloadSize updates the maximum payload size based on the given 282 // MTU. If this is in response to "packet too big" control packets (indicated 283 // by the count argument), it also reduces the number of outstanding packets and 284 // attempts to retransmit the first packet above the MTU size. 285 func (s *sender) updateMaxPayloadSize(mtu, count int) { 286 m := mtu - header.TCPMinimumSize 287 288 m -= s.ep.maxOptionSize() 289 290 // We don't adjust up for now. 291 if m >= s.maxPayloadSize { 292 return 293 } 294 295 // Make sure we can transmit at least one byte. 296 if m <= 0 { 297 m = 1 298 } 299 300 s.maxPayloadSize = m 301 if s.gso { 302 s.ep.gso.MSS = uint16(m) 303 } 304 305 if count == 0 { 306 // updateMaxPayloadSize is also called when the sender is created. 307 // and there is no data to send in such cases. Return immediately. 308 return 309 } 310 311 // Update the scoreboard's smss to reflect the new lowered 312 // maxPayloadSize. 313 s.ep.scoreboard.smss = uint16(m) 314 315 s.outstanding -= count 316 if s.outstanding < 0 { 317 s.outstanding = 0 318 } 319 320 // Rewind writeNext to the first segment exceeding the MTU. Do nothing 321 // if it is already before such a packet. 322 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 323 if seg == s.writeNext { 324 // We got to writeNext before we could find a segment 325 // exceeding the MTU. 326 break 327 } 328 329 if seg.data.Size() > m { 330 // We found a segment exceeding the MTU. Rewind 331 // writeNext and try to retransmit it. 332 s.writeNext = seg 333 break 334 } 335 } 336 337 // Since we likely reduced the number of outstanding packets, we may be 338 // ready to send some more. 339 s.sendData() 340 } 341 342 // sendAck sends an ACK segment. 343 func (s *sender) sendAck() { 344 s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.sndNxt) 345 } 346 347 // updateRTO updates the retransmit timeout when a new roud-trip time is 348 // available. This is done in accordance with section 2 of RFC 6298. 349 func (s *sender) updateRTO(rtt time.Duration) { 350 s.rtt.Lock() 351 if !s.rtt.srttInited { 352 s.rtt.rttvar = rtt / 2 353 s.rtt.srtt = rtt 354 s.rtt.srttInited = true 355 } else { 356 diff := s.rtt.srtt - rtt 357 if diff < 0 { 358 diff = -diff 359 } 360 // Use RFC6298 standard algorithm to update rttvar and srtt when 361 // no timestamps are available. 362 if !s.ep.sendTSOk { 363 s.rtt.rttvar = (3*s.rtt.rttvar + diff) / 4 364 s.rtt.srtt = (7*s.rtt.srtt + rtt) / 8 365 } else { 366 // When we are taking RTT measurements of every ACK then 367 // we need to use a modified method as specified in 368 // https://tools.ietf.org/html/rfc7323#appendix-G 369 if s.outstanding == 0 { 370 s.rtt.Unlock() 371 return 372 } 373 // Netstack measures congestion window/inflight all in 374 // terms of packets and not bytes. This is similar to 375 // how linux also does cwnd and inflight. In practice 376 // this approximation works as expected. 377 expectedSamples := math.Ceil(float64(s.outstanding) / 2) 378 379 // alpha & beta values are the original values as recommended in 380 // https://tools.ietf.org/html/rfc6298#section-2.3. 381 const alpha = 0.125 382 const beta = 0.25 383 384 alphaPrime := alpha / expectedSamples 385 betaPrime := beta / expectedSamples 386 rttVar := (1-betaPrime)*s.rtt.rttvar.Seconds() + betaPrime*diff.Seconds() 387 srtt := (1-alphaPrime)*s.rtt.srtt.Seconds() + alphaPrime*rtt.Seconds() 388 s.rtt.rttvar = time.Duration(rttVar * float64(time.Second)) 389 s.rtt.srtt = time.Duration(srtt * float64(time.Second)) 390 } 391 } 392 393 s.rto = s.rtt.srtt + 4*s.rtt.rttvar 394 s.rtt.Unlock() 395 if s.rto < minRTO { 396 s.rto = minRTO 397 } 398 } 399 400 // resendSegment resends the first unacknowledged segment. 401 func (s *sender) resendSegment() { 402 // Don't use any segments we already sent to measure RTT as they may 403 // have been affected by packets being lost. 404 s.rttMeasureSeqNum = s.sndNxt 405 406 // Resend the segment. 407 if seg := s.writeList.Front(); seg != nil { 408 if seg.data.Size() > s.maxPayloadSize { 409 s.splitSeg(seg, s.maxPayloadSize) 410 } 411 412 // See: RFC 6675 section 5 Step 4.3 413 // 414 // To prevent retransmission, set both the HighRXT and RescueRXT 415 // to the highest sequence number in the retransmitted segment. 416 s.fr.highRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 417 s.fr.rescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 418 s.sendSegment(seg) 419 s.ep.stack.Stats().TCP.FastRetransmit.Increment() 420 421 // Run SetPipe() as per RFC 6675 section 5 Step 4.4 422 s.SetPipe() 423 } 424 } 425 426 // retransmitTimerExpired is called when the retransmit timer expires, and 427 // unacknowledged segments are assumed lost, and thus need to be resent. 428 // Returns true if the connection is still usable, or false if the connection 429 // is deemed lost. 430 func (s *sender) retransmitTimerExpired() bool { 431 // Check if the timer actually expired or if it's a spurious wake due 432 // to a previously orphaned runtime timer. 433 if !s.resendTimer.checkExpiration() { 434 return true 435 } 436 437 s.ep.stack.Stats().TCP.Timeouts.Increment() 438 439 // Give up if we've waited more than a minute since the last resend. 440 if s.rto >= 60*time.Second { 441 return false 442 } 443 444 // Set new timeout. The timer will be restarted by the call to sendData 445 // below. 446 s.rto *= 2 447 448 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. 449 // 450 // Retransmit timeouts: 451 // After a retransmit timeout, record the highest sequence number 452 // transmitted in the variable recover, and exit the fast recovery 453 // procedure if applicable. 454 s.fr.last = s.sndNxt - 1 455 456 if s.fr.active { 457 // We were attempting fast recovery but were not successful. 458 // Leave the state. We don't need to update ssthresh because it 459 // has already been updated when entered fast-recovery. 460 s.leaveFastRecovery() 461 } 462 463 s.state = RTORecovery 464 s.cc.HandleRTOExpired() 465 466 // Mark the next segment to be sent as the first unacknowledged one and 467 // start sending again. Set the number of outstanding packets to 0 so 468 // that we'll be able to retransmit. 469 // 470 // We'll keep on transmitting (or retransmitting) as we get acks for 471 // the data we transmit. 472 s.outstanding = 0 473 474 // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 475 // 476 // In order to avoid memory deadlocks, the TCP receiver is allowed to 477 // discard data that has already been selectively acknowledged. As a 478 // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK 479 // information gathered from a receiver upon a retransmission timeout 480 // (RTO) "since the timeout might indicate that the data receiver has 481 // reneged." Additionally, a TCP sender MUST "ignore prior SACK 482 // information in determining which data to retransmit." 483 // 484 // NOTE: We take the stricter interpretation and just expunge all 485 // information as we lack more rigorous checks to validate if the SACK 486 // information is usable after an RTO. 487 s.ep.scoreboard.Reset() 488 s.writeNext = s.writeList.Front() 489 s.sendData() 490 491 return true 492 } 493 494 // pCount returns the number of packets in the segment. Due to GSO, a segment 495 // can be composed of multiple packets. 496 func (s *sender) pCount(seg *segment) int { 497 size := seg.data.Size() 498 if size == 0 { 499 return 1 500 } 501 502 return (size-1)/s.maxPayloadSize + 1 503 } 504 505 // splitSeg splits a given segment at the size specified and inserts the 506 // remainder as a new segment after the current one in the write list. 507 func (s *sender) splitSeg(seg *segment, size int) { 508 if seg.data.Size() <= size { 509 return 510 } 511 // Split this segment up. 512 nSeg := seg.clone() 513 nSeg.data.TrimFront(size) 514 nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) 515 s.writeList.InsertAfter(seg, nSeg) 516 seg.data.CapLength(size) 517 } 518 519 // NextSeg implements the RFC6675 NextSeg() operation. It returns segments that 520 // match rule 1, 3 and 4 of the NextSeg() operation defined in RFC6675. Rule 2 521 // is handled by the normal send logic. 522 func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { 523 var s3 *segment 524 var s4 *segment 525 smss := s.ep.scoreboard.SMSS() 526 // Step 1. 527 for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { 528 if !s.isAssignedSequenceNumber(seg) { 529 break 530 } 531 segSeq := seg.sequenceNumber 532 if seg.data.Size() > int(smss) { 533 s.splitSeg(seg, int(smss)) 534 } 535 // See RFC 6675 Section 4 536 // 537 // 1. If there exists a smallest unSACKED sequence number 538 // 'S2' that meets the following 3 criteria for determinig 539 // loss, the sequence range of one segment of up to SMSS 540 // octects starting with S2 MUST be returned. 541 if !s.ep.scoreboard.IsSACKED(header.SACKBlock{segSeq, segSeq.Add(1)}) { 542 // NextSeg(): 543 // 544 // (1.a) S2 is greater than HighRxt 545 // (1.b) S2 is less than highest octect covered by 546 // any received SACK. 547 if s.fr.highRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) { 548 // NextSeg(): 549 // (1.c) IsLost(S2) returns true. 550 if s.ep.scoreboard.IsLost(segSeq) { 551 return seg, s3, s4 552 } 553 // NextSeg(): 554 // 555 // (3): If the conditions for rules (1) and (2) 556 // fail, but there exists an unSACKed sequence 557 // number S3 that meets the criteria for 558 // detecting loss given in steps 1.a and 1.b 559 // above (specifically excluding (1.c)) then one 560 // segment of upto SMSS octets starting with S3 561 // SHOULD be returned. 562 if s3 == nil { 563 s3 = seg 564 } 565 } 566 // NextSeg(): 567 // 568 // (4) If the conditions for (1), (2) and (3) fail, 569 // but there exists outstanding unSACKED data, we 570 // provide the opportunity for a single "rescue" 571 // retransmission per entry into loss recovery. If 572 // HighACK is greater than RescueRxt, the one 573 // segment of upto SMSS octects that MUST include 574 // the highest outstanding unSACKed sequence number 575 // SHOULD be returned. 576 if s.fr.rescueRxt.LessThan(s.sndUna - 1) { 577 if s4 != nil { 578 if s4.sequenceNumber.LessThan(segSeq) { 579 s4 = seg 580 } 581 } else { 582 s4 = seg 583 } 584 s.fr.rescueRxt = s.fr.last 585 } 586 } 587 } 588 589 return nil, s3, s4 590 } 591 592 // maybeSendSegment tries to send the specified segment and either coalesces 593 // other segments into this one or splits the specified segment based on the 594 // lower of the specified limit value or the receivers window size specified by 595 // end. 596 func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) { 597 // We abuse the flags field to determine if we have already 598 // assigned a sequence number to this segment. 599 if !s.isAssignedSequenceNumber(seg) { 600 // Merge segments if allowed. 601 if seg.data.Size() != 0 { 602 available := int(seg.sequenceNumber.Size(end)) 603 if available > limit { 604 available = limit 605 } 606 607 // nextTooBig indicates that the next segment was too 608 // large to entirely fit in the current segment. It 609 // would be possible to split the next segment and merge 610 // the portion that fits, but unexpectedly splitting 611 // segments can have user visible side-effects which can 612 // break applications. For example, RFC 7766 section 8 613 // says that the length and data of a DNS response 614 // should be sent in the same TCP segment to avoid 615 // triggering bugs in poorly written DNS 616 // implementations. 617 var nextTooBig bool 618 for seg.Next() != nil && seg.Next().data.Size() != 0 { 619 if seg.data.Size()+seg.Next().data.Size() > available { 620 nextTooBig = true 621 break 622 } 623 seg.data.Append(seg.Next().data) 624 625 // Consume the segment that we just merged in. 626 s.writeList.Remove(seg.Next()) 627 } 628 if !nextTooBig && seg.data.Size() < available { 629 // Segment is not full. 630 if s.outstanding > 0 && atomic.LoadUint32(&s.ep.delay) != 0 { 631 // Nagle's algorithm. From Wikipedia: 632 // Nagle's algorithm works by 633 // combining a number of small 634 // outgoing messages and sending them 635 // all at once. Specifically, as long 636 // as there is a sent packet for which 637 // the sender has received no 638 // acknowledgment, the sender should 639 // keep buffering its output until it 640 // has a full packet's worth of 641 // output, thus allowing output to be 642 // sent all at once. 643 return false 644 } 645 if atomic.LoadUint32(&s.ep.cork) != 0 { 646 // Hold back the segment until full. 647 return false 648 } 649 } 650 } 651 652 // Assign flags. We don't do it above so that we can merge 653 // additional data if Nagle holds the segment. 654 seg.sequenceNumber = s.sndNxt 655 seg.flags = header.TCPFlagAck | header.TCPFlagPsh 656 } 657 658 var segEnd seqnum.Value 659 if seg.data.Size() == 0 { 660 if s.writeList.Back() != seg { 661 panic("FIN segments must be the final segment in the write list.") 662 } 663 seg.flags = header.TCPFlagAck | header.TCPFlagFin 664 segEnd = seg.sequenceNumber.Add(1) 665 // Transition to FIN-WAIT1 state since we're initiating an active close. 666 s.ep.mu.Lock() 667 switch s.ep.state { 668 case StateCloseWait: 669 // We've already received a FIN and are now sending our own. The 670 // sender is now awaiting a final ACK for this FIN. 671 s.ep.state = StateLastAck 672 default: 673 s.ep.state = StateFinWait1 674 } 675 s.ep.mu.Unlock() 676 } else { 677 // We're sending a non-FIN segment. 678 if seg.flags&header.TCPFlagFin != 0 { 679 panic("Netstack queues FIN segments without data.") 680 } 681 682 if !seg.sequenceNumber.LessThan(end) { 683 return false 684 } 685 686 available := int(seg.sequenceNumber.Size(end)) 687 if available == 0 { 688 return false 689 } 690 if available > limit { 691 available = limit 692 } 693 694 if seg.data.Size() > available { 695 s.splitSeg(seg, available) 696 } 697 698 segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) 699 } 700 701 s.sendSegment(seg) 702 703 // Update sndNxt if we actually sent new data (as opposed to 704 // retransmitting some previously sent data). 705 if s.sndNxt.LessThan(segEnd) { 706 s.sndNxt = segEnd 707 } 708 709 return true 710 } 711 712 // handleSACKRecovery implements the loss recovery phase as described in RFC6675 713 // section 5, step C. 714 func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { 715 s.SetPipe() 716 for s.outstanding < s.sndCwnd { 717 nextSeg, s3, s4 := s.NextSeg() 718 if nextSeg == nil { 719 // NextSeg(): 720 // 721 // Step (2): "If no sequence number 'S2' per rule (1) 722 // exists but there exists available unsent data and the 723 // receiver's advertised window allows, the sequence 724 // range of one segment of up to SMSS octets of 725 // previously unsent data starting with sequence number 726 // HighData+1 MUST be returned." 727 for seg := s.writeNext; seg != nil; seg = seg.Next() { 728 if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) { 729 continue 730 } 731 // Step C.3 described below is handled by 732 // maybeSendSegment which increments sndNxt when 733 // a segment is transmitted. 734 // 735 // Step C.3 "If any of the data octets sent in 736 // (C.1) are above HighData, HighData must be 737 // updated to reflect the transmission of 738 // previously unsent data." 739 if sent := s.maybeSendSegment(seg, limit, end); !sent { 740 break 741 } 742 dataSent = true 743 s.outstanding++ 744 s.writeNext = seg.Next() 745 nextSeg = seg 746 break 747 } 748 if nextSeg != nil { 749 continue 750 } 751 } 752 rescueRtx := false 753 if nextSeg == nil && s3 != nil { 754 nextSeg = s3 755 } 756 if nextSeg == nil && s4 != nil { 757 nextSeg = s4 758 rescueRtx = true 759 } 760 if nextSeg == nil { 761 break 762 } 763 segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) 764 if !rescueRtx && nextSeg.sequenceNumber.LessThan(s.sndNxt) { 765 // RFC 6675, Step C.2 766 // 767 // "If any of the data octets sent in (C.1) are below 768 // HighData, HighRxt MUST be set to the highest sequence 769 // number of the retransmitted segment unless NextSeg () 770 // rule (4) was invoked for this retransmission." 771 s.fr.highRxt = segEnd - 1 772 } 773 774 // RFC 6675, Step C.4. 775 // 776 // "The estimate of the amount of data outstanding in the network 777 // must be updated by incrementing pipe by the number of octets 778 // transmitted in (C.1)." 779 s.outstanding++ 780 dataSent = true 781 s.sendSegment(nextSeg) 782 } 783 return dataSent 784 } 785 786 // sendData sends new data segments. It is called when data becomes available or 787 // when the send window opens up. 788 func (s *sender) sendData() { 789 limit := s.maxPayloadSize 790 if s.gso { 791 limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize) 792 } 793 end := s.sndUna.Add(s.sndWnd) 794 795 // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. 796 // "A TCP SHOULD set cwnd to no more than RW before beginning 797 // transmission if the TCP has not sent data in the interval exceeding 798 // the retrasmission timeout." 799 if !s.fr.active && time.Now().Sub(s.lastSendTime) > s.rto { 800 if s.sndCwnd > InitialCwnd { 801 s.sndCwnd = InitialCwnd 802 } 803 } 804 805 var dataSent bool 806 807 // RFC 6675 recovery algorithm step C 1-5. 808 if s.fr.active && s.ep.sackPermitted { 809 dataSent = s.handleSACKRecovery(s.maxPayloadSize, end) 810 } else { 811 for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { 812 cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize 813 if cwndLimit < limit { 814 limit = cwndLimit 815 } 816 if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 817 continue 818 } 819 if sent := s.maybeSendSegment(seg, limit, end); !sent { 820 break 821 } 822 dataSent = true 823 s.outstanding += s.pCount(seg) 824 s.writeNext = seg.Next() 825 } 826 } 827 828 if dataSent { 829 // We sent data, so we should stop the keepalive timer to ensure 830 // that no keepalives are sent while there is pending data. 831 s.ep.disableKeepaliveTimer() 832 } 833 834 // Enable the timer if we have pending data and it's not enabled yet. 835 if !s.resendTimer.enabled() && s.sndUna != s.sndNxt { 836 s.resendTimer.enable(s.rto) 837 } 838 // If we have no more pending data, start the keepalive timer. 839 if s.sndUna == s.sndNxt { 840 s.ep.resetKeepaliveTimer(false) 841 } 842 } 843 844 func (s *sender) enterFastRecovery() { 845 s.fr.active = true 846 // Save state to reflect we're now in fast recovery. 847 // 848 // See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3. 849 // We inflate the cwnd by 3 to account for the 3 packets which triggered 850 // the 3 duplicate ACKs and are now not in flight. 851 s.sndCwnd = s.sndSsthresh + 3 852 s.fr.first = s.sndUna 853 s.fr.last = s.sndNxt - 1 854 s.fr.maxCwnd = s.sndCwnd + s.outstanding 855 if s.ep.sackPermitted { 856 s.state = SACKRecovery 857 s.ep.stack.Stats().TCP.SACKRecovery.Increment() 858 return 859 } 860 s.state = FastRecovery 861 s.ep.stack.Stats().TCP.FastRecovery.Increment() 862 } 863 864 func (s *sender) leaveFastRecovery() { 865 s.fr.active = false 866 s.fr.maxCwnd = 0 867 s.dupAckCount = 0 868 869 // Deflate cwnd. It had been artificially inflated when new dups arrived. 870 s.sndCwnd = s.sndSsthresh 871 872 s.cc.PostRecovery() 873 } 874 875 func (s *sender) handleFastRecovery(seg *segment) (rtx bool) { 876 ack := seg.ackNumber 877 // We are in fast recovery mode. Ignore the ack if it's out of 878 // range. 879 if !ack.InRange(s.sndUna, s.sndNxt+1) { 880 return false 881 } 882 883 // Leave fast recovery if it acknowledges all the data covered by 884 // this fast recovery session. 885 if s.fr.last.LessThan(ack) { 886 s.leaveFastRecovery() 887 return false 888 } 889 890 if s.ep.sackPermitted { 891 // When SACK is enabled we let retransmission be governed by 892 // the SACK logic. 893 return false 894 } 895 896 // Don't count this as a duplicate if it is carrying data or 897 // updating the window. 898 if seg.logicalLen() != 0 || s.sndWnd != seg.window { 899 return false 900 } 901 902 // Inflate the congestion window if we're getting duplicate acks 903 // for the packet we retransmitted. 904 if ack == s.fr.first { 905 // We received a dup, inflate the congestion window by 1 packet 906 // if we're not at the max yet. Only inflate the window if 907 // regular FastRecovery is in use, RFC6675 does not require 908 // inflating cwnd on duplicate ACKs. 909 if s.sndCwnd < s.fr.maxCwnd { 910 s.sndCwnd++ 911 } 912 return false 913 } 914 915 // A partial ack was received. Retransmit this packet and 916 // remember it so that we don't retransmit it again. We don't 917 // inflate the window because we're putting the same packet back 918 // onto the wire. 919 // 920 // N.B. The retransmit timer will be reset by the caller. 921 s.fr.first = ack 922 s.dupAckCount = 0 923 return true 924 } 925 926 // isAssignedSequenceNumber relies on the fact that we only set flags once a 927 // sequencenumber is assigned and that is only done right before we send the 928 // segment. As a result any segment that has a non-zero flag has a valid 929 // sequence number assigned to it. 930 func (s *sender) isAssignedSequenceNumber(seg *segment) bool { 931 return seg.flags != 0 932 } 933 934 // SetPipe implements the SetPipe() function described in RFC6675. Netstack 935 // maintains the congestion window in number of packets and not bytes, so 936 // SetPipe() here measures number of outstanding packets rather than actual 937 // outstanding bytes in the network. 938 func (s *sender) SetPipe() { 939 // If SACK isn't permitted or it is permitted but recovery is not active 940 // then ignore pipe calculations. 941 if !s.ep.sackPermitted || !s.fr.active { 942 return 943 } 944 pipe := 0 945 smss := seqnum.Size(s.ep.scoreboard.SMSS()) 946 for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { 947 // With GSO each segment can be much larger than SMSS. So check the segment 948 // in SMSS sized ranges. 949 segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size())) 950 for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { 951 endSeq := startSeq.Add(smss) 952 if segEnd.LessThan(endSeq) { 953 endSeq = segEnd 954 } 955 sb := header.SACKBlock{startSeq, endSeq} 956 // SetPipe(): 957 // 958 // After initializing pipe to zero, the following steps are 959 // taken for each octet 'S1' in the sequence space between 960 // HighACK and HighData that has not been SACKed: 961 if !s1.sequenceNumber.LessThan(s.sndNxt) { 962 break 963 } 964 if s.ep.scoreboard.IsSACKED(sb) { 965 continue 966 } 967 968 // SetPipe(): 969 // 970 // (a) If IsLost(S1) returns false, Pipe is incremened by 1. 971 // 972 // NOTE: here we mark the whole segment as lost. We do not try 973 // and test every byte in our write buffer as we maintain our 974 // pipe in terms of oustanding packets and not bytes. 975 if !s.ep.scoreboard.IsRangeLost(sb) { 976 pipe++ 977 } 978 // SetPipe(): 979 // (b) If S1 <= HighRxt, Pipe is incremented by 1. 980 if s1.sequenceNumber.LessThanEq(s.fr.highRxt) { 981 pipe++ 982 } 983 } 984 } 985 s.outstanding = pipe 986 } 987 988 // checkDuplicateAck is called when an ack is received. It manages the state 989 // related to duplicate acks and determines if a retransmit is needed according 990 // to the rules in RFC 6582 (NewReno). 991 func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { 992 ack := seg.ackNumber 993 if s.fr.active { 994 return s.handleFastRecovery(seg) 995 } 996 997 // We're not in fast recovery yet. A segment is considered a duplicate 998 // only if it doesn't carry any data and doesn't update the send window, 999 // because if it does, it wasn't sent in response to an out-of-order 1000 // segment. If SACK is enabled then we have an additional check to see 1001 // if the segment carries new SACK information. If it does then it is 1002 // considered a duplicate ACK as per RFC6675. 1003 if ack != s.sndUna || seg.logicalLen() != 0 || s.sndWnd != seg.window || ack == s.sndNxt { 1004 if !s.ep.sackPermitted || !seg.hasNewSACKInfo { 1005 s.dupAckCount = 0 1006 return false 1007 } 1008 } 1009 1010 s.dupAckCount++ 1011 1012 // Do not enter fast recovery until we reach nDupAckThreshold or the 1013 // first unacknowledged byte is considered lost as per SACK scoreboard. 1014 if s.dupAckCount < nDupAckThreshold || (s.ep.sackPermitted && !s.ep.scoreboard.IsLost(s.sndUna)) { 1015 // RFC 6675 Step 3. 1016 s.fr.highRxt = s.sndUna - 1 1017 // Do run SetPipe() to calculate the outstanding segments. 1018 s.SetPipe() 1019 s.state = Disorder 1020 return false 1021 } 1022 1023 // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 1024 // 1025 // We only do the check here, the incrementing of last to the highest 1026 // sequence number transmitted till now is done when enterFastRecovery 1027 // is invoked. 1028 if !s.fr.last.LessThan(seg.ackNumber) { 1029 s.dupAckCount = 0 1030 return false 1031 } 1032 s.cc.HandleNDupAcks() 1033 s.enterFastRecovery() 1034 s.dupAckCount = 0 1035 return true 1036 } 1037 1038 // handleRcvdSegment is called when a segment is received; it is responsible for 1039 // updating the send-related state. 1040 func (s *sender) handleRcvdSegment(seg *segment) { 1041 // Check if we can extract an RTT measurement from this ack. 1042 if !seg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(seg.ackNumber) { 1043 s.updateRTO(time.Now().Sub(s.rttMeasureTime)) 1044 s.rttMeasureSeqNum = s.sndNxt 1045 } 1046 1047 // Update Timestamp if required. See RFC7323, section-4.3. 1048 if s.ep.sendTSOk && seg.parsedOptions.TS { 1049 s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber) 1050 } 1051 1052 // Insert SACKBlock information into our scoreboard. 1053 if s.ep.sackPermitted { 1054 for _, sb := range seg.parsedOptions.SACKBlocks { 1055 // Only insert the SACK block if the following holds 1056 // true: 1057 // * SACK block acks data after the ack number in the 1058 // current segment. 1059 // * SACK block represents a sequence 1060 // between sndUna and sndNxt (i.e. data that is 1061 // currently unacked and in-flight). 1062 // * SACK block that has not been SACKed already. 1063 // 1064 // NOTE: This check specifically excludes DSACK blocks 1065 // which have start/end before sndUna and are used to 1066 // indicate spurious retransmissions. 1067 if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { 1068 s.ep.scoreboard.Insert(sb) 1069 seg.hasNewSACKInfo = true 1070 } 1071 } 1072 s.SetPipe() 1073 } 1074 1075 // Count the duplicates and do the fast retransmit if needed. 1076 rtx := s.checkDuplicateAck(seg) 1077 1078 // Stash away the current window size. 1079 s.sndWnd = seg.window 1080 1081 // Ignore ack if it doesn't acknowledge any new data. 1082 ack := seg.ackNumber 1083 if (ack - 1).InRange(s.sndUna, s.sndNxt) { 1084 s.dupAckCount = 0 1085 1086 // See : https://tools.ietf.org/html/rfc1323#section-3.3. 1087 // Specifically we should only update the RTO using TSEcr if the 1088 // following condition holds: 1089 // 1090 // A TSecr value received in a segment is used to update the 1091 // averaged RTT measurement only if the segment acknowledges 1092 // some new data, i.e., only if it advances the left edge of 1093 // the send window. 1094 if s.ep.sendTSOk && seg.parsedOptions.TSEcr != 0 { 1095 // TSVal/Ecr values sent by Netstack are at a millisecond 1096 // granularity. 1097 elapsed := time.Duration(s.ep.timestamp()-seg.parsedOptions.TSEcr) * time.Millisecond 1098 s.updateRTO(elapsed) 1099 } 1100 1101 // When an ack is received we must rearm the timer. 1102 // RFC 6298 5.2 1103 s.resendTimer.enable(s.rto) 1104 1105 // Remove all acknowledged data from the write list. 1106 acked := s.sndUna.Size(ack) 1107 s.sndUna = ack 1108 1109 ackLeft := acked 1110 originalOutstanding := s.outstanding 1111 for ackLeft > 0 { 1112 // We use logicalLen here because we can have FIN 1113 // segments (which are always at the end of list) that 1114 // have no data, but do consume a sequence number. 1115 seg := s.writeList.Front() 1116 datalen := seg.logicalLen() 1117 1118 if datalen > ackLeft { 1119 prevCount := s.pCount(seg) 1120 seg.data.TrimFront(int(ackLeft)) 1121 seg.sequenceNumber.UpdateForward(ackLeft) 1122 s.outstanding -= prevCount - s.pCount(seg) 1123 break 1124 } 1125 1126 if s.writeNext == seg { 1127 s.writeNext = seg.Next() 1128 } 1129 s.writeList.Remove(seg) 1130 1131 // if SACK is enabled then Only reduce outstanding if 1132 // the segment was not previously SACKED as these have 1133 // already been accounted for in SetPipe(). 1134 if !s.ep.sackPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { 1135 s.outstanding -= s.pCount(seg) 1136 } 1137 seg.decRef() 1138 ackLeft -= datalen 1139 } 1140 1141 // Update the send buffer usage and notify potential waiters. 1142 s.ep.updateSndBufferUsage(int(acked)) 1143 1144 // Clear SACK information for all acked data. 1145 s.ep.scoreboard.Delete(s.sndUna) 1146 1147 // If we are not in fast recovery then update the congestion 1148 // window based on the number of acknowledged packets. 1149 if !s.fr.active { 1150 s.cc.Update(originalOutstanding - s.outstanding) 1151 if s.fr.last.LessThan(s.sndUna) { 1152 s.state = Open 1153 } 1154 } 1155 1156 // It is possible for s.outstanding to drop below zero if we get 1157 // a retransmit timeout, reset outstanding to zero but later 1158 // get an ack that cover previously sent data. 1159 if s.outstanding < 0 { 1160 s.outstanding = 0 1161 } 1162 1163 s.SetPipe() 1164 1165 // If all outstanding data was acknowledged the disable the timer. 1166 // RFC 6298 Rule 5.3 1167 if s.sndUna == s.sndNxt { 1168 s.outstanding = 0 1169 s.resendTimer.disable() 1170 } 1171 } 1172 // Now that we've popped all acknowledged data from the retransmit 1173 // queue, retransmit if needed. 1174 if rtx { 1175 s.resendSegment() 1176 } 1177 1178 // Send more data now that some of the pending data has been ack'd, or 1179 // that the window opened up, or the congestion window was inflated due 1180 // to a duplicate ack during fast recovery. This will also re-enable 1181 // the retransmit timer if needed. 1182 if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || seg.hasNewSACKInfo { 1183 s.sendData() 1184 } 1185 } 1186 1187 // sendSegment sends the specified segment. 1188 func (s *sender) sendSegment(seg *segment) *tcpip.Error { 1189 if !seg.xmitTime.IsZero() { 1190 s.ep.stack.Stats().TCP.Retransmits.Increment() 1191 if s.sndCwnd < s.sndSsthresh { 1192 s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() 1193 } 1194 } 1195 seg.xmitTime = time.Now() 1196 return s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) 1197 } 1198 1199 // sendSegmentFromView sends a new segment containing the given payload, flags 1200 // and sequence number. 1201 func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq seqnum.Value) *tcpip.Error { 1202 s.lastSendTime = time.Now() 1203 if seq == s.rttMeasureSeqNum { 1204 s.rttMeasureTime = s.lastSendTime 1205 } 1206 1207 rcvNxt, rcvWnd := s.ep.rcv.getSendParams() 1208 1209 // Remember the max sent ack. 1210 s.maxSentAck = rcvNxt 1211 1212 // Every time a packet containing data is sent (including a 1213 // retransmission), if SACK is enabled then use the conservative timer 1214 // described in RFC6675 Section 4.0, otherwise follow the standard time 1215 // described in RFC6298 Section 5.2. 1216 if data.Size() != 0 { 1217 if s.ep.sackPermitted { 1218 s.resendTimer.enable(s.rto) 1219 } else { 1220 if !s.resendTimer.enabled() { 1221 s.resendTimer.enable(s.rto) 1222 } 1223 } 1224 } 1225 1226 return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) 1227 }