// Origin: github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/rack.go

// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"time"

	"github.com/SagerNet/gvisor/pkg/tcpip"
	"github.com/SagerNet/gvisor/pkg/tcpip/seqnum"
	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
)

const (
	// wcDelayedACKTimeout is the recommended maximum delayed ACK timer
	// value as defined in the RFC. It stands for worst case delayed ACK
	// timer (WCDelAckT). When FlightSize is 1, PTO is inflated by
	// WCDelAckT time to compensate for a potential long delayed ACK timer
	// at the receiver. In this file it is added to the probe timeout by
	// schedulePTO when s.Outstanding is 1.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.
	wcDelayedACKTimeout = 200 * time.Millisecond

	// tcpRACKRecoveryThreshold is the number of loss recoveries for which
	// the reorder window is inflated and after that the reorder window is
	// reset to its initial value of minRTT/4. updateRACKReorderWindow
	// loads it into ReoWndPersist whenever a DSACK is acted upon.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2.
	tcpRACKRecoveryThreshold = 16
)

// RACK is a loss detection algorithm used in TCP to detect packet loss and
// reordering using the transmission timestamps of packets instead of packet
// or sequence counts. To use RACK, SACK should be enabled on the connection.

// rackControl stores the rack related fields.
46 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1 47 // 48 // +stateify savable 49 type rackControl struct { 50 stack.TCPRACKState 51 52 // exitedRecovery indicates if the connection is exiting loss recovery. 53 // This flag is set if the sender is leaving the recovery after 54 // receiving an ACK and is reset during updating of reorder window. 55 exitedRecovery bool 56 57 // minRTT is the estimated minimum RTT of the connection. 58 minRTT time.Duration 59 60 // tlpRxtOut indicates whether there is an unacknowledged 61 // TLP retransmission. 62 tlpRxtOut bool 63 64 // tlpHighRxt the value of sender.sndNxt at the time of sending 65 // a TLP retransmission. 66 tlpHighRxt seqnum.Value 67 68 // snd is a reference to the sender. 69 snd *sender 70 } 71 72 // init initializes RACK specific fields. 73 func (rc *rackControl) init(snd *sender, iss seqnum.Value) { 74 rc.FACK = iss 75 rc.ReoWndIncr = 1 76 rc.snd = snd 77 } 78 79 // update will update the RACK related fields when an ACK has been received. 80 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2 81 func (rc *rackControl) update(seg *segment, ackSeg *segment) { 82 rtt := rc.snd.ep.stack.Clock().NowMonotonic().Sub(seg.xmitTime) 83 tsOffset := rc.snd.ep.TSOffset 84 85 // If the ACK is for a retransmitted packet, do not update if it is a 86 // spurious inference which is determined by below checks: 87 // 1. When Timestamping option is available, if the TSVal is less than 88 // the transmit time of the most recent retransmitted packet. 89 // 2. When RTT calculated for the packet is less than the smoothed RTT 90 // for the connection. 
91 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 92 // step 2 93 if seg.xmitCount > 1 { 94 if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { 95 if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, tsOffset) { 96 return 97 } 98 } 99 if rtt < rc.minRTT { 100 return 101 } 102 } 103 104 rc.RTT = rtt 105 106 // The sender can either track a simple global minimum of all RTT 107 // measurements from the connection, or a windowed min-filtered value 108 // of recent RTT measurements. This implementation keeps track of the 109 // simple global minimum of all RTTs for the connection. 110 if rtt < rc.minRTT || rc.minRTT == 0 { 111 rc.minRTT = rtt 112 } 113 114 // Update rc.xmitTime and rc.endSequence to the transmit time and 115 // ending sequence number of the packet which has been acknowledged 116 // most recently. 117 endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) 118 if rc.XmitTime.Before(seg.xmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) { 119 rc.XmitTime = seg.xmitTime 120 rc.EndSequence = endSeq 121 } 122 } 123 124 // detectReorder detects if packet reordering has been observed. 125 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 126 // * Step 3: Detect data segment reordering. 127 // To detect reordering, the sender looks for original data segments being 128 // delivered out of order. To detect such cases, the sender tracks the 129 // highest sequence selectively or cumulatively acknowledged in the RACK.fack 130 // variable. The name "fack" stands for the most "Forward ACK" (this term is 131 // adopted from [FACK]). If a never retransmitted segment that's below 132 // RACK.fack is (selectively or cumulatively) acknowledged, it has been 133 // delivered out of order. The sender sets RACK.reord to TRUE if such segment 134 // is identified. 
135 func (rc *rackControl) detectReorder(seg *segment) { 136 endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) 137 if rc.FACK.LessThan(endSeq) { 138 rc.FACK = endSeq 139 return 140 } 141 142 if endSeq.LessThan(rc.FACK) && seg.xmitCount == 1 { 143 rc.Reord = true 144 } 145 } 146 147 func (rc *rackControl) setDSACKSeen(dsackSeen bool) { 148 rc.DSACKSeen = dsackSeen 149 } 150 151 // shouldSchedulePTO dictates whether we should schedule a PTO or not. 152 // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. 153 func (s *sender) shouldSchedulePTO() bool { 154 // Schedule PTO only if RACK loss detection is enabled. 155 return s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 && 156 // The connection supports SACK. 157 s.ep.SACKPermitted && 158 // The connection is not in loss recovery. 159 (s.state != tcpip.RTORecovery && s.state != tcpip.SACKRecovery) && 160 // The connection has no SACKed sequences in the SACK scoreboard. 161 s.ep.scoreboard.Sacked() == 0 162 } 163 164 // schedulePTO schedules the probe timeout as defined in 165 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. 166 func (s *sender) schedulePTO() { 167 pto := time.Second 168 s.rtt.Lock() 169 if s.rtt.TCPRTTState.SRTTInited && s.rtt.TCPRTTState.SRTT > 0 { 170 pto = s.rtt.TCPRTTState.SRTT * 2 171 if s.Outstanding == 1 { 172 pto += wcDelayedACKTimeout 173 } 174 } 175 s.rtt.Unlock() 176 177 now := s.ep.stack.Clock().NowMonotonic() 178 if s.resendTimer.enabled() { 179 if now.Add(pto).After(s.resendTimer.target) { 180 pto = s.resendTimer.target.Sub(now) 181 } 182 s.resendTimer.disable() 183 } 184 185 s.probeTimer.enable(pto) 186 } 187 188 // probeTimerExpired is the same as TLP_send_probe() as defined in 189 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.2. 
190 func (s *sender) probeTimerExpired() tcpip.Error { 191 if !s.probeTimer.checkExpiration() { 192 return nil 193 } 194 195 var dataSent bool 196 if s.writeNext != nil && s.writeNext.xmitCount == 0 && s.Outstanding < s.SndCwnd { 197 dataSent = s.maybeSendSegment(s.writeNext, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd)) 198 if dataSent { 199 s.Outstanding += s.pCount(s.writeNext, s.MaxPayloadSize) 200 s.writeNext = s.writeNext.Next() 201 } 202 } 203 204 if !dataSent && !s.rc.tlpRxtOut { 205 var highestSeqXmit *segment 206 for highestSeqXmit = s.writeList.Front(); highestSeqXmit != nil; highestSeqXmit = highestSeqXmit.Next() { 207 if highestSeqXmit.xmitCount == 0 { 208 // Nothing in writeList is transmitted, no need to send a probe. 209 highestSeqXmit = nil 210 break 211 } 212 if highestSeqXmit.Next() == nil || highestSeqXmit.Next().xmitCount == 0 { 213 // Either everything in writeList has been transmitted or the next 214 // sequence has not been transmitted. Either way this is the highest 215 // sequence segment that was transmitted. 216 break 217 } 218 } 219 220 if highestSeqXmit != nil { 221 dataSent = s.maybeSendSegment(highestSeqXmit, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd)) 222 if dataSent { 223 s.rc.tlpRxtOut = true 224 s.rc.tlpHighRxt = s.SndNxt 225 } 226 } 227 } 228 229 // Whether or not the probe was sent, the sender must arm the resend timer, 230 // not the probe timer. This ensures that the sender does not send repeated, 231 // back-to-back tail loss probes. 232 s.postXmit(dataSent, false /* shouldScheduleProbe */) 233 return nil 234 } 235 236 // detectTLPRecovery detects if recovery was accomplished by the loss probes 237 // and updates TLP state accordingly. 238 // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.3. 239 func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) { 240 if !(s.ep.SACKPermitted && s.rc.tlpRxtOut) { 241 return 242 } 243 244 // Step 1. 
245 if s.isDupAck(rcvdSeg) && ack == s.rc.tlpHighRxt { 246 var sbAboveTLPHighRxt bool 247 for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { 248 if s.rc.tlpHighRxt.LessThan(sb.End) { 249 sbAboveTLPHighRxt = true 250 break 251 } 252 } 253 if !sbAboveTLPHighRxt { 254 // TLP episode is complete. 255 s.rc.tlpRxtOut = false 256 } 257 } 258 259 if s.rc.tlpRxtOut && s.rc.tlpHighRxt.LessThanEq(ack) { 260 // TLP episode is complete. 261 s.rc.tlpRxtOut = false 262 if !checkDSACK(rcvdSeg) { 263 // Step 2. Either the original packet or the retransmission (in the 264 // form of a probe) was lost. Invoke a congestion control response 265 // equivalent to fast recovery. 266 s.cc.HandleLossDetected() 267 s.enterRecovery() 268 s.leaveRecovery() 269 } 270 } 271 } 272 273 // updateRACKReorderWindow updates the reorder window. 274 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 275 // * Step 4: Update RACK reordering window 276 // To handle the prevalent small degree of reordering, RACK.reo_wnd serves as 277 // an allowance for settling time before marking a packet lost. RACK starts 278 // initially with a conservative window of min_RTT/4. If no reordering has 279 // been observed RACK uses reo_wnd of zero during loss recovery, in order to 280 // retransmit quickly, or when the number of DUPACKs exceeds the classic 281 // DUPACKthreshold. 282 func (rc *rackControl) updateRACKReorderWindow() { 283 dsackSeen := rc.DSACKSeen 284 snd := rc.snd 285 286 // React to DSACK once per round trip. 
287 // If SND.UNA < RACK.rtt_seq: 288 // RACK.dsack = false 289 if snd.SndUna.LessThan(rc.RTTSeq) { 290 dsackSeen = false 291 } 292 293 // If RACK.dsack: 294 // RACK.reo_wnd_incr += 1 295 // RACK.dsack = false 296 // RACK.rtt_seq = SND.NXT 297 // RACK.reo_wnd_persist = 16 298 if dsackSeen { 299 rc.ReoWndIncr++ 300 dsackSeen = false 301 rc.RTTSeq = snd.SndNxt 302 rc.ReoWndPersist = tcpRACKRecoveryThreshold 303 } else if rc.exitedRecovery { 304 // Else if exiting loss recovery: 305 // RACK.reo_wnd_persist -= 1 306 // If RACK.reo_wnd_persist <= 0: 307 // RACK.reo_wnd_incr = 1 308 rc.ReoWndPersist-- 309 if rc.ReoWndPersist <= 0 { 310 rc.ReoWndIncr = 1 311 } 312 rc.exitedRecovery = false 313 } 314 315 // Reorder window is zero during loss recovery, or when the number of 316 // DUPACKs exceeds the classic DUPACKthreshold. 317 // If RACK.reord is FALSE: 318 // If in loss recovery: (If in fast or timeout recovery) 319 // RACK.reo_wnd = 0 320 // Return 321 // Else if RACK.pkts_sacked >= RACK.dupthresh: 322 // RACK.reo_wnd = 0 323 // return 324 if !rc.Reord { 325 if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery { 326 rc.ReoWnd = 0 327 return 328 } 329 330 if snd.SackedOut >= nDupAckThreshold { 331 rc.ReoWnd = 0 332 return 333 } 334 } 335 336 // Calculate reorder window. 337 // RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr 338 // RACK.reo_wnd = min(RACK.reo_wnd, SRTT) 339 snd.rtt.Lock() 340 srtt := snd.rtt.TCPRTTState.SRTT 341 snd.rtt.Unlock() 342 rc.ReoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.ReoWndIncr)) 343 if srtt < rc.ReoWnd { 344 rc.ReoWnd = srtt 345 } 346 } 347 348 func (rc *rackControl) exitRecovery() { 349 rc.exitedRecovery = true 350 } 351 352 // detectLoss marks the segment as lost if the reordering window has elapsed 353 // and the ACK is not received. It will also arm the reorder timer. 354 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 Step 5. 
355 func (rc *rackControl) detectLoss(rcvTime tcpip.MonotonicTime) int { 356 var timeout time.Duration 357 numLost := 0 358 for seg := rc.snd.writeList.Front(); seg != nil && seg.xmitCount != 0; seg = seg.Next() { 359 if rc.snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { 360 continue 361 } 362 363 if seg.lost && seg.xmitCount == 1 { 364 numLost++ 365 continue 366 } 367 368 endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) 369 if seg.xmitTime.Before(rc.XmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) { 370 timeRemaining := seg.xmitTime.Sub(rcvTime) + rc.RTT + rc.ReoWnd 371 if timeRemaining <= 0 { 372 seg.lost = true 373 numLost++ 374 } else if timeRemaining > timeout { 375 timeout = timeRemaining 376 } 377 } 378 } 379 380 if timeout != 0 && !rc.snd.reorderTimer.enabled() { 381 rc.snd.reorderTimer.enable(timeout) 382 } 383 return numLost 384 } 385 386 // reorderTimerExpired will retransmit the segments which have not been acked 387 // before the reorder timer expired. 388 func (rc *rackControl) reorderTimerExpired() tcpip.Error { 389 // Check if the timer actually expired or if it's a spurious wake due 390 // to a previously orphaned runtime timer. 391 if !rc.snd.reorderTimer.checkExpiration() { 392 return nil 393 } 394 395 numLost := rc.detectLoss(rc.snd.ep.stack.Clock().NowMonotonic()) 396 if numLost == 0 { 397 return nil 398 } 399 400 fastRetransmit := false 401 if !rc.snd.FastRecovery.Active { 402 rc.snd.cc.HandleLossDetected() 403 rc.snd.enterRecovery() 404 fastRetransmit = true 405 } 406 407 rc.DoRecovery(nil, fastRetransmit) 408 return nil 409 } 410 411 // DoRecovery implements lossRecovery.DoRecovery. 412 func (rc *rackControl) DoRecovery(_ *segment, fastRetransmit bool) { 413 snd := rc.snd 414 if fastRetransmit { 415 snd.resendSegment() 416 } 417 418 var dataSent bool 419 // Iterate the writeList and retransmit the segments which are marked 420 // as lost by RACK. 
421 for seg := snd.writeList.Front(); seg != nil && seg.xmitCount > 0; seg = seg.Next() { 422 if seg == snd.writeNext { 423 break 424 } 425 426 if !seg.lost { 427 continue 428 } 429 430 // Reset seg.lost as it is already SACKed. 431 if snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { 432 seg.lost = false 433 continue 434 } 435 436 // Check the congestion window after entering recovery. 437 if snd.Outstanding >= snd.SndCwnd { 438 break 439 } 440 441 if sent := snd.maybeSendSegment(seg, int(snd.ep.scoreboard.SMSS()), snd.SndUna.Add(snd.SndWnd)); !sent { 442 break 443 } 444 dataSent = true 445 snd.Outstanding += snd.pCount(seg, snd.MaxPayloadSize) 446 } 447 448 snd.postXmit(dataSent, true /* shouldScheduleProbe */) 449 }