github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/transport/tcp/rack.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "time" 19 20 "github.com/sagernet/gvisor/pkg/tcpip" 21 "github.com/sagernet/gvisor/pkg/tcpip/seqnum" 22 "github.com/sagernet/gvisor/pkg/tcpip/stack" 23 ) 24 25 const ( 26 // wcDelayedACKTimeout is the recommended maximum delayed ACK timer 27 // value as defined in the RFC. It stands for worst case delayed ACK 28 // timer (WCDelAckT). When FlightSize is 1, PTO is inflated by 29 // WCDelAckT time to compensate for a potential long delayed ACK timer 30 // at the receiver. 31 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5. 32 wcDelayedACKTimeout = 200 * time.Millisecond 33 34 // tcpRACKRecoveryThreshold is the number of loss recoveries for which 35 // the reorder window is inflated and after that the reorder window is 36 // reset to its initial value of minRTT/4. 37 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2. 38 tcpRACKRecoveryThreshold = 16 39 ) 40 41 // RACK is a loss detection algorithm used in TCP to detect packet loss and 42 // reordering using transmission timestamp of the packets instead of packet or 43 // sequence counts. To use RACK, SACK should be enabled on the connection. 44 45 // rackControl stores the rack related fields. 46 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1 47 // 48 // +stateify savable 49 type rackControl struct { 50 stack.TCPRACKState 51 52 // exitedRecovery indicates if the connection is exiting loss recovery. 53 // This flag is set if the sender is leaving the recovery after 54 // receiving an ACK and is reset during updating of reorder window. 55 exitedRecovery bool 56 57 // minRTT is the estimated minimum RTT of the connection. 58 minRTT time.Duration 59 60 // tlpRxtOut indicates whether there is an unacknowledged 61 // TLP retransmission. 62 tlpRxtOut bool 63 64 // tlpHighRxt the value of sender.sndNxt at the time of sending 65 // a TLP retransmission. 66 tlpHighRxt seqnum.Value 67 68 // snd is a reference to the sender. 69 snd *sender 70 } 71 72 // init initializes RACK specific fields. 73 func (rc *rackControl) init(snd *sender, iss seqnum.Value) { 74 rc.FACK = iss 75 rc.ReoWndIncr = 1 76 rc.snd = snd 77 } 78 79 // update will update the RACK related fields when an ACK has been received. 80 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2 81 func (rc *rackControl) update(seg *segment, ackSeg *segment) { 82 rtt := rc.snd.ep.stack.Clock().NowMonotonic().Sub(seg.xmitTime) 83 84 // If the ACK is for a retransmitted packet, do not update if it is a 85 // spurious inference which is determined by below checks: 86 // 1. When Timestamping option is available, if the TSVal is less than 87 // the transmit time of the most recent retransmitted packet. 88 // 2. When RTT calculated for the packet is less than the smoothed RTT 89 // for the connection. 90 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 91 // step 2 92 if seg.xmitCount > 1 { 93 if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { 94 if ackSeg.parsedOptions.TSEcr < rc.snd.ep.tsVal(seg.xmitTime) { 95 return 96 } 97 } 98 if rtt < rc.minRTT { 99 return 100 } 101 } 102 103 rc.RTT = rtt 104 105 // The sender can either track a simple global minimum of all RTT 106 // measurements from the connection, or a windowed min-filtered value 107 // of recent RTT measurements. This implementation keeps track of the 108 // simple global minimum of all RTTs for the connection. 109 if rtt < rc.minRTT || rc.minRTT == 0 { 110 rc.minRTT = rtt 111 } 112 113 // Update rc.xmitTime and rc.endSequence to the transmit time and 114 // ending sequence number of the packet which has been acknowledged 115 // most recently. 116 endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) 117 if rc.XmitTime.Before(seg.xmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) { 118 rc.XmitTime = seg.xmitTime 119 rc.EndSequence = endSeq 120 } 121 } 122 123 // detectReorder detects if packet reordering has been observed. 124 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 125 // - Step 3: Detect data segment reordering. 126 // To detect reordering, the sender looks for original data segments being 127 // delivered out of order. To detect such cases, the sender tracks the 128 // highest sequence selectively or cumulatively acknowledged in the RACK.fack 129 // variable. The name "fack" stands for the most "Forward ACK" (this term is 130 // adopted from [FACK]). If a never retransmitted segment that's below 131 // RACK.fack is (selectively or cumulatively) acknowledged, it has been 132 // delivered out of order. The sender sets RACK.reord to TRUE if such segment 133 // is identified. 134 func (rc *rackControl) detectReorder(seg *segment) { 135 endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) 136 if rc.FACK.LessThan(endSeq) { 137 rc.FACK = endSeq 138 return 139 } 140 141 if endSeq.LessThan(rc.FACK) && seg.xmitCount == 1 { 142 rc.Reord = true 143 } 144 } 145 146 func (rc *rackControl) setDSACKSeen(dsackSeen bool) { 147 rc.DSACKSeen = dsackSeen 148 } 149 150 // shouldSchedulePTO dictates whether we should schedule a PTO or not. 151 // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. 152 func (s *sender) shouldSchedulePTO() bool { 153 // Schedule PTO only if RACK loss detection is enabled. 154 return s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 && 155 // The connection supports SACK. 156 s.ep.SACKPermitted && 157 // The connection is not in loss recovery. 158 (s.state != tcpip.RTORecovery && s.state != tcpip.SACKRecovery) && 159 // The connection has no SACKed sequences in the SACK scoreboard. 160 s.ep.scoreboard.Sacked() == 0 161 } 162 163 // schedulePTO schedules the probe timeout as defined in 164 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. 165 func (s *sender) schedulePTO() { 166 pto := time.Second 167 s.rtt.Lock() 168 if s.rtt.TCPRTTState.SRTTInited && s.rtt.TCPRTTState.SRTT > 0 { 169 pto = s.rtt.TCPRTTState.SRTT * 2 170 if s.Outstanding == 1 { 171 pto += wcDelayedACKTimeout 172 } 173 } 174 s.rtt.Unlock() 175 176 now := s.ep.stack.Clock().NowMonotonic() 177 if s.resendTimer.enabled() { 178 if now.Add(pto).After(s.resendTimer.target) { 179 pto = s.resendTimer.target.Sub(now) 180 } 181 s.resendTimer.disable() 182 } 183 184 s.probeTimer.enable(pto) 185 } 186 187 // probeTimerExpired is the same as TLP_send_probe() as defined in 188 // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.2. 189 // 190 // +checklocks:s.ep.mu 191 func (s *sender) probeTimerExpired() tcpip.Error { 192 if s.probeTimer.isUninitialized() || !s.probeTimer.checkExpiration() { 193 return nil 194 } 195 196 var dataSent bool 197 if s.writeNext != nil && s.writeNext.xmitCount == 0 && s.Outstanding < s.SndCwnd { 198 dataSent = s.maybeSendSegment(s.writeNext, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd)) 199 if dataSent { 200 s.Outstanding += s.pCount(s.writeNext, s.MaxPayloadSize) 201 s.updateWriteNext(s.writeNext.Next()) 202 } 203 } 204 205 if !dataSent && !s.rc.tlpRxtOut { 206 var highestSeqXmit *segment 207 for highestSeqXmit = s.writeList.Front(); highestSeqXmit != nil; highestSeqXmit = highestSeqXmit.Next() { 208 if highestSeqXmit.xmitCount == 0 { 209 // Nothing in writeList is transmitted, no need to send a probe. 210 highestSeqXmit = nil 211 break 212 } 213 if highestSeqXmit.Next() == nil || highestSeqXmit.Next().xmitCount == 0 { 214 // Either everything in writeList has been transmitted or the next 215 // sequence has not been transmitted. Either way this is the highest 216 // sequence segment that was transmitted. 217 break 218 } 219 } 220 221 if highestSeqXmit != nil { 222 dataSent = s.maybeSendSegment(highestSeqXmit, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd)) 223 if dataSent { 224 s.rc.tlpRxtOut = true 225 s.rc.tlpHighRxt = s.SndNxt 226 } 227 } 228 } 229 230 // Whether or not the probe was sent, the sender must arm the resend timer, 231 // not the probe timer. This ensures that the sender does not send repeated, 232 // back-to-back tail loss probes. 233 s.postXmit(dataSent, false /* shouldScheduleProbe */) 234 return nil 235 } 236 237 // detectTLPRecovery detects if recovery was accomplished by the loss probes 238 // and updates TLP state accordingly. 239 // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.3. 240 func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) { 241 if !(s.ep.SACKPermitted && s.rc.tlpRxtOut) { 242 return 243 } 244 245 // Step 1. 246 if s.isDupAck(rcvdSeg) && ack == s.rc.tlpHighRxt { 247 var sbAboveTLPHighRxt bool 248 for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { 249 if s.rc.tlpHighRxt.LessThan(sb.End) { 250 sbAboveTLPHighRxt = true 251 break 252 } 253 } 254 if !sbAboveTLPHighRxt { 255 // TLP episode is complete. 256 s.rc.tlpRxtOut = false 257 } 258 } 259 260 if s.rc.tlpRxtOut && s.rc.tlpHighRxt.LessThanEq(ack) { 261 // TLP episode is complete. 262 s.rc.tlpRxtOut = false 263 if !checkDSACK(rcvdSeg) { 264 // Step 2. Either the original packet or the retransmission (in the 265 // form of a probe) was lost. Invoke a congestion control response 266 // equivalent to fast recovery. 267 s.cc.HandleLossDetected() 268 s.enterRecovery() 269 s.leaveRecovery() 270 } 271 } 272 } 273 274 // updateRACKReorderWindow updates the reorder window. 275 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 276 // - Step 4: Update RACK reordering window 277 // To handle the prevalent small degree of reordering, RACK.reo_wnd serves as 278 // an allowance for settling time before marking a packet lost. RACK starts 279 // initially with a conservative window of min_RTT/4. If no reordering has 280 // been observed RACK uses reo_wnd of zero during loss recovery, in order to 281 // retransmit quickly, or when the number of DUPACKs exceeds the classic 282 // DUPACKthreshold. 283 func (rc *rackControl) updateRACKReorderWindow() { 284 dsackSeen := rc.DSACKSeen 285 snd := rc.snd 286 287 // React to DSACK once per round trip. 288 // If SND.UNA < RACK.rtt_seq: 289 // RACK.dsack = false 290 if snd.SndUna.LessThan(rc.RTTSeq) { 291 dsackSeen = false 292 } 293 294 // If RACK.dsack: 295 // RACK.reo_wnd_incr += 1 296 // RACK.dsack = false 297 // RACK.rtt_seq = SND.NXT 298 // RACK.reo_wnd_persist = 16 299 if dsackSeen { 300 rc.ReoWndIncr++ 301 dsackSeen = false 302 rc.RTTSeq = snd.SndNxt 303 rc.ReoWndPersist = tcpRACKRecoveryThreshold 304 } else if rc.exitedRecovery { 305 // Else if exiting loss recovery: 306 // RACK.reo_wnd_persist -= 1 307 // If RACK.reo_wnd_persist <= 0: 308 // RACK.reo_wnd_incr = 1 309 rc.ReoWndPersist-- 310 if rc.ReoWndPersist <= 0 { 311 rc.ReoWndIncr = 1 312 } 313 rc.exitedRecovery = false 314 } 315 316 // Reorder window is zero during loss recovery, or when the number of 317 // DUPACKs exceeds the classic DUPACKthreshold. 318 // If RACK.reord is FALSE: 319 // If in loss recovery: (If in fast or timeout recovery) 320 // RACK.reo_wnd = 0 321 // Return 322 // Else if RACK.pkts_sacked >= RACK.dupthresh: 323 // RACK.reo_wnd = 0 324 // return 325 if !rc.Reord { 326 if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery { 327 rc.ReoWnd = 0 328 return 329 } 330 331 if snd.SackedOut >= nDupAckThreshold { 332 rc.ReoWnd = 0 333 return 334 } 335 } 336 337 // Calculate reorder window. 338 // RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr 339 // RACK.reo_wnd = min(RACK.reo_wnd, SRTT) 340 snd.rtt.Lock() 341 srtt := snd.rtt.TCPRTTState.SRTT 342 snd.rtt.Unlock() 343 rc.ReoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.ReoWndIncr)) 344 if srtt < rc.ReoWnd { 345 rc.ReoWnd = srtt 346 } 347 } 348 349 func (rc *rackControl) exitRecovery() { 350 rc.exitedRecovery = true 351 } 352 353 // detectLoss marks the segment as lost if the reordering window has elapsed 354 // and the ACK is not received. It will also arm the reorder timer. 355 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 Step 5. 356 func (rc *rackControl) detectLoss(rcvTime tcpip.MonotonicTime) int { 357 var timeout time.Duration 358 numLost := 0 359 for seg := rc.snd.writeList.Front(); seg != nil && seg.xmitCount != 0; seg = seg.Next() { 360 if rc.snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { 361 continue 362 } 363 364 if seg.lost && seg.xmitCount == 1 { 365 numLost++ 366 continue 367 } 368 369 endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) 370 if seg.xmitTime.Before(rc.XmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) { 371 timeRemaining := seg.xmitTime.Sub(rcvTime) + rc.RTT + rc.ReoWnd 372 if timeRemaining <= 0 { 373 seg.lost = true 374 numLost++ 375 } else if timeRemaining > timeout { 376 timeout = timeRemaining 377 } 378 } 379 } 380 381 if timeout != 0 && !rc.snd.reorderTimer.enabled() { 382 rc.snd.reorderTimer.enable(timeout) 383 } 384 return numLost 385 } 386 387 // reorderTimerExpired will retransmit the segments which have not been acked 388 // before the reorder timer expired. 389 // 390 // +checklocks:rc.snd.ep.mu 391 func (rc *rackControl) reorderTimerExpired() tcpip.Error { 392 if rc.snd.reorderTimer.isUninitialized() || !rc.snd.reorderTimer.checkExpiration() { 393 return nil 394 } 395 396 numLost := rc.detectLoss(rc.snd.ep.stack.Clock().NowMonotonic()) 397 if numLost == 0 { 398 return nil 399 } 400 401 fastRetransmit := false 402 if !rc.snd.FastRecovery.Active { 403 rc.snd.cc.HandleLossDetected() 404 rc.snd.enterRecovery() 405 fastRetransmit = true 406 } 407 408 rc.DoRecovery(nil, fastRetransmit) 409 return nil 410 } 411 412 // DoRecovery implements lossRecovery.DoRecovery. 413 // 414 // +checklocks:rc.snd.ep.mu 415 func (rc *rackControl) DoRecovery(_ *segment, fastRetransmit bool) { 416 snd := rc.snd 417 if fastRetransmit { 418 snd.resendSegment() 419 } 420 421 var dataSent bool 422 // Iterate the writeList and retransmit the segments which are marked 423 // as lost by RACK. 424 for seg := snd.writeList.Front(); seg != nil && seg.xmitCount > 0; seg = seg.Next() { 425 if seg == snd.writeNext { 426 break 427 } 428 429 if !seg.lost { 430 continue 431 } 432 433 // Reset seg.lost as it is already SACKed. 434 if snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { 435 seg.lost = false 436 continue 437 } 438 439 // Check the congestion window after entering recovery. 440 if snd.Outstanding >= snd.SndCwnd { 441 break 442 } 443 444 if sent := snd.maybeSendSegment(seg, int(snd.ep.scoreboard.SMSS()), snd.SndUna.Add(snd.SndWnd)); !sent { 445 break 446 } 447 dataSent = true 448 snd.Outstanding += snd.pCount(seg, snd.MaxPayloadSize) 449 } 450 451 snd.postXmit(dataSent, true /* shouldScheduleProbe */) 452 }