github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/stack/tcp.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package stack 16 17 import ( 18 "context" 19 "time" 20 21 "github.com/metacubex/gvisor/pkg/atomicbitops" 22 "github.com/metacubex/gvisor/pkg/tcpip" 23 "github.com/metacubex/gvisor/pkg/tcpip/header" 24 "github.com/metacubex/gvisor/pkg/tcpip/internal/tcp" 25 "github.com/metacubex/gvisor/pkg/tcpip/seqnum" 26 ) 27 28 // contextID is this package's type for context.Context.Value keys. 29 type contextID int 30 31 const ( 32 // CtxRestoreStack is a Context.Value key for the stack to be used in restore. 33 CtxRestoreStack contextID = iota 34 ) 35 36 // RestoreStackFromContext returns the stack to be used during restore. 37 func RestoreStackFromContext(ctx context.Context) *Stack { 38 return ctx.Value(CtxRestoreStack).(*Stack) 39 } 40 41 // TCPProbeFunc is the expected function type for a TCP probe function to be 42 // passed to stack.AddTCPProbe. 43 type TCPProbeFunc func(s *TCPEndpointState) 44 45 // TCPCubicState is used to hold a copy of the internal cubic state when the 46 // TCPProbeFunc is invoked. 47 // 48 // +stateify savable 49 type TCPCubicState struct { 50 // WLastMax is the previous wMax value. 51 WLastMax float64 52 53 // WMax is the value of the congestion window at the time of the last 54 // congestion event. 55 WMax float64 56 57 // T is the time when the current congestion avoidance was entered. 58 T tcpip.MonotonicTime 59 60 // TimeSinceLastCongestion denotes the time since the current 61 // congestion avoidance was entered. 62 TimeSinceLastCongestion time.Duration 63 64 // C is the cubic constant as specified in RFC8312, page 11. 65 C float64 66 67 // K is the time period (in seconds) that the above function takes to 68 // increase the current window size to WMax if there are no further 69 // congestion events and is calculated using the following equation: 70 // 71 // K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5) 72 K float64 73 74 // Beta is the CUBIC multiplication decrease factor. That is, when a 75 // congestion event is detected, CUBIC reduces its cwnd to 76 // WC(0)=WMax*beta_cubic. 77 Beta float64 78 79 // WC is window computed by CUBIC at time TimeSinceLastCongestion. It's 80 // calculated using the formula: 81 // 82 // WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1) 83 WC float64 84 85 // WEst is the window computed by CUBIC at time 86 // TimeSinceLastCongestion+RTT i.e WC(TimeSinceLastCongestion+RTT). 87 WEst float64 88 } 89 90 // TCPRACKState is used to hold a copy of the internal RACK state when the 91 // TCPProbeFunc is invoked. 92 // 93 // +stateify savable 94 type TCPRACKState struct { 95 // XmitTime is the transmission timestamp of the most recent 96 // acknowledged segment. 97 XmitTime tcpip.MonotonicTime 98 99 // EndSequence is the ending TCP sequence number of the most recent 100 // acknowledged segment. 101 EndSequence seqnum.Value 102 103 // FACK is the highest selectively or cumulatively acknowledged 104 // sequence. 105 FACK seqnum.Value 106 107 // RTT is the round trip time of the most recently delivered packet on 108 // the connection (either cumulatively acknowledged or selectively 109 // acknowledged) that was not marked invalid as a possible spurious 110 // retransmission. 111 RTT time.Duration 112 113 // Reord is true iff reordering has been detected on this connection. 114 Reord bool 115 116 // DSACKSeen is true iff the connection has seen a DSACK. 117 DSACKSeen bool 118 119 // ReoWnd is the reordering window time used for recording packet 120 // transmission times. It is used to defer the moment at which RACK 121 // marks a packet lost. 122 ReoWnd time.Duration 123 124 // ReoWndIncr is the multiplier applied to adjust reorder window. 125 ReoWndIncr uint8 126 127 // ReoWndPersist is the number of loss recoveries before resetting 128 // reorder window. 129 ReoWndPersist int8 130 131 // RTTSeq is the SND.NXT when RTT is updated. 132 RTTSeq seqnum.Value 133 } 134 135 // TCPEndpointID is the unique 4 tuple that identifies a given endpoint. 136 // 137 // +stateify savable 138 type TCPEndpointID struct { 139 // LocalPort is the local port associated with the endpoint. 140 LocalPort uint16 141 142 // LocalAddress is the local [network layer] address associated with 143 // the endpoint. 144 LocalAddress tcpip.Address 145 146 // RemotePort is the remote port associated with the endpoint. 147 RemotePort uint16 148 149 // RemoteAddress it the remote [network layer] address associated with 150 // the endpoint. 151 RemoteAddress tcpip.Address 152 } 153 154 // TCPFastRecoveryState holds a copy of the internal fast recovery state of a 155 // TCP endpoint. 156 // 157 // +stateify savable 158 type TCPFastRecoveryState struct { 159 // Active if true indicates the endpoint is in fast recovery. The 160 // following fields are only meaningful when Active is true. 161 Active bool 162 163 // First is the first unacknowledged sequence number being recovered. 164 First seqnum.Value 165 166 // Last is the 'recover' sequence number that indicates the point at 167 // which we should exit recovery barring any timeouts etc. 168 Last seqnum.Value 169 170 // MaxCwnd is the maximum value we are permitted to grow the congestion 171 // window during recovery. This is set at the time we enter recovery. 172 // It exists to avoid attacks where the receiver intentionally sends 173 // duplicate acks to artificially inflate the sender's cwnd. 174 MaxCwnd int 175 176 // HighRxt is the highest sequence number which has been retransmitted 177 // during the current loss recovery phase. See: RFC 6675 Section 2 for 178 // details. 179 HighRxt seqnum.Value 180 181 // RescueRxt is the highest sequence number which has been 182 // optimistically retransmitted to prevent stalling of the ACK clock 183 // when there is loss at the end of the window and no new data is 184 // available for transmission. See: RFC 6675 Section 2 for details. 185 RescueRxt seqnum.Value 186 } 187 188 // TCPReceiverState holds a copy of the internal state of the receiver for a 189 // given TCP endpoint. 190 // 191 // +stateify savable 192 type TCPReceiverState struct { 193 // RcvNxt is the TCP variable RCV.NXT. 194 RcvNxt seqnum.Value 195 196 // RcvAcc is one beyond the last acceptable sequence number. That is, 197 // the "largest" sequence value that the receiver has announced to its 198 // peer that it's willing to accept. This may be different than RcvNxt 199 // + (last advertised receive window) if the receive window is reduced; 200 // in that case we have to reduce the window as we receive more data 201 // instead of shrinking it. 202 RcvAcc seqnum.Value 203 204 // RcvWndScale is the window scaling to use for inbound segments. 205 RcvWndScale uint8 206 207 // PendingBufUsed is the number of bytes pending in the receive queue. 208 PendingBufUsed int 209 } 210 211 // TCPRTTState holds a copy of information about the endpoint's round trip 212 // time. 213 // 214 // +stateify savable 215 type TCPRTTState struct { 216 // SRTT is the smoothed round trip time defined in section 2 of RFC 217 // 6298. 218 SRTT time.Duration 219 220 // RTTVar is the round-trip time variation as defined in section 2 of 221 // RFC 6298. 222 RTTVar time.Duration 223 224 // SRTTInited if true indicates that a valid RTT measurement has been 225 // completed. 226 SRTTInited bool 227 } 228 229 // TCPSenderState holds a copy of the internal state of the sender for a given 230 // TCP Endpoint. 231 // 232 // +stateify savable 233 type TCPSenderState struct { 234 // LastSendTime is the timestamp at which we sent the last segment. 235 LastSendTime tcpip.MonotonicTime 236 237 // DupAckCount is the number of Duplicate ACKs received. It is used for 238 // fast retransmit. 239 DupAckCount int 240 241 // SndCwnd is the size of the sending congestion window in packets. 242 SndCwnd int 243 244 // Ssthresh is the threshold between slow start and congestion 245 // avoidance. 246 Ssthresh int 247 248 // SndCAAckCount is the number of packets acknowledged during 249 // congestion avoidance. When enough packets have been ack'd (typically 250 // cwnd packets), the congestion window is incremented by one. 251 SndCAAckCount int 252 253 // Outstanding is the number of packets that have been sent but not yet 254 // acknowledged. 255 Outstanding int 256 257 // SackedOut is the number of packets which have been selectively 258 // acked. 259 SackedOut int 260 261 // SndWnd is the send window size in bytes. 262 SndWnd seqnum.Size 263 264 // SndUna is the next unacknowledged sequence number. 265 SndUna seqnum.Value 266 267 // SndNxt is the sequence number of the next segment to be sent. 268 SndNxt seqnum.Value 269 270 // RTTMeasureSeqNum is the sequence number being used for the latest 271 // RTT measurement. 272 RTTMeasureSeqNum seqnum.Value 273 274 // RTTMeasureTime is the time when the RTTMeasureSeqNum was sent. 275 RTTMeasureTime tcpip.MonotonicTime 276 277 // Closed indicates that the caller has closed the endpoint for 278 // sending. 279 Closed bool 280 281 // RTO is the retransmit timeout as defined in section of 2 of RFC 282 // 6298. 283 RTO time.Duration 284 285 // RTTState holds information about the endpoint's round trip time. 286 RTTState TCPRTTState 287 288 // MaxPayloadSize is the maximum size of the payload of a given 289 // segment. It is initialized on demand. 290 MaxPayloadSize int 291 292 // SndWndScale is the number of bits to shift left when reading the 293 // send window size from a segment. 294 SndWndScale uint8 295 296 // MaxSentAck is the highest acknowledgement number sent till now. 297 MaxSentAck seqnum.Value 298 299 // FastRecovery holds the fast recovery state for the endpoint. 300 FastRecovery TCPFastRecoveryState 301 302 // Cubic holds the state related to CUBIC congestion control. 303 Cubic TCPCubicState 304 305 // RACKState holds the state related to RACK loss detection algorithm. 306 RACKState TCPRACKState 307 308 // RetransmitTS records the timestamp used to detect spurious recovery. 309 RetransmitTS uint32 310 311 // SpuriousRecovery indicates if the sender entered recovery spuriously. 312 SpuriousRecovery bool 313 } 314 315 // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint. 316 // 317 // +stateify savable 318 type TCPSACKInfo struct { 319 // Blocks is the list of SACK Blocks that identify the out of order 320 // segments held by a given TCP endpoint. 321 Blocks []header.SACKBlock 322 323 // ReceivedBlocks are the SACK blocks received by this endpoint from 324 // the peer endpoint. 325 ReceivedBlocks []header.SACKBlock 326 327 // MaxSACKED is the highest sequence number that has been SACKED by the 328 // peer. 329 MaxSACKED seqnum.Value 330 } 331 332 // RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning. 333 // 334 // +stateify savable 335 type RcvBufAutoTuneParams struct { 336 // MeasureTime is the time at which the current measurement was 337 // started. 338 MeasureTime tcpip.MonotonicTime 339 340 // CopiedBytes is the number of bytes copied to user space since this 341 // measure began. 342 CopiedBytes int 343 344 // PrevCopiedBytes is the number of bytes copied to userspace in the 345 // previous RTT period. 346 PrevCopiedBytes int 347 348 // RcvBufSize is the auto tuned receive buffer size. 349 RcvBufSize int 350 351 // RTT is the smoothed RTT as measured by observing the time between 352 // when a byte is first acknowledged and the receipt of data that is at 353 // least one window beyond the sequence number that was acknowledged. 354 RTT time.Duration 355 356 // RTTVar is the "round-trip time variation" as defined in section 2 of 357 // RFC6298. 358 RTTVar time.Duration 359 360 // RTTMeasureSeqNumber is the highest acceptable sequence number at the 361 // time this RTT measurement period began. 362 RTTMeasureSeqNumber seqnum.Value 363 364 // RTTMeasureTime is the absolute time at which the current RTT 365 // measurement period began. 366 RTTMeasureTime tcpip.MonotonicTime 367 368 // Disabled is true if an explicit receive buffer is set for the 369 // endpoint. 370 Disabled bool 371 } 372 373 // TCPRcvBufState contains information about the state of an endpoint's receive 374 // socket buffer. 375 // 376 // +stateify savable 377 type TCPRcvBufState struct { 378 // RcvBufUsed is the amount of bytes actually held in the receive 379 // socket buffer for the endpoint. 380 RcvBufUsed int 381 382 // RcvBufAutoTuneParams is used to hold state variables to compute the 383 // auto tuned receive buffer size. 384 RcvAutoParams RcvBufAutoTuneParams 385 386 // RcvClosed if true, indicates the endpoint has been closed for 387 // reading. 388 RcvClosed bool 389 } 390 391 // TCPSndBufState contains information about the state of an endpoint's send 392 // socket buffer. 393 // 394 // +stateify savable 395 type TCPSndBufState struct { 396 // SndBufSize is the size of the socket send buffer. 397 SndBufSize int 398 399 // SndBufUsed is the number of bytes held in the socket send buffer. 400 SndBufUsed int 401 402 // SndClosed indicates that the endpoint has been closed for sends. 403 SndClosed bool 404 405 // PacketTooBigCount is used to notify the main protocol routine how 406 // many times a "packet too big" control packet is received. 407 PacketTooBigCount int 408 409 // SndMTU is the smallest MTU seen in the control packets received. 410 SndMTU int 411 412 // AutoTuneSndBufDisabled indicates that the auto tuning of send buffer 413 // is disabled. 414 AutoTuneSndBufDisabled atomicbitops.Uint32 415 } 416 417 // TCPEndpointStateInner contains the members of TCPEndpointState used directly 418 // (that is, not within another containing struct) within the endpoint's 419 // internal implementation. 420 // 421 // +stateify savable 422 type TCPEndpointStateInner struct { 423 // TSOffset is a randomized offset added to the value of the TSVal 424 // field in the timestamp option. 425 TSOffset tcp.TSOffset 426 427 // SACKPermitted is set to true if the peer sends the TCPSACKPermitted 428 // option in the SYN/SYN-ACK. 429 SACKPermitted bool 430 431 // SendTSOk is used to indicate when the TS Option has been negotiated. 432 // When sendTSOk is true every non-RST segment should carry a TS as per 433 // RFC7323#section-1.1. 434 SendTSOk bool 435 436 // RecentTS is the timestamp that should be sent in the TSEcr field of 437 // the timestamp for future segments sent by the endpoint. This field 438 // is updated if required when a new segment is received by this 439 // endpoint. 440 RecentTS uint32 441 } 442 443 // TCPEndpointState is a copy of the internal state of a TCP endpoint. 444 // 445 // +stateify savable 446 type TCPEndpointState struct { 447 // TCPEndpointStateInner contains the members of TCPEndpointState used 448 // by the endpoint's internal implementation. 449 TCPEndpointStateInner 450 451 // ID is a copy of the TransportEndpointID for the endpoint. 452 ID TCPEndpointID 453 454 // SegTime denotes the absolute time when this segment was received. 455 SegTime tcpip.MonotonicTime 456 457 // RcvBufState contains information about the state of the endpoint's 458 // receive socket buffer. 459 RcvBufState TCPRcvBufState 460 461 // SndBufState contains information about the state of the endpoint's 462 // send socket buffer. 463 SndBufState TCPSndBufState 464 465 // SACK holds TCP SACK related information for this endpoint. 466 SACK TCPSACKInfo 467 468 // Receiver holds variables related to the TCP receiver for the 469 // endpoint. 470 Receiver TCPReceiverState 471 472 // Sender holds state related to the TCP Sender for the endpoint. 473 Sender TCPSenderState 474 }