github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/tcpip/stack/tcp.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package stack 16 17 import ( 18 "time" 19 20 "github.com/MerlinKodo/gvisor/pkg/atomicbitops" 21 "github.com/MerlinKodo/gvisor/pkg/tcpip" 22 "github.com/MerlinKodo/gvisor/pkg/tcpip/header" 23 "github.com/MerlinKodo/gvisor/pkg/tcpip/internal/tcp" 24 "github.com/MerlinKodo/gvisor/pkg/tcpip/seqnum" 25 ) 26 27 // TCPProbeFunc is the expected function type for a TCP probe function to be 28 // passed to stack.AddTCPProbe. 29 type TCPProbeFunc func(s *TCPEndpointState) 30 31 // TCPCubicState is used to hold a copy of the internal cubic state when the 32 // TCPProbeFunc is invoked. 33 // 34 // +stateify savable 35 type TCPCubicState struct { 36 // WLastMax is the previous wMax value. 37 WLastMax float64 38 39 // WMax is the value of the congestion window at the time of the last 40 // congestion event. 41 WMax float64 42 43 // T is the time when the current congestion avoidance was entered. 44 T tcpip.MonotonicTime 45 46 // TimeSinceLastCongestion denotes the time since the current 47 // congestion avoidance was entered. 48 TimeSinceLastCongestion time.Duration 49 50 // C is the cubic constant as specified in RFC8312, page 11. 51 C float64 52 53 // K is the time period (in seconds) that the above function takes to 54 // increase the current window size to WMax if there are no further 55 // congestion events and is calculated using the following equation: 56 // 57 // K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5) 58 K float64 59 60 // Beta is the CUBIC multiplication decrease factor. That is, when a 61 // congestion event is detected, CUBIC reduces its cwnd to 62 // WC(0)=WMax*beta_cubic. 63 Beta float64 64 65 // WC is window computed by CUBIC at time TimeSinceLastCongestion. It's 66 // calculated using the formula: 67 // 68 // WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1) 69 WC float64 70 71 // WEst is the window computed by CUBIC at time 72 // TimeSinceLastCongestion+RTT i.e WC(TimeSinceLastCongestion+RTT). 73 WEst float64 74 } 75 76 // TCPRACKState is used to hold a copy of the internal RACK state when the 77 // TCPProbeFunc is invoked. 78 // 79 // +stateify savable 80 type TCPRACKState struct { 81 // XmitTime is the transmission timestamp of the most recent 82 // acknowledged segment. 83 XmitTime tcpip.MonotonicTime 84 85 // EndSequence is the ending TCP sequence number of the most recent 86 // acknowledged segment. 87 EndSequence seqnum.Value 88 89 // FACK is the highest selectively or cumulatively acknowledged 90 // sequence. 91 FACK seqnum.Value 92 93 // RTT is the round trip time of the most recently delivered packet on 94 // the connection (either cumulatively acknowledged or selectively 95 // acknowledged) that was not marked invalid as a possible spurious 96 // retransmission. 97 RTT time.Duration 98 99 // Reord is true iff reordering has been detected on this connection. 100 Reord bool 101 102 // DSACKSeen is true iff the connection has seen a DSACK. 103 DSACKSeen bool 104 105 // ReoWnd is the reordering window time used for recording packet 106 // transmission times. It is used to defer the moment at which RACK 107 // marks a packet lost. 108 ReoWnd time.Duration 109 110 // ReoWndIncr is the multiplier applied to adjust reorder window. 111 ReoWndIncr uint8 112 113 // ReoWndPersist is the number of loss recoveries before resetting 114 // reorder window. 115 ReoWndPersist int8 116 117 // RTTSeq is the SND.NXT when RTT is updated. 118 RTTSeq seqnum.Value 119 } 120 121 // TCPEndpointID is the unique 4 tuple that identifies a given endpoint. 122 // 123 // +stateify savable 124 type TCPEndpointID struct { 125 // LocalPort is the local port associated with the endpoint. 126 LocalPort uint16 127 128 // LocalAddress is the local [network layer] address associated with 129 // the endpoint. 130 LocalAddress tcpip.Address 131 132 // RemotePort is the remote port associated with the endpoint. 133 RemotePort uint16 134 135 // RemoteAddress it the remote [network layer] address associated with 136 // the endpoint. 137 RemoteAddress tcpip.Address 138 } 139 140 // TCPFastRecoveryState holds a copy of the internal fast recovery state of a 141 // TCP endpoint. 142 // 143 // +stateify savable 144 type TCPFastRecoveryState struct { 145 // Active if true indicates the endpoint is in fast recovery. The 146 // following fields are only meaningful when Active is true. 147 Active bool 148 149 // First is the first unacknowledged sequence number being recovered. 150 First seqnum.Value 151 152 // Last is the 'recover' sequence number that indicates the point at 153 // which we should exit recovery barring any timeouts etc. 154 Last seqnum.Value 155 156 // MaxCwnd is the maximum value we are permitted to grow the congestion 157 // window during recovery. This is set at the time we enter recovery. 158 // It exists to avoid attacks where the receiver intentionally sends 159 // duplicate acks to artificially inflate the sender's cwnd. 160 MaxCwnd int 161 162 // HighRxt is the highest sequence number which has been retransmitted 163 // during the current loss recovery phase. See: RFC 6675 Section 2 for 164 // details. 165 HighRxt seqnum.Value 166 167 // RescueRxt is the highest sequence number which has been 168 // optimistically retransmitted to prevent stalling of the ACK clock 169 // when there is loss at the end of the window and no new data is 170 // available for transmission. See: RFC 6675 Section 2 for details. 171 RescueRxt seqnum.Value 172 } 173 174 // TCPReceiverState holds a copy of the internal state of the receiver for a 175 // given TCP endpoint. 176 // 177 // +stateify savable 178 type TCPReceiverState struct { 179 // RcvNxt is the TCP variable RCV.NXT. 180 RcvNxt seqnum.Value 181 182 // RcvAcc is one beyond the last acceptable sequence number. That is, 183 // the "largest" sequence value that the receiver has announced to its 184 // peer that it's willing to accept. This may be different than RcvNxt 185 // + (last advertised receive window) if the receive window is reduced; 186 // in that case we have to reduce the window as we receive more data 187 // instead of shrinking it. 188 RcvAcc seqnum.Value 189 190 // RcvWndScale is the window scaling to use for inbound segments. 191 RcvWndScale uint8 192 193 // PendingBufUsed is the number of bytes pending in the receive queue. 194 PendingBufUsed int 195 } 196 197 // TCPRTTState holds a copy of information about the endpoint's round trip 198 // time. 199 // 200 // +stateify savable 201 type TCPRTTState struct { 202 // SRTT is the smoothed round trip time defined in section 2 of RFC 203 // 6298. 204 SRTT time.Duration 205 206 // RTTVar is the round-trip time variation as defined in section 2 of 207 // RFC 6298. 208 RTTVar time.Duration 209 210 // SRTTInited if true indicates that a valid RTT measurement has been 211 // completed. 212 SRTTInited bool 213 } 214 215 // TCPSenderState holds a copy of the internal state of the sender for a given 216 // TCP Endpoint. 217 // 218 // +stateify savable 219 type TCPSenderState struct { 220 // LastSendTime is the timestamp at which we sent the last segment. 221 LastSendTime tcpip.MonotonicTime 222 223 // DupAckCount is the number of Duplicate ACKs received. It is used for 224 // fast retransmit. 225 DupAckCount int 226 227 // SndCwnd is the size of the sending congestion window in packets. 228 SndCwnd int 229 230 // Ssthresh is the threshold between slow start and congestion 231 // avoidance. 232 Ssthresh int 233 234 // SndCAAckCount is the number of packets acknowledged during 235 // congestion avoidance. When enough packets have been ack'd (typically 236 // cwnd packets), the congestion window is incremented by one. 237 SndCAAckCount int 238 239 // Outstanding is the number of packets that have been sent but not yet 240 // acknowledged. 241 Outstanding int 242 243 // SackedOut is the number of packets which have been selectively 244 // acked. 245 SackedOut int 246 247 // SndWnd is the send window size in bytes. 248 SndWnd seqnum.Size 249 250 // SndUna is the next unacknowledged sequence number. 251 SndUna seqnum.Value 252 253 // SndNxt is the sequence number of the next segment to be sent. 254 SndNxt seqnum.Value 255 256 // RTTMeasureSeqNum is the sequence number being used for the latest 257 // RTT measurement. 258 RTTMeasureSeqNum seqnum.Value 259 260 // RTTMeasureTime is the time when the RTTMeasureSeqNum was sent. 261 RTTMeasureTime tcpip.MonotonicTime 262 263 // Closed indicates that the caller has closed the endpoint for 264 // sending. 265 Closed bool 266 267 // RTO is the retransmit timeout as defined in section of 2 of RFC 268 // 6298. 269 RTO time.Duration 270 271 // RTTState holds information about the endpoint's round trip time. 272 RTTState TCPRTTState 273 274 // MaxPayloadSize is the maximum size of the payload of a given 275 // segment. It is initialized on demand. 276 MaxPayloadSize int 277 278 // SndWndScale is the number of bits to shift left when reading the 279 // send window size from a segment. 280 SndWndScale uint8 281 282 // MaxSentAck is the highest acknowledgement number sent till now. 283 MaxSentAck seqnum.Value 284 285 // FastRecovery holds the fast recovery state for the endpoint. 286 FastRecovery TCPFastRecoveryState 287 288 // Cubic holds the state related to CUBIC congestion control. 289 Cubic TCPCubicState 290 291 // RACKState holds the state related to RACK loss detection algorithm. 292 RACKState TCPRACKState 293 294 // RetransmitTS records the timestamp used to detect spurious recovery. 295 RetransmitTS uint32 296 297 // SpuriousRecovery indicates if the sender entered recovery spuriously. 298 SpuriousRecovery bool 299 } 300 301 // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint. 302 // 303 // +stateify savable 304 type TCPSACKInfo struct { 305 // Blocks is the list of SACK Blocks that identify the out of order 306 // segments held by a given TCP endpoint. 307 Blocks []header.SACKBlock 308 309 // ReceivedBlocks are the SACK blocks received by this endpoint from 310 // the peer endpoint. 311 ReceivedBlocks []header.SACKBlock 312 313 // MaxSACKED is the highest sequence number that has been SACKED by the 314 // peer. 315 MaxSACKED seqnum.Value 316 } 317 318 // RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning. 319 // 320 // +stateify savable 321 type RcvBufAutoTuneParams struct { 322 // MeasureTime is the time at which the current measurement was 323 // started. 324 MeasureTime tcpip.MonotonicTime 325 326 // CopiedBytes is the number of bytes copied to user space since this 327 // measure began. 328 CopiedBytes int 329 330 // PrevCopiedBytes is the number of bytes copied to userspace in the 331 // previous RTT period. 332 PrevCopiedBytes int 333 334 // RcvBufSize is the auto tuned receive buffer size. 335 RcvBufSize int 336 337 // RTT is the smoothed RTT as measured by observing the time between 338 // when a byte is first acknowledged and the receipt of data that is at 339 // least one window beyond the sequence number that was acknowledged. 340 RTT time.Duration 341 342 // RTTVar is the "round-trip time variation" as defined in section 2 of 343 // RFC6298. 344 RTTVar time.Duration 345 346 // RTTMeasureSeqNumber is the highest acceptable sequence number at the 347 // time this RTT measurement period began. 348 RTTMeasureSeqNumber seqnum.Value 349 350 // RTTMeasureTime is the absolute time at which the current RTT 351 // measurement period began. 352 RTTMeasureTime tcpip.MonotonicTime 353 354 // Disabled is true if an explicit receive buffer is set for the 355 // endpoint. 356 Disabled bool 357 } 358 359 // TCPRcvBufState contains information about the state of an endpoint's receive 360 // socket buffer. 361 // 362 // +stateify savable 363 type TCPRcvBufState struct { 364 // RcvBufUsed is the amount of bytes actually held in the receive 365 // socket buffer for the endpoint. 366 RcvBufUsed int 367 368 // RcvBufAutoTuneParams is used to hold state variables to compute the 369 // auto tuned receive buffer size. 370 RcvAutoParams RcvBufAutoTuneParams 371 372 // RcvClosed if true, indicates the endpoint has been closed for 373 // reading. 374 RcvClosed bool 375 } 376 377 // TCPSndBufState contains information about the state of an endpoint's send 378 // socket buffer. 379 // 380 // +stateify savable 381 type TCPSndBufState struct { 382 // SndBufSize is the size of the socket send buffer. 383 SndBufSize int 384 385 // SndBufUsed is the number of bytes held in the socket send buffer. 386 SndBufUsed int 387 388 // SndClosed indicates that the endpoint has been closed for sends. 389 SndClosed bool 390 391 // PacketTooBigCount is used to notify the main protocol routine how 392 // many times a "packet too big" control packet is received. 393 PacketTooBigCount int 394 395 // SndMTU is the smallest MTU seen in the control packets received. 396 SndMTU int 397 398 // AutoTuneSndBufDisabled indicates that the auto tuning of send buffer 399 // is disabled. 400 AutoTuneSndBufDisabled atomicbitops.Uint32 401 } 402 403 // TCPEndpointStateInner contains the members of TCPEndpointState used directly 404 // (that is, not within another containing struct) within the endpoint's 405 // internal implementation. 406 // 407 // +stateify savable 408 type TCPEndpointStateInner struct { 409 // TSOffset is a randomized offset added to the value of the TSVal 410 // field in the timestamp option. 411 TSOffset tcp.TSOffset 412 413 // SACKPermitted is set to true if the peer sends the TCPSACKPermitted 414 // option in the SYN/SYN-ACK. 415 SACKPermitted bool 416 417 // SendTSOk is used to indicate when the TS Option has been negotiated. 418 // When sendTSOk is true every non-RST segment should carry a TS as per 419 // RFC7323#section-1.1. 420 SendTSOk bool 421 422 // RecentTS is the timestamp that should be sent in the TSEcr field of 423 // the timestamp for future segments sent by the endpoint. This field 424 // is updated if required when a new segment is received by this 425 // endpoint. 426 RecentTS uint32 427 } 428 429 // TCPEndpointState is a copy of the internal state of a TCP endpoint. 430 // 431 // +stateify savable 432 type TCPEndpointState struct { 433 // TCPEndpointStateInner contains the members of TCPEndpointState used 434 // by the endpoint's internal implementation. 435 TCPEndpointStateInner 436 437 // ID is a copy of the TransportEndpointID for the endpoint. 438 ID TCPEndpointID 439 440 // SegTime denotes the absolute time when this segment was received. 441 SegTime tcpip.MonotonicTime 442 443 // RcvBufState contains information about the state of the endpoint's 444 // receive socket buffer. 445 RcvBufState TCPRcvBufState 446 447 // SndBufState contains information about the state of the endpoint's 448 // send socket buffer. 449 SndBufState TCPSndBufState 450 451 // SACK holds TCP SACK related information for this endpoint. 452 SACK TCPSACKInfo 453 454 // Receiver holds variables related to the TCP receiver for the 455 // endpoint. 456 Receiver TCPReceiverState 457 458 // Sender holds state related to the TCP Sender for the endpoint. 459 Sender TCPSenderState 460 }