github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/stack/tcp.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package stack 16 17 import ( 18 "time" 19 20 "github.com/SagerNet/gvisor/pkg/tcpip" 21 "github.com/SagerNet/gvisor/pkg/tcpip/header" 22 "github.com/SagerNet/gvisor/pkg/tcpip/seqnum" 23 ) 24 25 // TCPProbeFunc is the expected function type for a TCP probe function to be 26 // passed to stack.AddTCPProbe. 27 type TCPProbeFunc func(s TCPEndpointState) 28 29 // TCPCubicState is used to hold a copy of the internal cubic state when the 30 // TCPProbeFunc is invoked. 31 // 32 // +stateify savable 33 type TCPCubicState struct { 34 // WLastMax is the previous wMax value. 35 WLastMax float64 36 37 // WMax is the value of the congestion window at the time of the last 38 // congestion event. 39 WMax float64 40 41 // T is the time when the current congestion avoidance was entered. 42 T tcpip.MonotonicTime 43 44 // TimeSinceLastCongestion denotes the time since the current 45 // congestion avoidance was entered. 46 TimeSinceLastCongestion time.Duration 47 48 // C is the cubic constant as specified in RFC8312, page 11. 49 C float64 50 51 // K is the time period (in seconds) that the above function takes to 52 // increase the current window size to WMax if there are no further 53 // congestion events and is calculated using the following equation: 54 // 55 // K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5) 56 K float64 57 58 // Beta is the CUBIC multiplication decrease factor. That is, when a 59 // congestion event is detected, CUBIC reduces its cwnd to 60 // WC(0)=WMax*beta_cubic. 61 Beta float64 62 63 // WC is window computed by CUBIC at time TimeSinceLastCongestion. It's 64 // calculated using the formula: 65 // 66 // WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1) 67 WC float64 68 69 // WEst is the window computed by CUBIC at time 70 // TimeSinceLastCongestion+RTT i.e WC(TimeSinceLastCongestion+RTT). 71 WEst float64 72 } 73 74 // TCPRACKState is used to hold a copy of the internal RACK state when the 75 // TCPProbeFunc is invoked. 76 // 77 // +stateify savable 78 type TCPRACKState struct { 79 // XmitTime is the transmission timestamp of the most recent 80 // acknowledged segment. 81 XmitTime tcpip.MonotonicTime 82 83 // EndSequence is the ending TCP sequence number of the most recent 84 // acknowledged segment. 85 EndSequence seqnum.Value 86 87 // FACK is the highest selectively or cumulatively acknowledged 88 // sequence. 89 FACK seqnum.Value 90 91 // RTT is the round trip time of the most recently delivered packet on 92 // the connection (either cumulatively acknowledged or selectively 93 // acknowledged) that was not marked invalid as a possible spurious 94 // retransmission. 95 RTT time.Duration 96 97 // Reord is true iff reordering has been detected on this connection. 98 Reord bool 99 100 // DSACKSeen is true iff the connection has seen a DSACK. 101 DSACKSeen bool 102 103 // ReoWnd is the reordering window time used for recording packet 104 // transmission times. It is used to defer the moment at which RACK 105 // marks a packet lost. 106 ReoWnd time.Duration 107 108 // ReoWndIncr is the multiplier applied to adjust reorder window. 109 ReoWndIncr uint8 110 111 // ReoWndPersist is the number of loss recoveries before resetting 112 // reorder window. 113 ReoWndPersist int8 114 115 // RTTSeq is the SND.NXT when RTT is updated. 116 RTTSeq seqnum.Value 117 } 118 119 // TCPEndpointID is the unique 4 tuple that identifies a given endpoint. 120 // 121 // +stateify savable 122 type TCPEndpointID struct { 123 // LocalPort is the local port associated with the endpoint. 124 LocalPort uint16 125 126 // LocalAddress is the local [network layer] address associated with 127 // the endpoint. 128 LocalAddress tcpip.Address 129 130 // RemotePort is the remote port associated with the endpoint. 131 RemotePort uint16 132 133 // RemoteAddress it the remote [network layer] address associated with 134 // the endpoint. 135 RemoteAddress tcpip.Address 136 } 137 138 // TCPFastRecoveryState holds a copy of the internal fast recovery state of a 139 // TCP endpoint. 140 // 141 // +stateify savable 142 type TCPFastRecoveryState struct { 143 // Active if true indicates the endpoint is in fast recovery. The 144 // following fields are only meaningful when Active is true. 145 Active bool 146 147 // First is the first unacknowledged sequence number being recovered. 148 First seqnum.Value 149 150 // Last is the 'recover' sequence number that indicates the point at 151 // which we should exit recovery barring any timeouts etc. 152 Last seqnum.Value 153 154 // MaxCwnd is the maximum value we are permitted to grow the congestion 155 // window during recovery. This is set at the time we enter recovery. 156 // It exists to avoid attacks where the receiver intentionally sends 157 // duplicate acks to artificially inflate the sender's cwnd. 158 MaxCwnd int 159 160 // HighRxt is the highest sequence number which has been retransmitted 161 // during the current loss recovery phase. See: RFC 6675 Section 2 for 162 // details. 163 HighRxt seqnum.Value 164 165 // RescueRxt is the highest sequence number which has been 166 // optimistically retransmitted to prevent stalling of the ACK clock 167 // when there is loss at the end of the window and no new data is 168 // available for transmission. See: RFC 6675 Section 2 for details. 169 RescueRxt seqnum.Value 170 } 171 172 // TCPReceiverState holds a copy of the internal state of the receiver for a 173 // given TCP endpoint. 174 // 175 // +stateify savable 176 type TCPReceiverState struct { 177 // RcvNxt is the TCP variable RCV.NXT. 178 RcvNxt seqnum.Value 179 180 // RcvAcc is one beyond the last acceptable sequence number. That is, 181 // the "largest" sequence value that the receiver has announced to its 182 // peer that it's willing to accept. This may be different than RcvNxt 183 // + (last advertised receive window) if the receive window is reduced; 184 // in that case we have to reduce the window as we receive more data 185 // instead of shrinking it. 186 RcvAcc seqnum.Value 187 188 // RcvWndScale is the window scaling to use for inbound segments. 189 RcvWndScale uint8 190 191 // PendingBufUsed is the number of bytes pending in the receive queue. 192 PendingBufUsed int 193 } 194 195 // TCPRTTState holds a copy of information about the endpoint's round trip 196 // time. 197 // 198 // +stateify savable 199 type TCPRTTState struct { 200 // SRTT is the smoothed round trip time defined in section 2 of RFC 201 // 6298. 202 SRTT time.Duration 203 204 // RTTVar is the round-trip time variation as defined in section 2 of 205 // RFC 6298. 206 RTTVar time.Duration 207 208 // SRTTInited if true indicates that a valid RTT measurement has been 209 // completed. 210 SRTTInited bool 211 } 212 213 // TCPSenderState holds a copy of the internal state of the sender for a given 214 // TCP Endpoint. 215 // 216 // +stateify savable 217 type TCPSenderState struct { 218 // LastSendTime is the timestamp at which we sent the last segment. 219 LastSendTime tcpip.MonotonicTime 220 221 // DupAckCount is the number of Duplicate ACKs received. It is used for 222 // fast retransmit. 223 DupAckCount int 224 225 // SndCwnd is the size of the sending congestion window in packets. 226 SndCwnd int 227 228 // Ssthresh is the threshold between slow start and congestion 229 // avoidance. 230 Ssthresh int 231 232 // SndCAAckCount is the number of packets acknowledged during 233 // congestion avoidance. When enough packets have been ack'd (typically 234 // cwnd packets), the congestion window is incremented by one. 235 SndCAAckCount int 236 237 // Outstanding is the number of packets that have been sent but not yet 238 // acknowledged. 239 Outstanding int 240 241 // SackedOut is the number of packets which have been selectively 242 // acked. 243 SackedOut int 244 245 // SndWnd is the send window size in bytes. 246 SndWnd seqnum.Size 247 248 // SndUna is the next unacknowledged sequence number. 249 SndUna seqnum.Value 250 251 // SndNxt is the sequence number of the next segment to be sent. 252 SndNxt seqnum.Value 253 254 // RTTMeasureSeqNum is the sequence number being used for the latest 255 // RTT measurement. 256 RTTMeasureSeqNum seqnum.Value 257 258 // RTTMeasureTime is the time when the RTTMeasureSeqNum was sent. 259 RTTMeasureTime tcpip.MonotonicTime 260 261 // Closed indicates that the caller has closed the endpoint for 262 // sending. 263 Closed bool 264 265 // RTO is the retransmit timeout as defined in section of 2 of RFC 266 // 6298. 267 RTO time.Duration 268 269 // RTTState holds information about the endpoint's round trip time. 270 RTTState TCPRTTState 271 272 // MaxPayloadSize is the maximum size of the payload of a given 273 // segment. It is initialized on demand. 274 MaxPayloadSize int 275 276 // SndWndScale is the number of bits to shift left when reading the 277 // send window size from a segment. 278 SndWndScale uint8 279 280 // MaxSentAck is the highest acknowledgement number sent till now. 281 MaxSentAck seqnum.Value 282 283 // FastRecovery holds the fast recovery state for the endpoint. 284 FastRecovery TCPFastRecoveryState 285 286 // Cubic holds the state related to CUBIC congestion control. 287 Cubic TCPCubicState 288 289 // RACKState holds the state related to RACK loss detection algorithm. 290 RACKState TCPRACKState 291 } 292 293 // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint. 294 // 295 // +stateify savable 296 type TCPSACKInfo struct { 297 // Blocks is the list of SACK Blocks that identify the out of order 298 // segments held by a given TCP endpoint. 299 Blocks []header.SACKBlock 300 301 // ReceivedBlocks are the SACK blocks received by this endpoint from 302 // the peer endpoint. 303 ReceivedBlocks []header.SACKBlock 304 305 // MaxSACKED is the highest sequence number that has been SACKED by the 306 // peer. 307 MaxSACKED seqnum.Value 308 } 309 310 // RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning. 311 // 312 // +stateify savable 313 type RcvBufAutoTuneParams struct { 314 // MeasureTime is the time at which the current measurement was 315 // started. 316 MeasureTime tcpip.MonotonicTime 317 318 // CopiedBytes is the number of bytes copied to user space since this 319 // measure began. 320 CopiedBytes int 321 322 // PrevCopiedBytes is the number of bytes copied to userspace in the 323 // previous RTT period. 324 PrevCopiedBytes int 325 326 // RcvBufSize is the auto tuned receive buffer size. 327 RcvBufSize int 328 329 // RTT is the smoothed RTT as measured by observing the time between 330 // when a byte is first acknowledged and the receipt of data that is at 331 // least one window beyond the sequence number that was acknowledged. 332 RTT time.Duration 333 334 // RTTVar is the "round-trip time variation" as defined in section 2 of 335 // RFC6298. 336 RTTVar time.Duration 337 338 // RTTMeasureSeqNumber is the highest acceptable sequence number at the 339 // time this RTT measurement period began. 340 RTTMeasureSeqNumber seqnum.Value 341 342 // RTTMeasureTime is the absolute time at which the current RTT 343 // measurement period began. 344 RTTMeasureTime tcpip.MonotonicTime 345 346 // Disabled is true if an explicit receive buffer is set for the 347 // endpoint. 348 Disabled bool 349 } 350 351 // TCPRcvBufState contains information about the state of an endpoint's receive 352 // socket buffer. 353 // 354 // +stateify savable 355 type TCPRcvBufState struct { 356 // RcvBufUsed is the amount of bytes actually held in the receive 357 // socket buffer for the endpoint. 358 RcvBufUsed int 359 360 // RcvBufAutoTuneParams is used to hold state variables to compute the 361 // auto tuned receive buffer size. 362 RcvAutoParams RcvBufAutoTuneParams 363 364 // RcvClosed if true, indicates the endpoint has been closed for 365 // reading. 366 RcvClosed bool 367 } 368 369 // TCPSndBufState contains information about the state of an endpoint's send 370 // socket buffer. 371 // 372 // +stateify savable 373 type TCPSndBufState struct { 374 // SndBufSize is the size of the socket send buffer. 375 SndBufSize int 376 377 // SndBufUsed is the number of bytes held in the socket send buffer. 378 SndBufUsed int 379 380 // SndClosed indicates that the endpoint has been closed for sends. 381 SndClosed bool 382 383 // PacketTooBigCount is used to notify the main protocol routine how 384 // many times a "packet too big" control packet is received. 385 PacketTooBigCount int 386 387 // SndMTU is the smallest MTU seen in the control packets received. 388 SndMTU int 389 } 390 391 // TCPEndpointStateInner contains the members of TCPEndpointState used directly 392 // (that is, not within another containing struct) within the endpoint's 393 // internal implementation. 394 // 395 // +stateify savable 396 type TCPEndpointStateInner struct { 397 // TSOffset is a randomized offset added to the value of the TSVal 398 // field in the timestamp option. 399 TSOffset uint32 400 401 // SACKPermitted is set to true if the peer sends the TCPSACKPermitted 402 // option in the SYN/SYN-ACK. 403 SACKPermitted bool 404 405 // SendTSOk is used to indicate when the TS Option has been negotiated. 406 // When sendTSOk is true every non-RST segment should carry a TS as per 407 // RFC7323#section-1.1. 408 SendTSOk bool 409 410 // RecentTS is the timestamp that should be sent in the TSEcr field of 411 // the timestamp for future segments sent by the endpoint. This field 412 // is updated if required when a new segment is received by this 413 // endpoint. 414 RecentTS uint32 415 } 416 417 // TCPEndpointState is a copy of the internal state of a TCP endpoint. 418 // 419 // +stateify savable 420 type TCPEndpointState struct { 421 // TCPEndpointStateInner contains the members of TCPEndpointState used 422 // by the endpoint's internal implementation. 423 TCPEndpointStateInner 424 425 // ID is a copy of the TransportEndpointID for the endpoint. 426 ID TCPEndpointID 427 428 // SegTime denotes the absolute time when this segment was received. 429 SegTime tcpip.MonotonicTime 430 431 // RcvBufState contains information about the state of the endpoint's 432 // receive socket buffer. 433 RcvBufState TCPRcvBufState 434 435 // SndBufState contains information about the state of the endpoint's 436 // send socket buffer. 437 SndBufState TCPSndBufState 438 439 // SACK holds TCP SACK related information for this endpoint. 440 SACK TCPSACKInfo 441 442 // Receiver holds variables related to the TCP receiver for the 443 // endpoint. 444 Receiver TCPReceiverState 445 446 // Sender holds state related to the TCP Sender for the endpoint. 447 Sender TCPSenderState 448 }