// Origin: inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/stack/tcp.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stack

import (
	"time"

	"inet.af/netstack/tcpip"
	"inet.af/netstack/tcpip/header"
	"inet.af/netstack/tcpip/internal/tcp"
	"inet.af/netstack/tcpip/seqnum"
)

// TCPProbeFunc is the expected function type for a TCP probe function to be
// passed to stack.AddTCPProbe.
//
// The argument s is a TCPEndpointState, i.e. a copy of the endpoint's
// internal state.
type TCPProbeFunc func(s TCPEndpointState)

// TCPCubicState is used to hold a copy of the internal cubic state when the
// TCPProbeFunc is invoked.
//
// +stateify savable
type TCPCubicState struct {
	// WLastMax is the previous wMax value.
	WLastMax float64

	// WMax is the value of the congestion window at the time of the last
	// congestion event.
	WMax float64

	// T is the time when the current congestion avoidance was entered.
	T tcpip.MonotonicTime

	// TimeSinceLastCongestion denotes the time since the current
	// congestion avoidance was entered.
	TimeSinceLastCongestion time.Duration

	// C is the cubic constant as specified in RFC8312, page 11.
	C float64

	// K is the time period (in seconds) that the CUBIC window function
	// (Eq. 1, documented on WC) takes to increase the current window size
	// to WMax if there are no further congestion events and is calculated
	// using the following equation:
	//
	// K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5)
	K float64

	// Beta is the CUBIC multiplication decrease factor. That is, when a
	// congestion event is detected, CUBIC reduces its cwnd to
	// WC(0)=WMax*beta_cubic.
	Beta float64

	// WC is the window computed by CUBIC at time TimeSinceLastCongestion.
	// It's calculated using the formula:
	//
	// WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1)
	WC float64

	// WEst is the window computed by CUBIC at time
	// TimeSinceLastCongestion+RTT, i.e. WC(TimeSinceLastCongestion+RTT).
	WEst float64
}

// TCPRACKState is used to hold a copy of the internal RACK state when the
// TCPProbeFunc is invoked.
//
// +stateify savable
type TCPRACKState struct {
	// XmitTime is the transmission timestamp of the most recent
	// acknowledged segment.
	XmitTime tcpip.MonotonicTime

	// EndSequence is the ending TCP sequence number of the most recent
	// acknowledged segment.
	EndSequence seqnum.Value

	// FACK is the highest selectively or cumulatively acknowledged
	// sequence.
	FACK seqnum.Value

	// RTT is the round trip time of the most recently delivered packet on
	// the connection (either cumulatively acknowledged or selectively
	// acknowledged) that was not marked invalid as a possible spurious
	// retransmission.
	RTT time.Duration

	// Reord is true iff reordering has been detected on this connection.
	Reord bool

	// DSACKSeen is true iff the connection has seen a DSACK.
	DSACKSeen bool

	// ReoWnd is the reordering window time used for recording packet
	// transmission times. It is used to defer the moment at which RACK
	// marks a packet lost.
	ReoWnd time.Duration

	// ReoWndIncr is the multiplier applied to adjust reorder window.
	ReoWndIncr uint8

	// ReoWndPersist is the number of loss recoveries before resetting
	// reorder window.
	ReoWndPersist int8

	// RTTSeq is the SND.NXT when RTT is updated.
	RTTSeq seqnum.Value
}

// TCPEndpointID is the unique 4-tuple that identifies a given endpoint.
//
// +stateify savable
type TCPEndpointID struct {
	// LocalPort is the local port associated with the endpoint.
	LocalPort uint16

	// LocalAddress is the local [network layer] address associated with
	// the endpoint.
	LocalAddress tcpip.Address

	// RemotePort is the remote port associated with the endpoint.
	RemotePort uint16

	// RemoteAddress is the remote [network layer] address associated with
	// the endpoint.
	RemoteAddress tcpip.Address
}

// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
// TCP endpoint.
//
// +stateify savable
type TCPFastRecoveryState struct {
	// Active if true indicates the endpoint is in fast recovery. The
	// following fields are only meaningful when Active is true.
	Active bool

	// First is the first unacknowledged sequence number being recovered.
	First seqnum.Value

	// Last is the 'recover' sequence number that indicates the point at
	// which we should exit recovery barring any timeouts etc.
	Last seqnum.Value

	// MaxCwnd is the maximum value to which we are permitted to grow the
	// congestion window during recovery. This is set at the time we enter
	// recovery. It exists to avoid attacks where the receiver
	// intentionally sends duplicate acks to artificially inflate the
	// sender's cwnd.
	MaxCwnd int

	// HighRxt is the highest sequence number which has been retransmitted
	// during the current loss recovery phase. See: RFC 6675 Section 2 for
	// details.
	HighRxt seqnum.Value

	// RescueRxt is the highest sequence number which has been
	// optimistically retransmitted to prevent stalling of the ACK clock
	// when there is loss at the end of the window and no new data is
	// available for transmission. See: RFC 6675 Section 2 for details.
	RescueRxt seqnum.Value
}

// TCPReceiverState holds a copy of the internal state of the receiver for a
// given TCP endpoint.
//
// +stateify savable
type TCPReceiverState struct {
	// RcvNxt is the TCP variable RCV.NXT.
	RcvNxt seqnum.Value

	// RcvAcc is one beyond the last acceptable sequence number. That is,
	// the "largest" sequence value that the receiver has announced to its
	// peer that it's willing to accept. This may be different than RcvNxt
	// + (last advertised receive window) if the receive window is reduced;
	// in that case we have to reduce the window as we receive more data
	// instead of shrinking it.
	RcvAcc seqnum.Value

	// RcvWndScale is the window scaling to use for inbound segments.
	RcvWndScale uint8

	// PendingBufUsed is the number of bytes pending in the receive queue.
	PendingBufUsed int
}

// TCPRTTState holds a copy of information about the endpoint's round trip
// time.
//
// +stateify savable
type TCPRTTState struct {
	// SRTT is the smoothed round trip time defined in section 2 of RFC
	// 6298.
	SRTT time.Duration

	// RTTVar is the round-trip time variation as defined in section 2 of
	// RFC 6298.
	RTTVar time.Duration

	// SRTTInited if true indicates that a valid RTT measurement has been
	// completed.
	SRTTInited bool
}

// TCPSenderState holds a copy of the internal state of the sender for a given
// TCP Endpoint.
//
// +stateify savable
type TCPSenderState struct {
	// LastSendTime is the timestamp at which we sent the last segment.
	LastSendTime tcpip.MonotonicTime

	// DupAckCount is the number of Duplicate ACKs received. It is used for
	// fast retransmit.
	DupAckCount int

	// SndCwnd is the size of the sending congestion window in packets.
	SndCwnd int

	// Ssthresh is the threshold between slow start and congestion
	// avoidance.
	Ssthresh int

	// SndCAAckCount is the number of packets acknowledged during
	// congestion avoidance. When enough packets have been ack'd (typically
	// cwnd packets), the congestion window is incremented by one.
	SndCAAckCount int

	// Outstanding is the number of packets that have been sent but not yet
	// acknowledged.
	Outstanding int

	// SackedOut is the number of packets which have been selectively
	// acked.
	SackedOut int

	// SndWnd is the send window size in bytes.
	SndWnd seqnum.Size

	// SndUna is the next unacknowledged sequence number.
	SndUna seqnum.Value

	// SndNxt is the sequence number of the next segment to be sent.
	SndNxt seqnum.Value

	// RTTMeasureSeqNum is the sequence number being used for the latest
	// RTT measurement.
	RTTMeasureSeqNum seqnum.Value

	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
	RTTMeasureTime tcpip.MonotonicTime

	// Closed indicates that the caller has closed the endpoint for
	// sending.
	Closed bool

	// RTO is the retransmit timeout as defined in section 2 of RFC 6298.
	RTO time.Duration

	// RTTState holds information about the endpoint's round trip time.
	RTTState TCPRTTState

	// MaxPayloadSize is the maximum size of the payload of a given
	// segment. It is initialized on demand.
	MaxPayloadSize int

	// SndWndScale is the number of bits to shift left when reading the
	// send window size from a segment.
	SndWndScale uint8

	// MaxSentAck is the highest acknowledgement number sent till now.
	MaxSentAck seqnum.Value

	// FastRecovery holds the fast recovery state for the endpoint.
	FastRecovery TCPFastRecoveryState

	// Cubic holds the state related to CUBIC congestion control.
	Cubic TCPCubicState

	// RACKState holds the state related to RACK loss detection algorithm.
	RACKState TCPRACKState

	// RetransmitTS records the timestamp used to detect spurious recovery.
	RetransmitTS uint32

	// SpuriousRecovery indicates if the sender entered recovery spuriously.
	SpuriousRecovery bool
}

// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
//
// +stateify savable
type TCPSACKInfo struct {
	// Blocks is the list of SACK Blocks that identify the out of order
	// segments held by a given TCP endpoint.
	Blocks []header.SACKBlock

	// ReceivedBlocks are the SACK blocks received by this endpoint from
	// the peer endpoint.
	ReceivedBlocks []header.SACKBlock

	// MaxSACKED is the highest sequence number that has been SACKED by the
	// peer.
	MaxSACKED seqnum.Value
}

// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
//
// +stateify savable
type RcvBufAutoTuneParams struct {
	// MeasureTime is the time at which the current measurement was
	// started.
	MeasureTime tcpip.MonotonicTime

	// CopiedBytes is the number of bytes copied to user space since this
	// measure began.
	CopiedBytes int

	// PrevCopiedBytes is the number of bytes copied to userspace in the
	// previous RTT period.
	PrevCopiedBytes int

	// RcvBufSize is the auto tuned receive buffer size.
	RcvBufSize int

	// RTT is the smoothed RTT as measured by observing the time between
	// when a byte is first acknowledged and the receipt of data that is at
	// least one window beyond the sequence number that was acknowledged.
	RTT time.Duration

	// RTTVar is the "round-trip time variation" as defined in section 2 of
	// RFC6298.
	RTTVar time.Duration

	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	RTTMeasureSeqNumber seqnum.Value

	// RTTMeasureTime is the absolute time at which the current RTT
	// measurement period began.
	RTTMeasureTime tcpip.MonotonicTime

	// Disabled is true if an explicit receive buffer is set for the
	// endpoint.
	Disabled bool
}

// TCPRcvBufState contains information about the state of an endpoint's receive
// socket buffer.
//
// +stateify savable
type TCPRcvBufState struct {
	// RcvBufUsed is the amount of bytes actually held in the receive
	// socket buffer for the endpoint.
	RcvBufUsed int

	// RcvAutoParams is used to hold state variables to compute the auto
	// tuned receive buffer size.
	RcvAutoParams RcvBufAutoTuneParams

	// RcvClosed if true, indicates the endpoint has been closed for
	// reading.
	RcvClosed bool
}

// TCPSndBufState contains information about the state of an endpoint's send
// socket buffer.
//
// +stateify savable
type TCPSndBufState struct {
	// SndBufSize is the size of the socket send buffer.
	SndBufSize int

	// SndBufUsed is the number of bytes held in the socket send buffer.
	SndBufUsed int

	// SndClosed indicates that the endpoint has been closed for sends.
	SndClosed bool

	// PacketTooBigCount is used to notify the main protocol routine how
	// many times a "packet too big" control packet is received.
	PacketTooBigCount int

	// SndMTU is the smallest MTU seen in the control packets received.
	SndMTU int

	// AutoTuneSndBufDisabled indicates that the auto tuning of send buffer
	// is disabled.
	//
	// Must be accessed using atomic operations.
	AutoTuneSndBufDisabled uint32
}

// TCPEndpointStateInner contains the members of TCPEndpointState used directly
// (that is, not within another containing struct) within the endpoint's
// internal implementation.
//
// +stateify savable
type TCPEndpointStateInner struct {
	// TSOffset is a randomized offset added to the value of the TSVal
	// field in the timestamp option.
	TSOffset tcp.TSOffset

	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	SACKPermitted bool

	// SendTSOk is used to indicate when the TS Option has been negotiated.
	// When SendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1.
	SendTSOk bool

	// RecentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp for future segments sent by the endpoint. This field
	// is updated if required when a new segment is received by this
	// endpoint.
	RecentTS uint32
}

// TCPEndpointState is a copy of the internal state of a TCP endpoint.
//
// +stateify savable
type TCPEndpointState struct {
	// TCPEndpointStateInner contains the members of TCPEndpointState used
	// by the endpoint's internal implementation.
	TCPEndpointStateInner

	// ID is a copy of the TransportEndpointID for the endpoint.
	ID TCPEndpointID

	// SegTime denotes the absolute time when this segment was received.
	SegTime tcpip.MonotonicTime

	// RcvBufState contains information about the state of the endpoint's
	// receive socket buffer.
	RcvBufState TCPRcvBufState

	// SndBufState contains information about the state of the endpoint's
	// send socket buffer.
	SndBufState TCPSndBufState

	// SACK holds TCP SACK related information for this endpoint.
	SACK TCPSACKInfo

	// Receiver holds variables related to the TCP receiver for the
	// endpoint.
	Receiver TCPReceiverState

	// Sender holds state related to the TCP Sender for the endpoint.
	Sender TCPSenderState
}