inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/stack/tcp.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package stack
    16  
    17  import (
    18  	"time"
    19  
    20  	"inet.af/netstack/tcpip"
    21  	"inet.af/netstack/tcpip/header"
    22  	"inet.af/netstack/tcpip/internal/tcp"
    23  	"inet.af/netstack/tcpip/seqnum"
    24  )
    25  
    26  // TCPProbeFunc is the function type of a TCP probe function that is passed
    27  // to stack.AddTCPProbe; it receives a TCPEndpointState value.
    28  type TCPProbeFunc func(s TCPEndpointState)
    29  
    30  // TCPCubicState is used to hold a copy of the internal cubic state when the
    31  // TCPProbeFunc is invoked.
    32  //
    33  // +stateify savable
    34  type TCPCubicState struct {
    35  	// WLastMax is the previous WMax value.
    36  	WLastMax float64
    37  
    38  	// WMax is the value of the congestion window at the time of the last
    39  	// congestion event.
    40  	WMax float64
    41  
    42  	// T is the time when the current congestion avoidance was entered.
    43  	T tcpip.MonotonicTime
    44  
    45  	// TimeSinceLastCongestion denotes the time since the current
    46  	// congestion avoidance was entered.
    47  	TimeSinceLastCongestion time.Duration
    48  
    49  	// C is the cubic constant as specified in RFC 8312, page 11.
    50  	C float64
    51  
    52  	// K is the time period (in seconds) that the function WC (see the WC
    53  	// field) takes to increase the current window size to WMax if there
    54  	// are no further congestion events. It is calculated as:
    55  	//
    56  	// K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5)
    57  	K float64
    58  
    59  	// Beta is the CUBIC multiplication decrease factor. That is, when a
    60  	// congestion event is detected, CUBIC reduces its cwnd to
    61  	// WC(0)=WMax*beta_cubic.
    62  	Beta float64
    63  
    64  	// WC is the window computed by CUBIC at time TimeSinceLastCongestion.
    65  	// It is calculated using the formula:
    66  	//
    67  	//  WC(t) = C*(t-K)^3 + WMax, t = TimeSinceLastCongestion (Eq. 1)
    68  	WC float64
    69  
    70  	// WEst is the window computed by CUBIC at time
    71  	// TimeSinceLastCongestion+RTT, i.e. WC(TimeSinceLastCongestion+RTT).
    72  	WEst float64
    73  }
    74  
    75  // TCPRACKState is used to hold a copy of the internal RACK state when the
    76  // TCPProbeFunc is invoked.
    77  //
    78  // +stateify savable
    79  type TCPRACKState struct {
    80  	// XmitTime is the transmission timestamp of the most recently
    81  	// acknowledged segment.
    82  	XmitTime tcpip.MonotonicTime
    83  
    84  	// EndSequence is the ending TCP sequence number of the most recently
    85  	// acknowledged segment.
    86  	EndSequence seqnum.Value
    87  
    88  	// FACK is the highest selectively or cumulatively acknowledged
    89  	// sequence number.
    90  	FACK seqnum.Value
    91  
    92  	// RTT is the round trip time of the most recently delivered packet on
    93  	// the connection (either cumulatively acknowledged or selectively
    94  	// acknowledged) that was not marked invalid as a possible spurious
    95  	// retransmission.
    96  	RTT time.Duration
    97  
    98  	// Reord is true iff reordering has been detected on this connection.
    99  	Reord bool
   100  
   101  	// DSACKSeen is true iff the connection has seen a DSACK.
   102  	DSACKSeen bool
   103  
   104  	// ReoWnd is the reordering window time used for recording packet
   105  	// transmission times. It is used to defer the moment at which RACK
   106  	// marks a packet lost.
   107  	ReoWnd time.Duration
   108  
   109  	// ReoWndIncr is the multiplier applied to adjust the reorder window.
   110  	ReoWndIncr uint8
   111  
   112  	// ReoWndPersist is the number of loss recoveries before resetting
   113  	// the reorder window.
   114  	ReoWndPersist int8
   115  
   116  	// RTTSeq is the SND.NXT when RTT is updated.
   117  	RTTSeq seqnum.Value
   118  }
   119  
   120  // TCPEndpointID is the unique 4-tuple that identifies a given endpoint.
   121  //
   122  // +stateify savable
   123  type TCPEndpointID struct {
   124  	// LocalPort is the local port associated with the endpoint.
   125  	LocalPort uint16
   126  
   127  	// LocalAddress is the local [network layer] address associated with
   128  	// the endpoint.
   129  	LocalAddress tcpip.Address
   130  
   131  	// RemotePort is the remote port associated with the endpoint.
   132  	RemotePort uint16
   133  
   134  	// RemoteAddress is the remote [network layer] address associated with
   135  	// the endpoint.
   136  	RemoteAddress tcpip.Address
   137  }
   138  
   139  // TCPFastRecoveryState holds a copy of the internal fast recovery state of a
   140  // TCP endpoint.
   141  //
   142  // +stateify savable
   143  type TCPFastRecoveryState struct {
   144  	// Active, if true, indicates the endpoint is in fast recovery. The
   145  	// following fields are only meaningful when Active is true.
   146  	Active bool
   147  
   148  	// First is the first unacknowledged sequence number being recovered.
   149  	First seqnum.Value
   150  
   151  	// Last is the 'recover' sequence number that indicates the point at
   152  	// which we should exit recovery barring any timeouts etc.
   153  	Last seqnum.Value
   154  
   155  	// MaxCwnd is the maximum value to which we are permitted to grow the
   156  	// congestion window during recovery. It is set when we enter recovery
   157  	// and exists to avoid attacks where the receiver intentionally sends
   158  	// duplicate acks to artificially inflate the sender's cwnd.
   159  	MaxCwnd int
   160  
   161  	// HighRxt is the highest sequence number which has been retransmitted
   162  	// during the current loss recovery phase. See RFC 6675 Section 2 for
   163  	// details.
   164  	HighRxt seqnum.Value
   165  
   166  	// RescueRxt is the highest sequence number which has been
   167  	// optimistically retransmitted to prevent stalling of the ACK clock
   168  	// when there is loss at the end of the window and no new data is
   169  	// available for transmission. See RFC 6675 Section 2 for details.
   170  	RescueRxt seqnum.Value
   171  }
   172  
   173  // TCPReceiverState holds a copy of the internal state of the receiver for a
   174  // given TCP endpoint.
   175  //
   176  // +stateify savable
   177  type TCPReceiverState struct {
   178  	// RcvNxt is the TCP variable RCV.NXT.
   179  	RcvNxt seqnum.Value
   180  
   181  	// RcvAcc is one beyond the last acceptable sequence number. That is,
   182  	// the "largest" sequence value that the receiver has announced to its
   183  	// peer that it's willing to accept. This may differ from RcvNxt +
   184  	// (last advertised receive window) if the receive window is reduced;
   185  	// in that case we have to reduce the window as we receive more data
   186  	// instead of shrinking it.
   187  	RcvAcc seqnum.Value
   188  
   189  	// RcvWndScale is the window scaling to use for inbound segments.
   190  	RcvWndScale uint8
   191  
   192  	// PendingBufUsed is the number of bytes pending in the receive queue.
   193  	PendingBufUsed int
   194  }
   195  
   196  // TCPRTTState holds a copy of information about the endpoint's round trip
   197  // time.
   198  //
   199  // +stateify savable
   200  type TCPRTTState struct {
   201  	// SRTT is the smoothed round trip time as defined in section 2 of
   202  	// RFC 6298.
   203  	SRTT time.Duration
   204  
   205  	// RTTVar is the round-trip time variation as defined in section 2 of
   206  	// RFC 6298.
   207  	RTTVar time.Duration
   208  
   209  	// SRTTInited, if true, indicates that a valid RTT measurement has
   210  	// been completed.
   211  	SRTTInited bool
   212  }
   213  
   214  // TCPSenderState holds a copy of the internal state of the sender for a given
   215  // TCP endpoint.
   216  //
   217  // +stateify savable
   218  type TCPSenderState struct {
   219  	// LastSendTime is the timestamp at which we sent the last segment.
   220  	LastSendTime tcpip.MonotonicTime
   221  
   222  	// DupAckCount is the number of duplicate ACKs received. It is used for
   223  	// fast retransmit.
   224  	DupAckCount int
   225  
   226  	// SndCwnd is the size of the sending congestion window in packets.
   227  	SndCwnd int
   228  
   229  	// Ssthresh is the threshold between slow start and congestion
   230  	// avoidance.
   231  	Ssthresh int
   232  
   233  	// SndCAAckCount is the number of packets acknowledged during
   234  	// congestion avoidance. When enough packets have been ack'd (typically
   235  	// cwnd packets), the congestion window is incremented by one.
   236  	SndCAAckCount int
   237  
   238  	// Outstanding is the number of packets that have been sent but not yet
   239  	// acknowledged.
   240  	Outstanding int
   241  
   242  	// SackedOut is the number of packets which have been selectively
   243  	// acked.
   244  	SackedOut int
   245  
   246  	// SndWnd is the send window size in bytes.
   247  	SndWnd seqnum.Size
   248  
   249  	// SndUna is the next unacknowledged sequence number.
   250  	SndUna seqnum.Value
   251  
   252  	// SndNxt is the sequence number of the next segment to be sent.
   253  	SndNxt seqnum.Value
   254  
   255  	// RTTMeasureSeqNum is the sequence number being used for the latest
   256  	// RTT measurement.
   257  	RTTMeasureSeqNum seqnum.Value
   258  
   259  	// RTTMeasureTime is the time when RTTMeasureSeqNum was sent.
   260  	RTTMeasureTime tcpip.MonotonicTime
   261  
   262  	// Closed indicates that the caller has closed the endpoint for
   263  	// sending.
   264  	Closed bool
   265  
   266  	// RTO is the retransmit timeout as defined in section 2 of RFC
   267  	// 6298.
   268  	RTO time.Duration
   269  
   270  	// RTTState holds information about the endpoint's round trip time.
   271  	RTTState TCPRTTState
   272  
   273  	// MaxPayloadSize is the maximum size of the payload of a given
   274  	// segment. It is initialized on demand.
   275  	MaxPayloadSize int
   276  
   277  	// SndWndScale is the number of bits to shift left when reading the
   278  	// send window size from a segment.
   279  	SndWndScale uint8
   280  
   281  	// MaxSentAck is the highest acknowledgement number sent so far.
   282  	MaxSentAck seqnum.Value
   283  
   284  	// FastRecovery holds the fast recovery state for the endpoint.
   285  	FastRecovery TCPFastRecoveryState
   286  
   287  	// Cubic holds the state related to CUBIC congestion control.
   288  	Cubic TCPCubicState
   289  
   290  	// RACKState holds the state related to the RACK loss detection algorithm.
   291  	RACKState TCPRACKState
   292  
   293  	// RetransmitTS records the timestamp used to detect spurious recovery.
   294  	RetransmitTS uint32
   295  
   296  	// SpuriousRecovery indicates if the sender entered recovery spuriously.
   297  	SpuriousRecovery bool
   298  }
   299  
   300  // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
   301  //
   302  // +stateify savable
   303  type TCPSACKInfo struct {
   304  	// Blocks is the list of SACK blocks that identify the out-of-order
   305  	// segments held by a given TCP endpoint.
   306  	Blocks []header.SACKBlock
   307  
   308  	// ReceivedBlocks are the SACK blocks received by this endpoint from
   309  	// the peer endpoint.
   310  	ReceivedBlocks []header.SACKBlock
   311  
   312  	// MaxSACKED is the highest sequence number that has been SACKed by the
   313  	// peer.
   314  	MaxSACKED seqnum.Value
   315  }
   316  
   317  // RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
   318  //
   319  // +stateify savable
   320  type RcvBufAutoTuneParams struct {
   321  	// MeasureTime is the time at which the current measurement was
   322  	// started.
   323  	MeasureTime tcpip.MonotonicTime
   324  
   325  	// CopiedBytes is the number of bytes copied to user space since this
   326  	// measurement began.
   327  	CopiedBytes int
   328  
   329  	// PrevCopiedBytes is the number of bytes copied to user space in the
   330  	// previous RTT period.
   331  	PrevCopiedBytes int
   332  
   333  	// RcvBufSize is the auto tuned receive buffer size.
   334  	RcvBufSize int
   335  
   336  	// RTT is the smoothed RTT as measured by observing the time between
   337  	// when a byte is first acknowledged and the receipt of data that is at
   338  	// least one window beyond the sequence number that was acknowledged.
   339  	RTT time.Duration
   340  
   341  	// RTTVar is the "round-trip time variation" as defined in section 2 of
   342  	// RFC 6298.
   343  	RTTVar time.Duration
   344  
   345  	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
   346  	// time this RTT measurement period began.
   347  	RTTMeasureSeqNumber seqnum.Value
   348  
   349  	// RTTMeasureTime is the absolute time at which the current RTT
   350  	// measurement period began.
   351  	RTTMeasureTime tcpip.MonotonicTime
   352  
   353  	// Disabled is true if an explicit receive buffer is set for the
   354  	// endpoint.
   355  	Disabled bool
   356  }
   357  
   358  // TCPRcvBufState contains information about the state of an endpoint's receive
   359  // socket buffer.
   360  //
   361  // +stateify savable
   362  type TCPRcvBufState struct {
   363  	// RcvBufUsed is the amount of bytes actually held in the receive
   364  	// socket buffer for the endpoint.
   365  	RcvBufUsed int
   366  
   367  	// RcvAutoParams holds the state variables used to compute the auto
   368  	// tuned receive buffer size.
   369  	RcvAutoParams RcvBufAutoTuneParams
   370  
   371  	// RcvClosed, if true, indicates the endpoint has been closed for
   372  	// reading.
   373  	RcvClosed bool
   374  }
   375  
   376  // TCPSndBufState contains information about the state of an endpoint's send
   377  // socket buffer.
   378  //
   379  // +stateify savable
   380  type TCPSndBufState struct {
   381  	// SndBufSize is the size of the socket send buffer.
   382  	SndBufSize int
   383  
   384  	// SndBufUsed is the number of bytes held in the socket send buffer.
   385  	SndBufUsed int
   386  
   387  	// SndClosed indicates that the endpoint has been closed for sends.
   388  	SndClosed bool
   389  
   390  	// PacketTooBigCount is used to notify the main protocol routine how
   391  	// many times a "packet too big" control packet is received.
   392  	PacketTooBigCount int
   393  
   394  	// SndMTU is the smallest MTU seen in the control packets received.
   395  	SndMTU int
   396  
   397  	// AutoTuneSndBufDisabled indicates that auto tuning of the send
   398  	// buffer is disabled.
   399  	//
   400  	// Must be accessed using atomic operations.
   401  	AutoTuneSndBufDisabled uint32
   402  }
   403  
   404  // TCPEndpointStateInner contains the members of TCPEndpointState used directly
   405  // (that is, not within another containing struct) within the endpoint's
   406  // internal implementation.
   407  //
   408  // +stateify savable
   409  type TCPEndpointStateInner struct {
   410  	// TSOffset is a randomized offset added to the value of the TSVal
   411  	// field in the timestamp option.
   412  	TSOffset tcp.TSOffset
   413  
   414  	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
   415  	// option in the SYN/SYN-ACK.
   416  	SACKPermitted bool
   417  
   418  	// SendTSOk is used to indicate when the TS Option has been negotiated.
   419  	// When SendTSOk is true every non-RST segment should carry a TS as per
   420  	// RFC 7323, section 1.1.
   421  	SendTSOk bool
   422  
   423  	// RecentTS is the timestamp that should be sent in the TSEcr field of
   424  	// the timestamp for future segments sent by the endpoint. This field
   425  	// is updated if required when a new segment is received by this
   426  	// endpoint.
   427  	RecentTS uint32
   428  }
   429  
   430  // TCPEndpointState is a copy of the internal state of a TCP endpoint.
   431  //
   432  // +stateify savable
   433  type TCPEndpointState struct {
   434  	// TCPEndpointStateInner contains the members of TCPEndpointState used
   435  	// by the endpoint's internal implementation.
   436  	TCPEndpointStateInner
   437  
   438  	// ID is a copy of the TransportEndpointID for the endpoint.
   439  	ID TCPEndpointID
   440  
   441  	// SegTime denotes the absolute time when this segment was received.
   442  	SegTime tcpip.MonotonicTime
   443  
   444  	// RcvBufState contains information about the state of the endpoint's
   445  	// receive socket buffer.
   446  	RcvBufState TCPRcvBufState
   447  
   448  	// SndBufState contains information about the state of the endpoint's
   449  	// send socket buffer.
   450  	SndBufState TCPSndBufState
   451  
   452  	// SACK holds TCP SACK related information for this endpoint.
   453  	SACK TCPSACKInfo
   454  
   455  	// Receiver holds variables related to the TCP receiver for the
   456  	// endpoint.
   457  	Receiver TCPReceiverState
   458  
   459  	// Sender holds state related to the TCP Sender for the endpoint.
   460  	Sender TCPSenderState
   461  }