github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/stack/tcp.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package stack
    16  
    17  import (
    18  	"context"
    19  	"time"
    20  
    21  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    22  	"github.com/metacubex/gvisor/pkg/tcpip"
    23  	"github.com/metacubex/gvisor/pkg/tcpip/header"
    24  	"github.com/metacubex/gvisor/pkg/tcpip/internal/tcp"
    25  	"github.com/metacubex/gvisor/pkg/tcpip/seqnum"
    26  )
    27  
    28  // contextID is the type this package uses for context.Context.Value keys.
    29  type contextID int
    30  
    31  const (
    32  	// CtxRestoreStack is a Context.Value key for the stack to be used during restore.
    33  	CtxRestoreStack contextID = iota
    34  )
    35  
    36  // RestoreStackFromContext returns the stack to be used during restore.
    37  func RestoreStackFromContext(ctx context.Context) *Stack {
    38  	return ctx.Value(CtxRestoreStack).(*Stack)
    39  }
    40  
    41  // TCPProbeFunc is the expected function type for a TCP probe function to be
    42  // passed to stack.AddTCPProbe. It receives a pointer to a TCPEndpointState.
    43  type TCPProbeFunc func(s *TCPEndpointState)
    44  
    45  // TCPCubicState is used to hold a copy of the internal cubic state when the
    46  // TCPProbeFunc is invoked.
    47  //
    48  // +stateify savable
    49  type TCPCubicState struct {
    50  	// WLastMax is the previous WMax value.
    51  	WLastMax float64
    52  
    53  	// WMax is the value of the congestion window at the time of the last
    54  	// congestion event.
    55  	WMax float64
    56  
    57  	// T is the time when the current congestion avoidance was entered.
    58  	T tcpip.MonotonicTime
    59  
    60  	// TimeSinceLastCongestion denotes the time since the current
    61  	// congestion avoidance was entered.
    62  	TimeSinceLastCongestion time.Duration
    63  
    64  	// C is the cubic constant as specified in RFC 8312, page 11.
    65  	C float64
    66  
    67  	// K is the time period (in seconds) that the CUBIC window function WC
    68  	// takes to increase the current window size to WMax if there are no
    69  	// further congestion events. It is calculated using this equation:
    70  	//
    71  	// K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5)
    72  	K float64
    73  
    74  	// Beta is the CUBIC multiplication decrease factor. That is, when a
    75  	// congestion event is detected, CUBIC reduces its cwnd to
    76  	// WC(0)=WMax*beta_cubic.
    77  	Beta float64
    78  
    79  	// WC is the window computed by CUBIC at time TimeSinceLastCongestion.
    80  	// It's calculated using the formula:
    81  	//
    82  	//  WC(t) = C*(t-K)^3 + WMax, where t = TimeSinceLastCongestion (Eq. 1)
    83  	WC float64
    84  
    85  	// WEst is the window computed by CUBIC at time
    86  	// TimeSinceLastCongestion+RTT, i.e. WC(TimeSinceLastCongestion+RTT).
    87  	WEst float64
    88  }
    89  
    90  // TCPRACKState is used to hold a copy of the internal RACK state when the
    91  // TCPProbeFunc is invoked.
    92  //
    93  // +stateify savable
    94  type TCPRACKState struct {
    95  	// XmitTime is the transmission timestamp of the most recent
    96  	// acknowledged segment.
    97  	XmitTime tcpip.MonotonicTime
    98  
    99  	// EndSequence is the ending TCP sequence number of the most recent
   100  	// acknowledged segment.
   101  	EndSequence seqnum.Value
   102  
   103  	// FACK is the highest selectively or cumulatively acknowledged
   104  	// sequence.
   105  	FACK seqnum.Value
   106  
   107  	// RTT is the round trip time of the most recently delivered packet on
   108  	// the connection (either cumulatively acknowledged or selectively
   109  	// acknowledged) that was not marked invalid as a possible spurious
   110  	// retransmission.
   111  	RTT time.Duration
   112  
   113  	// Reord is true iff reordering has been detected on this connection.
   114  	Reord bool
   115  
   116  	// DSACKSeen is true iff the connection has seen a DSACK.
   117  	DSACKSeen bool
   118  
   119  	// ReoWnd is the reordering window time used for recording packet
   120  	// transmission times. It is used to defer the moment at which RACK
   121  	// marks a packet lost.
   122  	ReoWnd time.Duration
   123  
   124  	// ReoWndIncr is the multiplier applied to adjust the reorder window.
   125  	ReoWndIncr uint8
   126  
   127  	// ReoWndPersist is the number of loss recoveries before resetting
   128  	// the reorder window.
   129  	ReoWndPersist int8
   130  
   131  	// RTTSeq is the value of SND.NXT when RTT is updated.
   132  	RTTSeq seqnum.Value
   133  }
   134  
   135  // TCPEndpointID is the unique 4-tuple that identifies a given endpoint.
   136  //
   137  // +stateify savable
   138  type TCPEndpointID struct {
   139  	// LocalPort is the local port associated with the endpoint.
   140  	LocalPort uint16
   141  
   142  	// LocalAddress is the local [network layer] address associated with
   143  	// the endpoint.
   144  	LocalAddress tcpip.Address
   145  
   146  	// RemotePort is the remote port associated with the endpoint.
   147  	RemotePort uint16
   148  
   149  	// RemoteAddress is the remote [network layer] address associated with
   150  	// the endpoint.
   151  	RemoteAddress tcpip.Address
   152  }
   153  
   154  // TCPFastRecoveryState holds a copy of the internal fast recovery state of a
   155  // TCP endpoint.
   156  //
   157  // +stateify savable
   158  type TCPFastRecoveryState struct {
   159  	// Active, if true, indicates the endpoint is in fast recovery. The
   160  	// following fields are only meaningful when Active is true.
   161  	Active bool
   162  
   163  	// First is the first unacknowledged sequence number being recovered.
   164  	First seqnum.Value
   165  
   166  	// Last is the 'recover' sequence number that indicates the point at
   167  	// which we should exit recovery barring any timeouts etc.
   168  	Last seqnum.Value
   169  
   170  	// MaxCwnd is the maximum value to which we are permitted to grow the
   171  	// congestion window during recovery. This is set at the time we enter
   172  	// recovery. It exists to avoid attacks where the receiver intentionally
   173  	// sends duplicate acks to artificially inflate the sender's cwnd.
   174  	MaxCwnd int
   175  
   176  	// HighRxt is the highest sequence number which has been retransmitted
   177  	// during the current loss recovery phase. See RFC 6675 Section 2 for
   178  	// details.
   179  	HighRxt seqnum.Value
   180  
   181  	// RescueRxt is the highest sequence number which has been
   182  	// optimistically retransmitted to prevent stalling of the ACK clock
   183  	// when there is loss at the end of the window and no new data is
   184  	// available for transmission. See RFC 6675 Section 2 for details.
   185  	RescueRxt seqnum.Value
   186  }
   187  
   188  // TCPReceiverState holds a copy of the internal state of the receiver for a
   189  // given TCP endpoint.
   190  //
   191  // +stateify savable
   192  type TCPReceiverState struct {
   193  	// RcvNxt is the TCP variable RCV.NXT.
   194  	RcvNxt seqnum.Value
   195  
   196  	// RcvAcc is one beyond the last acceptable sequence number. That is,
   197  	// the "largest" sequence value that the receiver has announced to its
   198  	// peer that it's willing to accept. This may be different from RcvNxt
   199  	// + (last advertised receive window) if the receive window is reduced;
   200  	// in that case we have to reduce the window as we receive more data
   201  	// instead of shrinking it.
   202  	RcvAcc seqnum.Value
   203  
   204  	// RcvWndScale is the window scaling to use for inbound segments.
   205  	RcvWndScale uint8
   206  
   207  	// PendingBufUsed is the number of bytes pending in the receive queue.
   208  	PendingBufUsed int
   209  }
   210  
   211  // TCPRTTState holds a copy of information about the endpoint's round trip
   212  // time.
   213  //
   214  // +stateify savable
   215  type TCPRTTState struct {
   216  	// SRTT is the smoothed round trip time as defined in section 2 of RFC
   217  	// 6298.
   218  	SRTT time.Duration
   219  
   220  	// RTTVar is the round-trip time variation as defined in section 2 of
   221  	// RFC 6298.
   222  	RTTVar time.Duration
   223  
   224  	// SRTTInited, if true, indicates that a valid RTT measurement has been
   225  	// completed.
   226  	SRTTInited bool
   227  }
   228  
   229  // TCPSenderState holds a copy of the internal state of the sender for a given
   230  // TCP endpoint.
   231  //
   232  // +stateify savable
   233  type TCPSenderState struct {
   234  	// LastSendTime is the timestamp at which we sent the last segment.
   235  	LastSendTime tcpip.MonotonicTime
   236  
   237  	// DupAckCount is the number of duplicate ACKs received. It is used for
   238  	// fast retransmit.
   239  	DupAckCount int
   240  
   241  	// SndCwnd is the size of the sending congestion window in packets.
   242  	SndCwnd int
   243  
   244  	// Ssthresh is the threshold between slow start and congestion
   245  	// avoidance.
   246  	Ssthresh int
   247  
   248  	// SndCAAckCount is the number of packets acknowledged during
   249  	// congestion avoidance. When enough packets have been ack'd (typically
   250  	// cwnd packets), the congestion window is incremented by one.
   251  	SndCAAckCount int
   252  
   253  	// Outstanding is the number of packets that have been sent but not yet
   254  	// acknowledged.
   255  	Outstanding int
   256  
   257  	// SackedOut is the number of packets which have been selectively
   258  	// acked.
   259  	SackedOut int
   260  
   261  	// SndWnd is the send window size in bytes.
   262  	SndWnd seqnum.Size
   263  
   264  	// SndUna is the next unacknowledged sequence number.
   265  	SndUna seqnum.Value
   266  
   267  	// SndNxt is the sequence number of the next segment to be sent.
   268  	SndNxt seqnum.Value
   269  
   270  	// RTTMeasureSeqNum is the sequence number being used for the latest
   271  	// RTT measurement.
   272  	RTTMeasureSeqNum seqnum.Value
   273  
   274  	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
   275  	RTTMeasureTime tcpip.MonotonicTime
   276  
   277  	// Closed indicates that the caller has closed the endpoint for
   278  	// sending.
   279  	Closed bool
   280  
   281  	// RTO is the retransmit timeout as defined in section 2 of RFC
   282  	// 6298.
   283  	RTO time.Duration
   284  
   285  	// RTTState holds information about the endpoint's round trip time.
   286  	RTTState TCPRTTState
   287  
   288  	// MaxPayloadSize is the maximum size of the payload of a given
   289  	// segment. It is initialized on demand.
   290  	MaxPayloadSize int
   291  
   292  	// SndWndScale is the number of bits to shift left when reading the
   293  	// send window size from a segment.
   294  	SndWndScale uint8
   295  
   296  	// MaxSentAck is the highest acknowledgement number sent till now.
   297  	MaxSentAck seqnum.Value
   298  
   299  	// FastRecovery holds the fast recovery state for the endpoint.
   300  	FastRecovery TCPFastRecoveryState
   301  
   302  	// Cubic holds the state related to CUBIC congestion control.
   303  	Cubic TCPCubicState
   304  
   305  	// RACKState holds the state related to the RACK loss detection algorithm.
   306  	RACKState TCPRACKState
   307  
   308  	// RetransmitTS records the timestamp used to detect spurious recovery.
   309  	RetransmitTS uint32
   310  
   311  	// SpuriousRecovery indicates if the sender entered recovery spuriously.
   312  	SpuriousRecovery bool
   313  }
   314  
   315  // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
   316  //
   317  // +stateify savable
   318  type TCPSACKInfo struct {
   319  	// Blocks is the list of SACK blocks that identify the out of order
   320  	// segments held by a given TCP endpoint.
   321  	Blocks []header.SACKBlock
   322  
   323  	// ReceivedBlocks are the SACK blocks received by this endpoint from
   324  	// the peer endpoint.
   325  	ReceivedBlocks []header.SACKBlock
   326  
   327  	// MaxSACKED is the highest sequence number that has been SACKed by the
   328  	// peer.
   329  	MaxSACKED seqnum.Value
   330  }
   331  
   332  // RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
   333  //
   334  // +stateify savable
   335  type RcvBufAutoTuneParams struct {
   336  	// MeasureTime is the time at which the current measurement was
   337  	// started.
   338  	MeasureTime tcpip.MonotonicTime
   339  
   340  	// CopiedBytes is the number of bytes copied to user space since this
   341  	// measurement began.
   342  	CopiedBytes int
   343  
   344  	// PrevCopiedBytes is the number of bytes copied to user space in the
   345  	// previous RTT period.
   346  	PrevCopiedBytes int
   347  
   348  	// RcvBufSize is the auto tuned receive buffer size.
   349  	RcvBufSize int
   350  
   351  	// RTT is the smoothed RTT as measured by observing the time between
   352  	// when a byte is first acknowledged and the receipt of data that is at
   353  	// least one window beyond the sequence number that was acknowledged.
   354  	RTT time.Duration
   355  
   356  	// RTTVar is the "round-trip time variation" as defined in section 2 of
   357  	// RFC 6298.
   358  	RTTVar time.Duration
   359  
   360  	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
   361  	// time this RTT measurement period began.
   362  	RTTMeasureSeqNumber seqnum.Value
   363  
   364  	// RTTMeasureTime is the absolute time at which the current RTT
   365  	// measurement period began.
   366  	RTTMeasureTime tcpip.MonotonicTime
   367  
   368  	// Disabled is true if an explicit receive buffer is set for the
   369  	// endpoint.
   370  	Disabled bool
   371  }
   372  
   373  // TCPRcvBufState contains information about the state of an endpoint's receive
   374  // socket buffer.
   375  //
   376  // +stateify savable
   377  type TCPRcvBufState struct {
   378  	// RcvBufUsed is the number of bytes actually held in the receive
   379  	// socket buffer for the endpoint.
   380  	RcvBufUsed int
   381  
   382  	// RcvAutoParams is used to hold state variables to compute the
   383  	// auto tuned receive buffer size.
   384  	RcvAutoParams RcvBufAutoTuneParams
   385  
   386  	// RcvClosed, if true, indicates the endpoint has been closed for
   387  	// reading.
   388  	RcvClosed bool
   389  }
   390  
   391  // TCPSndBufState contains information about the state of an endpoint's send
   392  // socket buffer.
   393  //
   394  // +stateify savable
   395  type TCPSndBufState struct {
   396  	// SndBufSize is the size of the socket send buffer.
   397  	SndBufSize int
   398  
   399  	// SndBufUsed is the number of bytes held in the socket send buffer.
   400  	SndBufUsed int
   401  
   402  	// SndClosed indicates that the endpoint has been closed for sends.
   403  	SndClosed bool
   404  
   405  	// PacketTooBigCount is used to notify the main protocol routine how
   406  	// many times a "packet too big" control packet is received.
   407  	PacketTooBigCount int
   408  
   409  	// SndMTU is the smallest MTU seen in the control packets received.
   410  	SndMTU int
   411  
   412  	// AutoTuneSndBufDisabled indicates that the auto tuning of the send
   413  	// buffer is disabled.
   414  	AutoTuneSndBufDisabled atomicbitops.Uint32
   415  }
   416  
   417  // TCPEndpointStateInner contains the members of TCPEndpointState used directly
   418  // (that is, not within another containing struct) within the endpoint's
   419  // internal implementation.
   420  //
   421  // +stateify savable
   422  type TCPEndpointStateInner struct {
   423  	// TSOffset is a randomized offset added to the value of the TSVal
   424  	// field in the timestamp option.
   425  	TSOffset tcp.TSOffset
   426  
   427  	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
   428  	// option in the SYN/SYN-ACK.
   429  	SACKPermitted bool
   430  
   431  	// SendTSOk is used to indicate when the TS Option has been negotiated.
   432  	// When SendTSOk is true every non-RST segment should carry a TS as per
   433  	// RFC7323#section-1.1.
   434  	SendTSOk bool
   435  
   436  	// RecentTS is the timestamp that should be sent in the TSEcr field of
   437  	// the timestamp for future segments sent by the endpoint. This field
   438  	// is updated if required when a new segment is received by this
   439  	// endpoint.
   440  	RecentTS uint32
   441  }
   442  
   443  // TCPEndpointState is a copy of the internal state of a TCP endpoint.
   444  //
   445  // +stateify savable
   446  type TCPEndpointState struct {
   447  	// TCPEndpointStateInner contains the members of TCPEndpointState used
   448  	// directly by the endpoint's internal implementation.
   449  	TCPEndpointStateInner
   450  
   451  	// ID is a copy of the TransportEndpointID for the endpoint.
   452  	ID TCPEndpointID
   453  
   454  	// SegTime denotes the absolute time when this segment was received.
   455  	SegTime tcpip.MonotonicTime
   456  
   457  	// RcvBufState contains information about the state of the endpoint's
   458  	// receive socket buffer.
   459  	RcvBufState TCPRcvBufState
   460  
   461  	// SndBufState contains information about the state of the endpoint's
   462  	// send socket buffer.
   463  	SndBufState TCPSndBufState
   464  
   465  	// SACK holds TCP SACK related information for this endpoint.
   466  	SACK TCPSACKInfo
   467  
   468  	// Receiver holds variables related to the TCP receiver for the
   469  	// endpoint.
   470  	Receiver TCPReceiverState
   471  
   472  	// Sender holds state related to the TCP sender for the endpoint.
   473  	Sender TCPSenderState
   474  }