github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/transport/tcp/protocol.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tcp contains the implementation of the TCP transport protocol.
    16  package tcp
    17  
import (
	"crypto/sha256"
	"encoding/binary"
	"fmt"
	"io"
	"runtime"
	"strings"
	"time"

	"github.com/sagernet/gvisor/pkg/sync"
	"github.com/sagernet/gvisor/pkg/tcpip"
	"github.com/sagernet/gvisor/pkg/tcpip/header"
	"github.com/sagernet/gvisor/pkg/tcpip/header/parse"
	"github.com/sagernet/gvisor/pkg/tcpip/internal/tcp"
	"github.com/sagernet/gvisor/pkg/tcpip/seqnum"
	"github.com/sagernet/gvisor/pkg/tcpip/stack"
	"github.com/sagernet/gvisor/pkg/tcpip/transport/raw"
	"github.com/sagernet/gvisor/pkg/waiter"
)
    36  
    37  const (
    38  	// ProtocolNumber is the tcp protocol number.
    39  	ProtocolNumber = header.TCPProtocolNumber
    40  
    41  	// MinBufferSize is the smallest size of a receive or send buffer.
    42  	MinBufferSize = 4 << 10 // 4096 bytes.
    43  
    44  	// DefaultSendBufferSize is the default size of the send buffer for
    45  	// an endpoint.
    46  	DefaultSendBufferSize = 1 << 20 // 1MB
    47  
    48  	// DefaultReceiveBufferSize is the default size of the receive buffer
    49  	// for an endpoint.
    50  	DefaultReceiveBufferSize = 1 << 20 // 1MB
    51  
    52  	// MaxBufferSize is the largest size a receive/send buffer can grow to.
    53  	MaxBufferSize = 4 << 20 // 4MB
    54  
    55  	// DefaultTCPLingerTimeout is the amount of time that sockets linger in
    56  	// FIN_WAIT_2 state before being marked closed.
    57  	DefaultTCPLingerTimeout = 60 * time.Second
    58  
    59  	// MaxTCPLingerTimeout is the maximum amount of time that sockets
    60  	// linger in FIN_WAIT_2 state before being marked closed.
    61  	MaxTCPLingerTimeout = 120 * time.Second
    62  
    63  	// DefaultTCPTimeWaitTimeout is the amount of time that sockets linger
    64  	// in TIME_WAIT state before being marked closed.
    65  	DefaultTCPTimeWaitTimeout = 60 * time.Second
    66  
    67  	// DefaultSynRetries is the default value for the number of SYN retransmits
    68  	// before a connect is aborted.
    69  	DefaultSynRetries = 6
    70  
    71  	// DefaultKeepaliveIdle is the idle time for a connection before keep-alive
    72  	// probes are sent.
    73  	DefaultKeepaliveIdle = 2 * time.Hour
    74  
    75  	// DefaultKeepaliveInterval is the time between two successive keep-alive
    76  	// probes.
    77  	DefaultKeepaliveInterval = 75 * time.Second
    78  
    79  	// DefaultKeepaliveCount is the number of keep-alive probes that are sent
    80  	// before declaring the connection dead.
    81  	DefaultKeepaliveCount = 9
    82  )
    83  
    84  const (
    85  	ccReno  = "reno"
    86  	ccCubic = "cubic"
    87  )
    88  
// protocol holds the TCP transport protocol state created by NewProtocol:
// the owning stack, the protocol-wide option values served by SetOption and
// Option, the packet dispatcher, and two random secrets.
type protocol struct {
	// stack is the stack passed to NewProtocol.
	stack *stack.Stack

	// mu is held by SetOption, Option and SendBufferSize while they access
	// the option fields below. Note that SetOption also iterates
	// availableCongestionControl without holding mu; within this file that
	// slice is only assigned in NewProtocol.
	mu                         sync.RWMutex
	sackEnabled                bool
	recovery                   tcpip.TCPRecovery
	delayEnabled               bool
	alwaysUseSynCookies        bool
	sendBufferSize             tcpip.TCPSendBufferSizeRangeOption
	recvBufferSize             tcpip.TCPReceiveBufferSizeRangeOption
	congestionControl          string
	availableCongestionControl []string
	moderateReceiveBuffer      bool
	lingerTimeout              time.Duration
	timeWaitTimeout            time.Duration
	timeWaitReuse              tcpip.TCPTimeWaitReuseOption
	minRTO                     time.Duration
	maxRTO                     time.Duration
	maxRetries                 uint32
	synRetries                 uint8
	// dispatcher is not an option value: QueuePacket hands packets to it,
	// and Close, Wait, Pause and Resume forward to it. It is initialized in
	// NewProtocol.
	dispatcher                 dispatcher

	// The following secrets are initialized once and stay unchanged after.
	// Both are filled from the stack's secure RNG in NewProtocol.
	// tsOffsetSecret is mixed into the hash computed by tsOffset;
	// seqnumSecret is not referenced by any other code visible in this file.
	seqnumSecret   [16]byte
	tsOffsetSecret [16]byte
}
   115  
// Number returns ProtocolNumber, the transport protocol number used for TCP.
func (*protocol) Number() tcpip.TransportProtocolNumber {
	return ProtocolNumber
}
   120  
   121  // NewEndpoint creates a new tcp endpoint.
   122  func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
   123  	return newEndpoint(p.stack, p, netProto, waiterQueue), nil
   124  }
   125  
   126  // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently
   127  // unsupported. It implements stack.TransportProtocol.NewRawEndpoint.
   128  func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
   129  	return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue)
   130  }
   131  
// MinimumPacketSize returns the minimum valid TCP packet size, which is
// header.TCPMinimumSize.
func (*protocol) MinimumPacketSize() int {
	return header.TCPMinimumSize
}
   136  
   137  // ParsePorts returns the source and destination ports stored in the given tcp
   138  // packet.
   139  func (*protocol) ParsePorts(v []byte) (src, dst uint16, err tcpip.Error) {
   140  	h := header.TCP(v)
   141  	return h.SourcePort(), h.DestinationPort(), nil
   142  }
   143  
   144  // QueuePacket queues packets targeted at an endpoint after hashing the packet
   145  // to a specific processing queue. Each queue is serviced by its own processor
   146  // goroutine which is responsible for dequeuing and doing full TCP dispatch of
   147  // the packet.
   148  func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
   149  	p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt)
   150  }
   151  
   152  // HandleUnknownDestinationPacket handles packets targeted at this protocol but
   153  // that don't match any existing endpoint.
   154  //
   155  // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then
   156  // a reset is sent in response to any incoming segment except another reset. In
   157  // particular, SYNs addressed to a non-existent connection are rejected by this
   158  // means."
   159  func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition {
   160  	s, err := newIncomingSegment(id, p.stack.Clock(), pkt)
   161  	if err != nil {
   162  		return stack.UnknownDestinationPacketMalformed
   163  	}
   164  	defer s.DecRef()
   165  	if !s.csumValid {
   166  		return stack.UnknownDestinationPacketMalformed
   167  	}
   168  
   169  	if !s.flags.Contains(header.TCPFlagRst) {
   170  		replyWithReset(p.stack, s, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit)
   171  	}
   172  
   173  	return stack.UnknownDestinationPacketHandled
   174  }
   175  
   176  func (p *protocol) tsOffset(src, dst tcpip.Address) tcp.TSOffset {
   177  	// Initialize a random tsOffset that will be added to the recentTS
   178  	// everytime the timestamp is sent when the Timestamp option is enabled.
   179  	//
   180  	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
   181  	// why this is required.
   182  	h := sha256.New()
   183  
   184  	// Per hash.Hash.Writer:
   185  	//
   186  	// It never returns an error.
   187  	_, _ = h.Write(p.tsOffsetSecret[:])
   188  	_, _ = h.Write(src.AsSlice())
   189  	_, _ = h.Write(dst.AsSlice())
   190  	return tcp.NewTSOffset(binary.LittleEndian.Uint32(h.Sum(nil)[:4]))
   191  }
   192  
   193  // replyWithReset replies to the given segment with a reset segment.
   194  //
   195  // If the relevant TTL has its reset value (0 for ipv4TTL, -1 for ipv6HopLimit),
   196  // then the route's default TTL will be used.
   197  func replyWithReset(st *stack.Stack, s *segment, tos, ipv4TTL uint8, ipv6HopLimit int16) tcpip.Error {
   198  	net := s.pkt.Network()
   199  	route, err := st.FindRoute(s.pkt.NICID, net.DestinationAddress(), net.SourceAddress(), s.pkt.NetworkProtocolNumber, false /* multicastLoop */)
   200  	if err != nil {
   201  		return err
   202  	}
   203  	defer route.Release()
   204  
   205  	ttl := calculateTTL(route, ipv4TTL, ipv6HopLimit)
   206  
   207  	// Get the seqnum from the packet if the ack flag is set.
   208  	seq := seqnum.Value(0)
   209  	ack := seqnum.Value(0)
   210  	flags := header.TCPFlagRst
   211  	// As per RFC 793 page 35 (Reset Generation)
   212  	//   1.  If the connection does not exist (CLOSED) then a reset is sent
   213  	//   in response to any incoming segment except another reset.  In
   214  	//   particular, SYNs addressed to a non-existent connection are rejected
   215  	//   by this means.
   216  
   217  	//   If the incoming segment has an ACK field, the reset takes its
   218  	//   sequence number from the ACK field of the segment, otherwise the
   219  	//   reset has sequence number zero and the ACK field is set to the sum
   220  	//   of the sequence number and segment length of the incoming segment.
   221  	//   The connection remains in the CLOSED state.
   222  	if s.flags.Contains(header.TCPFlagAck) {
   223  		seq = s.ackNumber
   224  	} else {
   225  		flags |= header.TCPFlagAck
   226  		ack = s.sequenceNumber.Add(s.logicalLen())
   227  	}
   228  
   229  	p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(route.MaxHeaderLength())})
   230  	defer p.DecRef()
   231  	return sendTCP(route, tcpFields{
   232  		id:     s.id,
   233  		ttl:    ttl,
   234  		tos:    tos,
   235  		flags:  flags,
   236  		seq:    seq,
   237  		ack:    ack,
   238  		rcvWnd: 0,
   239  	}, p, stack.GSO{}, nil /* PacketOwner */)
   240  }
   241  
   242  // SetOption implements stack.TransportProtocol.SetOption.
   243  func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error {
   244  	switch v := option.(type) {
   245  	case *tcpip.TCPSACKEnabled:
   246  		p.mu.Lock()
   247  		p.sackEnabled = bool(*v)
   248  		p.mu.Unlock()
   249  		return nil
   250  
   251  	case *tcpip.TCPRecovery:
   252  		p.mu.Lock()
   253  		p.recovery = *v
   254  		p.mu.Unlock()
   255  		return nil
   256  
   257  	case *tcpip.TCPDelayEnabled:
   258  		p.mu.Lock()
   259  		p.delayEnabled = bool(*v)
   260  		p.mu.Unlock()
   261  		return nil
   262  
   263  	case *tcpip.TCPSendBufferSizeRangeOption:
   264  		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
   265  			return &tcpip.ErrInvalidOptionValue{}
   266  		}
   267  		p.mu.Lock()
   268  		p.sendBufferSize = *v
   269  		p.mu.Unlock()
   270  		return nil
   271  
   272  	case *tcpip.TCPReceiveBufferSizeRangeOption:
   273  		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
   274  			return &tcpip.ErrInvalidOptionValue{}
   275  		}
   276  		p.mu.Lock()
   277  		p.recvBufferSize = *v
   278  		p.mu.Unlock()
   279  		return nil
   280  
   281  	case *tcpip.CongestionControlOption:
   282  		for _, c := range p.availableCongestionControl {
   283  			if string(*v) == c {
   284  				p.mu.Lock()
   285  				p.congestionControl = string(*v)
   286  				p.mu.Unlock()
   287  				return nil
   288  			}
   289  		}
   290  		// linux returns ENOENT when an invalid congestion control
   291  		// is specified.
   292  		return &tcpip.ErrNoSuchFile{}
   293  
   294  	case *tcpip.TCPModerateReceiveBufferOption:
   295  		p.mu.Lock()
   296  		p.moderateReceiveBuffer = bool(*v)
   297  		p.mu.Unlock()
   298  		return nil
   299  
   300  	case *tcpip.TCPLingerTimeoutOption:
   301  		p.mu.Lock()
   302  		if *v < 0 {
   303  			p.lingerTimeout = 0
   304  		} else {
   305  			p.lingerTimeout = time.Duration(*v)
   306  		}
   307  		p.mu.Unlock()
   308  		return nil
   309  
   310  	case *tcpip.TCPTimeWaitTimeoutOption:
   311  		p.mu.Lock()
   312  		if *v < 0 {
   313  			p.timeWaitTimeout = 0
   314  		} else {
   315  			p.timeWaitTimeout = time.Duration(*v)
   316  		}
   317  		p.mu.Unlock()
   318  		return nil
   319  
   320  	case *tcpip.TCPTimeWaitReuseOption:
   321  		if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly {
   322  			return &tcpip.ErrInvalidOptionValue{}
   323  		}
   324  		p.mu.Lock()
   325  		p.timeWaitReuse = *v
   326  		p.mu.Unlock()
   327  		return nil
   328  
   329  	case *tcpip.TCPMinRTOOption:
   330  		p.mu.Lock()
   331  		defer p.mu.Unlock()
   332  		if *v < 0 {
   333  			p.minRTO = MinRTO
   334  		} else if minRTO := time.Duration(*v); minRTO <= p.maxRTO {
   335  			p.minRTO = minRTO
   336  		} else {
   337  			return &tcpip.ErrInvalidOptionValue{}
   338  		}
   339  		return nil
   340  
   341  	case *tcpip.TCPMaxRTOOption:
   342  		p.mu.Lock()
   343  		defer p.mu.Unlock()
   344  		if *v < 0 {
   345  			p.maxRTO = MaxRTO
   346  		} else if maxRTO := time.Duration(*v); maxRTO >= p.minRTO {
   347  			p.maxRTO = maxRTO
   348  		} else {
   349  			return &tcpip.ErrInvalidOptionValue{}
   350  		}
   351  		return nil
   352  
   353  	case *tcpip.TCPMaxRetriesOption:
   354  		p.mu.Lock()
   355  		p.maxRetries = uint32(*v)
   356  		p.mu.Unlock()
   357  		return nil
   358  
   359  	case *tcpip.TCPAlwaysUseSynCookies:
   360  		p.mu.Lock()
   361  		p.alwaysUseSynCookies = bool(*v)
   362  		p.mu.Unlock()
   363  		return nil
   364  
   365  	case *tcpip.TCPSynRetriesOption:
   366  		if *v < 1 || *v > 255 {
   367  			return &tcpip.ErrInvalidOptionValue{}
   368  		}
   369  		p.mu.Lock()
   370  		p.synRetries = uint8(*v)
   371  		p.mu.Unlock()
   372  		return nil
   373  
   374  	default:
   375  		return &tcpip.ErrUnknownProtocolOption{}
   376  	}
   377  }
   378  
   379  // Option implements stack.TransportProtocol.Option.
   380  func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error {
   381  	switch v := option.(type) {
   382  	case *tcpip.TCPSACKEnabled:
   383  		p.mu.RLock()
   384  		*v = tcpip.TCPSACKEnabled(p.sackEnabled)
   385  		p.mu.RUnlock()
   386  		return nil
   387  
   388  	case *tcpip.TCPRecovery:
   389  		p.mu.RLock()
   390  		*v = p.recovery
   391  		p.mu.RUnlock()
   392  		return nil
   393  
   394  	case *tcpip.TCPDelayEnabled:
   395  		p.mu.RLock()
   396  		*v = tcpip.TCPDelayEnabled(p.delayEnabled)
   397  		p.mu.RUnlock()
   398  		return nil
   399  
   400  	case *tcpip.TCPSendBufferSizeRangeOption:
   401  		p.mu.RLock()
   402  		*v = p.sendBufferSize
   403  		p.mu.RUnlock()
   404  		return nil
   405  
   406  	case *tcpip.TCPReceiveBufferSizeRangeOption:
   407  		p.mu.RLock()
   408  		*v = p.recvBufferSize
   409  		p.mu.RUnlock()
   410  		return nil
   411  
   412  	case *tcpip.CongestionControlOption:
   413  		p.mu.RLock()
   414  		*v = tcpip.CongestionControlOption(p.congestionControl)
   415  		p.mu.RUnlock()
   416  		return nil
   417  
   418  	case *tcpip.TCPAvailableCongestionControlOption:
   419  		p.mu.RLock()
   420  		*v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
   421  		p.mu.RUnlock()
   422  		return nil
   423  
   424  	case *tcpip.TCPModerateReceiveBufferOption:
   425  		p.mu.RLock()
   426  		*v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer)
   427  		p.mu.RUnlock()
   428  		return nil
   429  
   430  	case *tcpip.TCPLingerTimeoutOption:
   431  		p.mu.RLock()
   432  		*v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout)
   433  		p.mu.RUnlock()
   434  		return nil
   435  
   436  	case *tcpip.TCPTimeWaitTimeoutOption:
   437  		p.mu.RLock()
   438  		*v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout)
   439  		p.mu.RUnlock()
   440  		return nil
   441  
   442  	case *tcpip.TCPTimeWaitReuseOption:
   443  		p.mu.RLock()
   444  		*v = p.timeWaitReuse
   445  		p.mu.RUnlock()
   446  		return nil
   447  
   448  	case *tcpip.TCPMinRTOOption:
   449  		p.mu.RLock()
   450  		*v = tcpip.TCPMinRTOOption(p.minRTO)
   451  		p.mu.RUnlock()
   452  		return nil
   453  
   454  	case *tcpip.TCPMaxRTOOption:
   455  		p.mu.RLock()
   456  		*v = tcpip.TCPMaxRTOOption(p.maxRTO)
   457  		p.mu.RUnlock()
   458  		return nil
   459  
   460  	case *tcpip.TCPMaxRetriesOption:
   461  		p.mu.RLock()
   462  		*v = tcpip.TCPMaxRetriesOption(p.maxRetries)
   463  		p.mu.RUnlock()
   464  		return nil
   465  
   466  	case *tcpip.TCPAlwaysUseSynCookies:
   467  		p.mu.RLock()
   468  		*v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies)
   469  		p.mu.RUnlock()
   470  		return nil
   471  
   472  	case *tcpip.TCPSynRetriesOption:
   473  		p.mu.RLock()
   474  		*v = tcpip.TCPSynRetriesOption(p.synRetries)
   475  		p.mu.RUnlock()
   476  		return nil
   477  
   478  	default:
   479  		return &tcpip.ErrUnknownProtocolOption{}
   480  	}
   481  }
   482  
   483  // SendBufferSize implements stack.SendBufSizeProto.
   484  func (p *protocol) SendBufferSize() tcpip.TCPSendBufferSizeRangeOption {
   485  	p.mu.RLock()
   486  	defer p.mu.RUnlock()
   487  	return p.sendBufferSize
   488  }
   489  
// Close implements stack.TransportProtocol.Close by forwarding to the
// dispatcher's close method.
func (p *protocol) Close() {
	p.dispatcher.close()
}
   494  
// Wait implements stack.TransportProtocol.Wait by forwarding to the
// dispatcher's wait method.
//
// NOTE(review): presumably this blocks until the dispatcher's processor
// goroutines have finished; confirm against dispatcher.wait.
func (p *protocol) Wait() {
	p.dispatcher.wait()
}
   499  
// Pause implements stack.TransportProtocol.Pause by forwarding to the
// dispatcher's pause method.
func (p *protocol) Pause() {
	p.dispatcher.pause()
}
   504  
// Resume implements stack.TransportProtocol.Resume by forwarding to the
// dispatcher's resume method.
func (p *protocol) Resume() {
	p.dispatcher.resume()
}
   509  
// Parse implements stack.TransportProtocol.Parse by delegating to parse.TCP
// and returning its result unchanged.
func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
	return parse.TCP(pkt)
}
   514  
   515  // NewProtocol returns a TCP transport protocol.
   516  func NewProtocol(s *stack.Stack) stack.TransportProtocol {
   517  	rng := s.SecureRNG()
   518  	var seqnumSecret [16]byte
   519  	var tsOffsetSecret [16]byte
   520  	if n, err := rng.Reader.Read(seqnumSecret[:]); err != nil || n != len(seqnumSecret) {
   521  		panic(fmt.Sprintf("Read() failed: %v", err))
   522  	}
   523  	if n, err := rng.Reader.Read(tsOffsetSecret[:]); err != nil || n != len(tsOffsetSecret) {
   524  		panic(fmt.Sprintf("Read() failed: %v", err))
   525  	}
   526  	p := protocol{
   527  		stack: s,
   528  		sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{
   529  			Min:     MinBufferSize,
   530  			Default: DefaultSendBufferSize,
   531  			Max:     MaxBufferSize,
   532  		},
   533  		recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{
   534  			Min:     MinBufferSize,
   535  			Default: DefaultReceiveBufferSize,
   536  			Max:     MaxBufferSize,
   537  		},
   538  		congestionControl:          ccReno,
   539  		availableCongestionControl: []string{ccReno, ccCubic},
   540  		moderateReceiveBuffer:      true,
   541  		lingerTimeout:              DefaultTCPLingerTimeout,
   542  		timeWaitTimeout:            DefaultTCPTimeWaitTimeout,
   543  		timeWaitReuse:              tcpip.TCPTimeWaitReuseLoopbackOnly,
   544  		synRetries:                 DefaultSynRetries,
   545  		minRTO:                     MinRTO,
   546  		maxRTO:                     MaxRTO,
   547  		maxRetries:                 MaxRetries,
   548  		recovery:                   tcpip.TCPRACKLossDetection,
   549  		seqnumSecret:               seqnumSecret,
   550  		tsOffsetSecret:             tsOffsetSecret,
   551  	}
   552  	p.dispatcher.init(s.InsecureRNG(), runtime.GOMAXPROCS(0))
   553  	return &p
   554  }
   555  
   556  // protocolFromStack retrieves the tcp.protocol instance from stack s.
   557  func protocolFromStack(s *stack.Stack) *protocol {
   558  	return s.TransportProtocolInstance(ProtocolNumber).(*protocol)
   559  }