github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/protocol.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tcp contains the implementation of the TCP transport protocol.
    16  package tcp
    17  
    18  import (
    19  	"runtime"
    20  	"strings"
    21  	"time"
    22  
    23  	"github.com/SagerNet/gvisor/pkg/sync"
    24  	"github.com/SagerNet/gvisor/pkg/tcpip"
    25  	"github.com/SagerNet/gvisor/pkg/tcpip/buffer"
    26  	"github.com/SagerNet/gvisor/pkg/tcpip/header"
    27  	"github.com/SagerNet/gvisor/pkg/tcpip/header/parse"
    28  	"github.com/SagerNet/gvisor/pkg/tcpip/seqnum"
    29  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    30  	"github.com/SagerNet/gvisor/pkg/tcpip/transport/raw"
    31  	"github.com/SagerNet/gvisor/pkg/waiter"
    32  )
    33  
    34  const (
    35  	// ProtocolNumber is the tcp protocol number.
    36  	ProtocolNumber = header.TCPProtocolNumber
    37  
    38  	// MinBufferSize is the smallest size of a receive or send buffer.
    39  	MinBufferSize = 4 << 10 // 4096 bytes.
    40  
    41  	// DefaultSendBufferSize is the default size of the send buffer for
    42  	// an endpoint.
    43  	DefaultSendBufferSize = 1 << 20 // 1MB
    44  
    45  	// DefaultReceiveBufferSize is the default size of the receive buffer
    46  	// for an endpoint.
    47  	DefaultReceiveBufferSize = 1 << 20 // 1MB
    48  
    49  	// MaxBufferSize is the largest size a receive/send buffer can grow to.
    50  	MaxBufferSize = 4 << 20 // 4MB
    51  
    52  	// MaxUnprocessedSegments is the maximum number of unprocessed segments
    53  	// that can be queued for a given endpoint.
    54  	MaxUnprocessedSegments = 300
    55  
    56  	// DefaultTCPLingerTimeout is the amount of time that sockets linger in
    57  	// FIN_WAIT_2 state before being marked closed.
    58  	DefaultTCPLingerTimeout = 60 * time.Second
    59  
    60  	// MaxTCPLingerTimeout is the maximum amount of time that sockets
    61  	// linger in FIN_WAIT_2 state before being marked closed.
    62  	MaxTCPLingerTimeout = 120 * time.Second
    63  
    64  	// DefaultTCPTimeWaitTimeout is the amount of time that sockets linger
    65  	// in TIME_WAIT state before being marked closed.
    66  	DefaultTCPTimeWaitTimeout = 60 * time.Second
    67  
    68  	// DefaultSynRetries is the default value for the number of SYN retransmits
    69  	// before a connect is aborted.
    70  	DefaultSynRetries = 6
    71  )
    72  
    73  const (
    74  	ccReno  = "reno"
    75  	ccCubic = "cubic"
    76  )
    77  
    78  type protocol struct {
    79  	stack *stack.Stack
    80  
    81  	mu                         sync.RWMutex
    82  	sackEnabled                bool
    83  	recovery                   tcpip.TCPRecovery
    84  	delayEnabled               bool
    85  	alwaysUseSynCookies        bool
    86  	sendBufferSize             tcpip.TCPSendBufferSizeRangeOption
    87  	recvBufferSize             tcpip.TCPReceiveBufferSizeRangeOption
    88  	congestionControl          string
    89  	availableCongestionControl []string
    90  	moderateReceiveBuffer      bool
    91  	lingerTimeout              time.Duration
    92  	timeWaitTimeout            time.Duration
    93  	timeWaitReuse              tcpip.TCPTimeWaitReuseOption
    94  	minRTO                     time.Duration
    95  	maxRTO                     time.Duration
    96  	maxRetries                 uint32
    97  	synRetries                 uint8
    98  	dispatcher                 dispatcher
    99  }
   100  
   101  // Number returns the tcp protocol number.
   102  func (*protocol) Number() tcpip.TransportProtocolNumber {
   103  	return ProtocolNumber
   104  }
   105  
   106  // NewEndpoint creates a new tcp endpoint.
   107  func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
   108  	return newEndpoint(p.stack, netProto, waiterQueue), nil
   109  }
   110  
   111  // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently
   112  // unsupported. It implements stack.TransportProtocol.NewRawEndpoint.
   113  func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
   114  	return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue)
   115  }
   116  
   117  // MinimumPacketSize returns the minimum valid tcp packet size.
   118  func (*protocol) MinimumPacketSize() int {
   119  	return header.TCPMinimumSize
   120  }
   121  
   122  // ParsePorts returns the source and destination ports stored in the given tcp
   123  // packet.
   124  func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err tcpip.Error) {
   125  	h := header.TCP(v)
   126  	return h.SourcePort(), h.DestinationPort(), nil
   127  }
   128  
   129  // QueuePacket queues packets targeted at an endpoint after hashing the packet
   130  // to a specific processing queue. Each queue is serviced by its own processor
   131  // goroutine which is responsible for dequeuing and doing full TCP dispatch of
   132  // the packet.
   133  func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
   134  	p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt)
   135  }
   136  
   137  // HandleUnknownDestinationPacket handles packets targeted at this protocol but
   138  // that don't match any existing endpoint.
   139  //
   140  // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then
   141  // a reset is sent in response to any incoming segment except another reset. In
   142  // particular, SYNs addressed to a non-existent connection are rejected by this
   143  // means."
   144  func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition {
   145  	s := newIncomingSegment(id, p.stack.Clock(), pkt)
   146  	defer s.decRef()
   147  
   148  	if !s.parse(pkt.RXTransportChecksumValidated) || !s.csumValid {
   149  		return stack.UnknownDestinationPacketMalformed
   150  	}
   151  
   152  	if !s.flags.Contains(header.TCPFlagRst) {
   153  		replyWithReset(p.stack, s, stack.DefaultTOS, 0)
   154  	}
   155  
   156  	return stack.UnknownDestinationPacketHandled
   157  }
   158  
   159  // replyWithReset replies to the given segment with a reset segment.
   160  //
   161  // If the passed TTL is 0, then the route's default TTL will be used.
   162  func replyWithReset(st *stack.Stack, s *segment, tos, ttl uint8) tcpip.Error {
   163  	route, err := st.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */)
   164  	if err != nil {
   165  		return err
   166  	}
   167  	defer route.Release()
   168  
   169  	// Get the seqnum from the packet if the ack flag is set.
   170  	seq := seqnum.Value(0)
   171  	ack := seqnum.Value(0)
   172  	flags := header.TCPFlagRst
   173  	// As per RFC 793 page 35 (Reset Generation)
   174  	//   1.  If the connection does not exist (CLOSED) then a reset is sent
   175  	//   in response to any incoming segment except another reset.  In
   176  	//   particular, SYNs addressed to a non-existent connection are rejected
   177  	//   by this means.
   178  
   179  	//   If the incoming segment has an ACK field, the reset takes its
   180  	//   sequence number from the ACK field of the segment, otherwise the
   181  	//   reset has sequence number zero and the ACK field is set to the sum
   182  	//   of the sequence number and segment length of the incoming segment.
   183  	//   The connection remains in the CLOSED state.
   184  	if s.flags.Contains(header.TCPFlagAck) {
   185  		seq = s.ackNumber
   186  	} else {
   187  		flags |= header.TCPFlagAck
   188  		ack = s.sequenceNumber.Add(s.logicalLen())
   189  	}
   190  
   191  	if ttl == 0 {
   192  		ttl = route.DefaultTTL()
   193  	}
   194  
   195  	return sendTCP(route, tcpFields{
   196  		id:     s.id,
   197  		ttl:    ttl,
   198  		tos:    tos,
   199  		flags:  flags,
   200  		seq:    seq,
   201  		ack:    ack,
   202  		rcvWnd: 0,
   203  	}, buffer.VectorisedView{}, stack.GSO{}, nil /* PacketOwner */)
   204  }
   205  
   206  // SetOption implements stack.TransportProtocol.SetOption.
   207  func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error {
   208  	switch v := option.(type) {
   209  	case *tcpip.TCPSACKEnabled:
   210  		p.mu.Lock()
   211  		p.sackEnabled = bool(*v)
   212  		p.mu.Unlock()
   213  		return nil
   214  
   215  	case *tcpip.TCPRecovery:
   216  		p.mu.Lock()
   217  		p.recovery = *v
   218  		p.mu.Unlock()
   219  		return nil
   220  
   221  	case *tcpip.TCPDelayEnabled:
   222  		p.mu.Lock()
   223  		p.delayEnabled = bool(*v)
   224  		p.mu.Unlock()
   225  		return nil
   226  
   227  	case *tcpip.TCPSendBufferSizeRangeOption:
   228  		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
   229  			return &tcpip.ErrInvalidOptionValue{}
   230  		}
   231  		p.mu.Lock()
   232  		p.sendBufferSize = *v
   233  		p.mu.Unlock()
   234  		return nil
   235  
   236  	case *tcpip.TCPReceiveBufferSizeRangeOption:
   237  		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
   238  			return &tcpip.ErrInvalidOptionValue{}
   239  		}
   240  		p.mu.Lock()
   241  		p.recvBufferSize = *v
   242  		p.mu.Unlock()
   243  		return nil
   244  
   245  	case *tcpip.CongestionControlOption:
   246  		for _, c := range p.availableCongestionControl {
   247  			if string(*v) == c {
   248  				p.mu.Lock()
   249  				p.congestionControl = string(*v)
   250  				p.mu.Unlock()
   251  				return nil
   252  			}
   253  		}
   254  		// linux returns ENOENT when an invalid congestion control
   255  		// is specified.
   256  		return &tcpip.ErrNoSuchFile{}
   257  
   258  	case *tcpip.TCPModerateReceiveBufferOption:
   259  		p.mu.Lock()
   260  		p.moderateReceiveBuffer = bool(*v)
   261  		p.mu.Unlock()
   262  		return nil
   263  
   264  	case *tcpip.TCPLingerTimeoutOption:
   265  		p.mu.Lock()
   266  		if *v < 0 {
   267  			p.lingerTimeout = 0
   268  		} else {
   269  			p.lingerTimeout = time.Duration(*v)
   270  		}
   271  		p.mu.Unlock()
   272  		return nil
   273  
   274  	case *tcpip.TCPTimeWaitTimeoutOption:
   275  		p.mu.Lock()
   276  		if *v < 0 {
   277  			p.timeWaitTimeout = 0
   278  		} else {
   279  			p.timeWaitTimeout = time.Duration(*v)
   280  		}
   281  		p.mu.Unlock()
   282  		return nil
   283  
   284  	case *tcpip.TCPTimeWaitReuseOption:
   285  		if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly {
   286  			return &tcpip.ErrInvalidOptionValue{}
   287  		}
   288  		p.mu.Lock()
   289  		p.timeWaitReuse = *v
   290  		p.mu.Unlock()
   291  		return nil
   292  
   293  	case *tcpip.TCPMinRTOOption:
   294  		p.mu.Lock()
   295  		if *v < 0 {
   296  			p.minRTO = MinRTO
   297  		} else {
   298  			p.minRTO = time.Duration(*v)
   299  		}
   300  		p.mu.Unlock()
   301  		return nil
   302  
   303  	case *tcpip.TCPMaxRTOOption:
   304  		p.mu.Lock()
   305  		if *v < 0 {
   306  			p.maxRTO = MaxRTO
   307  		} else {
   308  			p.maxRTO = time.Duration(*v)
   309  		}
   310  		p.mu.Unlock()
   311  		return nil
   312  
   313  	case *tcpip.TCPMaxRetriesOption:
   314  		p.mu.Lock()
   315  		p.maxRetries = uint32(*v)
   316  		p.mu.Unlock()
   317  		return nil
   318  
   319  	case *tcpip.TCPAlwaysUseSynCookies:
   320  		p.mu.Lock()
   321  		p.alwaysUseSynCookies = bool(*v)
   322  		p.mu.Unlock()
   323  		return nil
   324  
   325  	case *tcpip.TCPSynRetriesOption:
   326  		if *v < 1 || *v > 255 {
   327  			return &tcpip.ErrInvalidOptionValue{}
   328  		}
   329  		p.mu.Lock()
   330  		p.synRetries = uint8(*v)
   331  		p.mu.Unlock()
   332  		return nil
   333  
   334  	default:
   335  		return &tcpip.ErrUnknownProtocolOption{}
   336  	}
   337  }
   338  
   339  // Option implements stack.TransportProtocol.Option.
   340  func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error {
   341  	switch v := option.(type) {
   342  	case *tcpip.TCPSACKEnabled:
   343  		p.mu.RLock()
   344  		*v = tcpip.TCPSACKEnabled(p.sackEnabled)
   345  		p.mu.RUnlock()
   346  		return nil
   347  
   348  	case *tcpip.TCPRecovery:
   349  		p.mu.RLock()
   350  		*v = p.recovery
   351  		p.mu.RUnlock()
   352  		return nil
   353  
   354  	case *tcpip.TCPDelayEnabled:
   355  		p.mu.RLock()
   356  		*v = tcpip.TCPDelayEnabled(p.delayEnabled)
   357  		p.mu.RUnlock()
   358  		return nil
   359  
   360  	case *tcpip.TCPSendBufferSizeRangeOption:
   361  		p.mu.RLock()
   362  		*v = p.sendBufferSize
   363  		p.mu.RUnlock()
   364  		return nil
   365  
   366  	case *tcpip.TCPReceiveBufferSizeRangeOption:
   367  		p.mu.RLock()
   368  		*v = p.recvBufferSize
   369  		p.mu.RUnlock()
   370  		return nil
   371  
   372  	case *tcpip.CongestionControlOption:
   373  		p.mu.RLock()
   374  		*v = tcpip.CongestionControlOption(p.congestionControl)
   375  		p.mu.RUnlock()
   376  		return nil
   377  
   378  	case *tcpip.TCPAvailableCongestionControlOption:
   379  		p.mu.RLock()
   380  		*v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
   381  		p.mu.RUnlock()
   382  		return nil
   383  
   384  	case *tcpip.TCPModerateReceiveBufferOption:
   385  		p.mu.RLock()
   386  		*v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer)
   387  		p.mu.RUnlock()
   388  		return nil
   389  
   390  	case *tcpip.TCPLingerTimeoutOption:
   391  		p.mu.RLock()
   392  		*v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout)
   393  		p.mu.RUnlock()
   394  		return nil
   395  
   396  	case *tcpip.TCPTimeWaitTimeoutOption:
   397  		p.mu.RLock()
   398  		*v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout)
   399  		p.mu.RUnlock()
   400  		return nil
   401  
   402  	case *tcpip.TCPTimeWaitReuseOption:
   403  		p.mu.RLock()
   404  		*v = p.timeWaitReuse
   405  		p.mu.RUnlock()
   406  		return nil
   407  
   408  	case *tcpip.TCPMinRTOOption:
   409  		p.mu.RLock()
   410  		*v = tcpip.TCPMinRTOOption(p.minRTO)
   411  		p.mu.RUnlock()
   412  		return nil
   413  
   414  	case *tcpip.TCPMaxRTOOption:
   415  		p.mu.RLock()
   416  		*v = tcpip.TCPMaxRTOOption(p.maxRTO)
   417  		p.mu.RUnlock()
   418  		return nil
   419  
   420  	case *tcpip.TCPMaxRetriesOption:
   421  		p.mu.RLock()
   422  		*v = tcpip.TCPMaxRetriesOption(p.maxRetries)
   423  		p.mu.RUnlock()
   424  		return nil
   425  
   426  	case *tcpip.TCPAlwaysUseSynCookies:
   427  		p.mu.RLock()
   428  		*v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies)
   429  		p.mu.RUnlock()
   430  		return nil
   431  
   432  	case *tcpip.TCPSynRetriesOption:
   433  		p.mu.RLock()
   434  		*v = tcpip.TCPSynRetriesOption(p.synRetries)
   435  		p.mu.RUnlock()
   436  		return nil
   437  
   438  	default:
   439  		return &tcpip.ErrUnknownProtocolOption{}
   440  	}
   441  }
   442  
   443  // Close implements stack.TransportProtocol.Close.
   444  func (p *protocol) Close() {
   445  	p.dispatcher.close()
   446  }
   447  
   448  // Wait implements stack.TransportProtocol.Wait.
   449  func (p *protocol) Wait() {
   450  	p.dispatcher.wait()
   451  }
   452  
   453  // Parse implements stack.TransportProtocol.Parse.
   454  func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
   455  	return parse.TCP(pkt)
   456  }
   457  
   458  // NewProtocol returns a TCP transport protocol.
   459  func NewProtocol(s *stack.Stack) stack.TransportProtocol {
   460  	p := protocol{
   461  		stack: s,
   462  		sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{
   463  			Min:     MinBufferSize,
   464  			Default: DefaultSendBufferSize,
   465  			Max:     MaxBufferSize,
   466  		},
   467  		recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{
   468  			Min:     MinBufferSize,
   469  			Default: DefaultReceiveBufferSize,
   470  			Max:     MaxBufferSize,
   471  		},
   472  		congestionControl:          ccReno,
   473  		availableCongestionControl: []string{ccReno, ccCubic},
   474  		lingerTimeout:              DefaultTCPLingerTimeout,
   475  		timeWaitTimeout:            DefaultTCPTimeWaitTimeout,
   476  		timeWaitReuse:              tcpip.TCPTimeWaitReuseLoopbackOnly,
   477  		synRetries:                 DefaultSynRetries,
   478  		minRTO:                     MinRTO,
   479  		maxRTO:                     MaxRTO,
   480  		maxRetries:                 MaxRetries,
   481  		// TODO(github.com/SagerNet/issue/5243): Set recovery to tcpip.TCPRACKLossDetection.
   482  		recovery: 0,
   483  	}
   484  	p.dispatcher.init(s.Rand(), runtime.GOMAXPROCS(0))
   485  	return &p
   486  }