inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/protocol.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tcp contains the implementation of the TCP transport protocol.
    16  package tcp
    17  
    18  import (
    19  	"runtime"
    20  	"strings"
    21  	"time"
    22  
    23  	"inet.af/netstack/sync"
    24  	"inet.af/netstack/tcpip"
    25  	"inet.af/netstack/tcpip/buffer"
    26  	"inet.af/netstack/tcpip/hash/jenkins"
    27  	"inet.af/netstack/tcpip/header"
    28  	"inet.af/netstack/tcpip/header/parse"
    29  	"inet.af/netstack/tcpip/internal/tcp"
    30  	"inet.af/netstack/tcpip/seqnum"
    31  	"inet.af/netstack/tcpip/stack"
    32  	"inet.af/netstack/tcpip/transport/raw"
    33  	"inet.af/netstack/waiter"
    34  )
    35  
    36  const (
    37  	// ProtocolNumber is the tcp protocol number.
    38  	ProtocolNumber = header.TCPProtocolNumber
    39  
    40  	// MinBufferSize is the smallest size of a receive or send buffer.
    41  	MinBufferSize = 4 << 10 // 4096 bytes.
    42  
    43  	// DefaultSendBufferSize is the default size of the send buffer for
    44  	// an endpoint.
    45  	DefaultSendBufferSize = 1 << 20 // 1MB
    46  
    47  	// DefaultReceiveBufferSize is the default size of the receive buffer
    48  	// for an endpoint.
    49  	DefaultReceiveBufferSize = 1 << 20 // 1MB
    50  
    51  	// MaxBufferSize is the largest size a receive/send buffer can grow to.
    52  	MaxBufferSize = 4 << 20 // 4MB
    53  
    54  	// DefaultTCPLingerTimeout is the amount of time that sockets linger in
    55  	// FIN_WAIT_2 state before being marked closed.
    56  	DefaultTCPLingerTimeout = 60 * time.Second
    57  
    58  	// MaxTCPLingerTimeout is the maximum amount of time that sockets
    59  	// linger in FIN_WAIT_2 state before being marked closed.
    60  	MaxTCPLingerTimeout = 120 * time.Second
    61  
    62  	// DefaultTCPTimeWaitTimeout is the amount of time that sockets linger
    63  	// in TIME_WAIT state before being marked closed.
    64  	DefaultTCPTimeWaitTimeout = 60 * time.Second
    65  
    66  	// DefaultSynRetries is the default value for the number of SYN retransmits
    67  	// before a connect is aborted.
    68  	DefaultSynRetries = 6
    69  
    70  	// DefaultKeepaliveIdle is the idle time for a connection before keep-alive
    71  	// probes are sent.
    72  	DefaultKeepaliveIdle = 2 * time.Hour
    73  
    74  	// DefaultKeepaliveInterval is the time between two successive keep-alive
    75  	// probes.
    76  	DefaultKeepaliveInterval = 75 * time.Second
    77  
    78  	// DefaultKeepaliveCount is the number of keep-alive probes that are sent
    79  	// before declaring the connection dead.
    80  	DefaultKeepaliveCount = 9
    81  )
    82  
// Names of the congestion control algorithms. NewProtocol lists both as
// available and selects ccReno as the initial algorithm.
const (
	ccReno  = "reno"
	ccCubic = "cubic"
)
    87  
// protocol holds the state of the TCP transport protocol: the owning stack,
// the protocol-wide option values exposed through SetOption and Option, the
// packet dispatcher, and a set of secrets.
type protocol struct {
	stack *stack.Stack

	// mu is taken for writing by SetOption and for reading by Option while
	// they access the option fields below.
	//
	// NOTE(review): SetOption iterates availableCongestionControl without
	// holding mu; that looks safe only if the slice is never modified after
	// NewProtocol builds it - confirm.
	mu                         sync.RWMutex
	sackEnabled                bool
	recovery                   tcpip.TCPRecovery
	delayEnabled               bool
	alwaysUseSynCookies        bool
	sendBufferSize             tcpip.TCPSendBufferSizeRangeOption
	recvBufferSize             tcpip.TCPReceiveBufferSizeRangeOption
	congestionControl          string
	availableCongestionControl []string
	moderateReceiveBuffer      bool
	lingerTimeout              time.Duration
	timeWaitTimeout            time.Duration
	timeWaitReuse              tcpip.TCPTimeWaitReuseOption
	minRTO                     time.Duration
	maxRTO                     time.Duration
	maxRetries                 uint32
	synRetries                 uint8
	dispatcher                 dispatcher

	// The following secrets are initialized once and stay unchanged after.
	// NewProtocol fills each of them from the stack's random source;
	// tsOffsetSecret seeds the hash computed in tsOffset.
	seqnumSecret     uint32
	portOffsetSecret uint32
	tsOffsetSecret   uint32
}
   115  
// Number returns the tcp protocol number, i.e. the package-level
// ProtocolNumber constant. It does not read any receiver state.
func (*protocol) Number() tcpip.TransportProtocolNumber {
	return ProtocolNumber
}
   120  
   121  // NewEndpoint creates a new tcp endpoint.
   122  func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
   123  	return newEndpoint(p.stack, p, netProto, waiterQueue), nil
   124  }
   125  
// NewRawEndpoint creates a new raw TCP endpoint by delegating to
// raw.NewEndpoint with the TCP protocol number, and returns whatever endpoint
// and error that call produces. It implements
// stack.TransportProtocol.NewRawEndpoint.
//
// NOTE(review): this comment previously stated that raw TCP sockets are
// currently unsupported, which does not match the delegation below - confirm
// the intended behavior against raw.NewEndpoint.
func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
	return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue)
}
   131  
// MinimumPacketSize returns the minimum valid tcp packet size, which is
// header.TCPMinimumSize. It does not read any receiver state.
func (*protocol) MinimumPacketSize() int {
	return header.TCPMinimumSize
}
   136  
   137  // ParsePorts returns the source and destination ports stored in the given tcp
   138  // packet.
   139  func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err tcpip.Error) {
   140  	h := header.TCP(v)
   141  	return h.SourcePort(), h.DestinationPort(), nil
   142  }
   143  
   144  // QueuePacket queues packets targeted at an endpoint after hashing the packet
   145  // to a specific processing queue. Each queue is serviced by its own processor
   146  // goroutine which is responsible for dequeuing and doing full TCP dispatch of
   147  // the packet.
   148  func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
   149  	p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt)
   150  }
   151  
   152  // HandleUnknownDestinationPacket handles packets targeted at this protocol but
   153  // that don't match any existing endpoint.
   154  //
   155  // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then
   156  // a reset is sent in response to any incoming segment except another reset. In
   157  // particular, SYNs addressed to a non-existent connection are rejected by this
   158  // means."
   159  func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition {
   160  	s := newIncomingSegment(id, p.stack.Clock(), pkt)
   161  	defer s.decRef()
   162  
   163  	if !s.parse(pkt.RXTransportChecksumValidated) || !s.csumValid {
   164  		return stack.UnknownDestinationPacketMalformed
   165  	}
   166  
   167  	if !s.flags.Contains(header.TCPFlagRst) {
   168  		replyWithReset(p.stack, s, stack.DefaultTOS, 0)
   169  	}
   170  
   171  	return stack.UnknownDestinationPacketHandled
   172  }
   173  
   174  func (p *protocol) tsOffset(src, dst tcpip.Address) tcp.TSOffset {
   175  	// Initialize a random tsOffset that will be added to the recentTS
   176  	// everytime the timestamp is sent when the Timestamp option is enabled.
   177  	//
   178  	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
   179  	// why this is required.
   180  	//
   181  	// TODO(https://gvisor.dev/issues/6473): This is not really secure as
   182  	// it does not use the recommended algorithm linked above.
   183  	h := jenkins.Sum32(p.tsOffsetSecret)
   184  	// Per hash.Hash.Writer:
   185  	//
   186  	// It never returns an error.
   187  	_, _ = h.Write([]byte(src))
   188  	_, _ = h.Write([]byte(dst))
   189  	return tcp.NewTSOffset(h.Sum32())
   190  }
   191  
   192  // replyWithReset replies to the given segment with a reset segment.
   193  //
   194  // If the passed TTL is 0, then the route's default TTL will be used.
   195  func replyWithReset(st *stack.Stack, s *segment, tos, ttl uint8) tcpip.Error {
   196  	route, err := st.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */)
   197  	if err != nil {
   198  		return err
   199  	}
   200  	defer route.Release()
   201  
   202  	// Get the seqnum from the packet if the ack flag is set.
   203  	seq := seqnum.Value(0)
   204  	ack := seqnum.Value(0)
   205  	flags := header.TCPFlagRst
   206  	// As per RFC 793 page 35 (Reset Generation)
   207  	//   1.  If the connection does not exist (CLOSED) then a reset is sent
   208  	//   in response to any incoming segment except another reset.  In
   209  	//   particular, SYNs addressed to a non-existent connection are rejected
   210  	//   by this means.
   211  
   212  	//   If the incoming segment has an ACK field, the reset takes its
   213  	//   sequence number from the ACK field of the segment, otherwise the
   214  	//   reset has sequence number zero and the ACK field is set to the sum
   215  	//   of the sequence number and segment length of the incoming segment.
   216  	//   The connection remains in the CLOSED state.
   217  	if s.flags.Contains(header.TCPFlagAck) {
   218  		seq = s.ackNumber
   219  	} else {
   220  		flags |= header.TCPFlagAck
   221  		ack = s.sequenceNumber.Add(s.logicalLen())
   222  	}
   223  
   224  	if ttl == 0 {
   225  		ttl = route.DefaultTTL()
   226  	}
   227  
   228  	return sendTCP(route, tcpFields{
   229  		id:     s.id,
   230  		ttl:    ttl,
   231  		tos:    tos,
   232  		flags:  flags,
   233  		seq:    seq,
   234  		ack:    ack,
   235  		rcvWnd: 0,
   236  	}, buffer.VectorisedView{}, stack.GSO{}, nil /* PacketOwner */)
   237  }
   238  
   239  // SetOption implements stack.TransportProtocol.SetOption.
   240  func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error {
   241  	switch v := option.(type) {
   242  	case *tcpip.TCPSACKEnabled:
   243  		p.mu.Lock()
   244  		p.sackEnabled = bool(*v)
   245  		p.mu.Unlock()
   246  		return nil
   247  
   248  	case *tcpip.TCPRecovery:
   249  		p.mu.Lock()
   250  		p.recovery = *v
   251  		p.mu.Unlock()
   252  		return nil
   253  
   254  	case *tcpip.TCPDelayEnabled:
   255  		p.mu.Lock()
   256  		p.delayEnabled = bool(*v)
   257  		p.mu.Unlock()
   258  		return nil
   259  
   260  	case *tcpip.TCPSendBufferSizeRangeOption:
   261  		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
   262  			return &tcpip.ErrInvalidOptionValue{}
   263  		}
   264  		p.mu.Lock()
   265  		p.sendBufferSize = *v
   266  		p.mu.Unlock()
   267  		return nil
   268  
   269  	case *tcpip.TCPReceiveBufferSizeRangeOption:
   270  		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
   271  			return &tcpip.ErrInvalidOptionValue{}
   272  		}
   273  		p.mu.Lock()
   274  		p.recvBufferSize = *v
   275  		p.mu.Unlock()
   276  		return nil
   277  
   278  	case *tcpip.CongestionControlOption:
   279  		for _, c := range p.availableCongestionControl {
   280  			if string(*v) == c {
   281  				p.mu.Lock()
   282  				p.congestionControl = string(*v)
   283  				p.mu.Unlock()
   284  				return nil
   285  			}
   286  		}
   287  		// linux returns ENOENT when an invalid congestion control
   288  		// is specified.
   289  		return &tcpip.ErrNoSuchFile{}
   290  
   291  	case *tcpip.TCPModerateReceiveBufferOption:
   292  		p.mu.Lock()
   293  		p.moderateReceiveBuffer = bool(*v)
   294  		p.mu.Unlock()
   295  		return nil
   296  
   297  	case *tcpip.TCPLingerTimeoutOption:
   298  		p.mu.Lock()
   299  		if *v < 0 {
   300  			p.lingerTimeout = 0
   301  		} else {
   302  			p.lingerTimeout = time.Duration(*v)
   303  		}
   304  		p.mu.Unlock()
   305  		return nil
   306  
   307  	case *tcpip.TCPTimeWaitTimeoutOption:
   308  		p.mu.Lock()
   309  		if *v < 0 {
   310  			p.timeWaitTimeout = 0
   311  		} else {
   312  			p.timeWaitTimeout = time.Duration(*v)
   313  		}
   314  		p.mu.Unlock()
   315  		return nil
   316  
   317  	case *tcpip.TCPTimeWaitReuseOption:
   318  		if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly {
   319  			return &tcpip.ErrInvalidOptionValue{}
   320  		}
   321  		p.mu.Lock()
   322  		p.timeWaitReuse = *v
   323  		p.mu.Unlock()
   324  		return nil
   325  
   326  	case *tcpip.TCPMinRTOOption:
   327  		p.mu.Lock()
   328  		defer p.mu.Unlock()
   329  		if *v < 0 {
   330  			p.minRTO = MinRTO
   331  		} else if minRTO := time.Duration(*v); minRTO <= p.maxRTO {
   332  			p.minRTO = minRTO
   333  		} else {
   334  			return &tcpip.ErrInvalidOptionValue{}
   335  		}
   336  		return nil
   337  
   338  	case *tcpip.TCPMaxRTOOption:
   339  		p.mu.Lock()
   340  		defer p.mu.Unlock()
   341  		if *v < 0 {
   342  			p.maxRTO = MaxRTO
   343  		} else if maxRTO := time.Duration(*v); maxRTO >= p.minRTO {
   344  			p.maxRTO = maxRTO
   345  		} else {
   346  			return &tcpip.ErrInvalidOptionValue{}
   347  		}
   348  		return nil
   349  
   350  	case *tcpip.TCPMaxRetriesOption:
   351  		p.mu.Lock()
   352  		p.maxRetries = uint32(*v)
   353  		p.mu.Unlock()
   354  		return nil
   355  
   356  	case *tcpip.TCPAlwaysUseSynCookies:
   357  		p.mu.Lock()
   358  		p.alwaysUseSynCookies = bool(*v)
   359  		p.mu.Unlock()
   360  		return nil
   361  
   362  	case *tcpip.TCPSynRetriesOption:
   363  		if *v < 1 || *v > 255 {
   364  			return &tcpip.ErrInvalidOptionValue{}
   365  		}
   366  		p.mu.Lock()
   367  		p.synRetries = uint8(*v)
   368  		p.mu.Unlock()
   369  		return nil
   370  
   371  	default:
   372  		return &tcpip.ErrUnknownProtocolOption{}
   373  	}
   374  }
   375  
   376  // Option implements stack.TransportProtocol.Option.
   377  func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error {
   378  	switch v := option.(type) {
   379  	case *tcpip.TCPSACKEnabled:
   380  		p.mu.RLock()
   381  		*v = tcpip.TCPSACKEnabled(p.sackEnabled)
   382  		p.mu.RUnlock()
   383  		return nil
   384  
   385  	case *tcpip.TCPRecovery:
   386  		p.mu.RLock()
   387  		*v = p.recovery
   388  		p.mu.RUnlock()
   389  		return nil
   390  
   391  	case *tcpip.TCPDelayEnabled:
   392  		p.mu.RLock()
   393  		*v = tcpip.TCPDelayEnabled(p.delayEnabled)
   394  		p.mu.RUnlock()
   395  		return nil
   396  
   397  	case *tcpip.TCPSendBufferSizeRangeOption:
   398  		p.mu.RLock()
   399  		*v = p.sendBufferSize
   400  		p.mu.RUnlock()
   401  		return nil
   402  
   403  	case *tcpip.TCPReceiveBufferSizeRangeOption:
   404  		p.mu.RLock()
   405  		*v = p.recvBufferSize
   406  		p.mu.RUnlock()
   407  		return nil
   408  
   409  	case *tcpip.CongestionControlOption:
   410  		p.mu.RLock()
   411  		*v = tcpip.CongestionControlOption(p.congestionControl)
   412  		p.mu.RUnlock()
   413  		return nil
   414  
   415  	case *tcpip.TCPAvailableCongestionControlOption:
   416  		p.mu.RLock()
   417  		*v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
   418  		p.mu.RUnlock()
   419  		return nil
   420  
   421  	case *tcpip.TCPModerateReceiveBufferOption:
   422  		p.mu.RLock()
   423  		*v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer)
   424  		p.mu.RUnlock()
   425  		return nil
   426  
   427  	case *tcpip.TCPLingerTimeoutOption:
   428  		p.mu.RLock()
   429  		*v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout)
   430  		p.mu.RUnlock()
   431  		return nil
   432  
   433  	case *tcpip.TCPTimeWaitTimeoutOption:
   434  		p.mu.RLock()
   435  		*v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout)
   436  		p.mu.RUnlock()
   437  		return nil
   438  
   439  	case *tcpip.TCPTimeWaitReuseOption:
   440  		p.mu.RLock()
   441  		*v = p.timeWaitReuse
   442  		p.mu.RUnlock()
   443  		return nil
   444  
   445  	case *tcpip.TCPMinRTOOption:
   446  		p.mu.RLock()
   447  		*v = tcpip.TCPMinRTOOption(p.minRTO)
   448  		p.mu.RUnlock()
   449  		return nil
   450  
   451  	case *tcpip.TCPMaxRTOOption:
   452  		p.mu.RLock()
   453  		*v = tcpip.TCPMaxRTOOption(p.maxRTO)
   454  		p.mu.RUnlock()
   455  		return nil
   456  
   457  	case *tcpip.TCPMaxRetriesOption:
   458  		p.mu.RLock()
   459  		*v = tcpip.TCPMaxRetriesOption(p.maxRetries)
   460  		p.mu.RUnlock()
   461  		return nil
   462  
   463  	case *tcpip.TCPAlwaysUseSynCookies:
   464  		p.mu.RLock()
   465  		*v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies)
   466  		p.mu.RUnlock()
   467  		return nil
   468  
   469  	case *tcpip.TCPSynRetriesOption:
   470  		p.mu.RLock()
   471  		*v = tcpip.TCPSynRetriesOption(p.synRetries)
   472  		p.mu.RUnlock()
   473  		return nil
   474  
   475  	default:
   476  		return &tcpip.ErrUnknownProtocolOption{}
   477  	}
   478  }
   479  
// Close implements stack.TransportProtocol.Close. It forwards to the
// dispatcher's close method.
func (p *protocol) Close() {
	p.dispatcher.close()
}
   484  
// Wait implements stack.TransportProtocol.Wait. It forwards to the
// dispatcher's wait method.
func (p *protocol) Wait() {
	p.dispatcher.wait()
}
   489  
// Parse implements stack.TransportProtocol.Parse. It returns the result of
// parse.TCP for pkt and does not read any receiver state.
func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
	return parse.TCP(pkt)
}
   494  
   495  // NewProtocol returns a TCP transport protocol.
   496  func NewProtocol(s *stack.Stack) stack.TransportProtocol {
   497  	p := protocol{
   498  		stack: s,
   499  		sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{
   500  			Min:     MinBufferSize,
   501  			Default: DefaultSendBufferSize,
   502  			Max:     MaxBufferSize,
   503  		},
   504  		recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{
   505  			Min:     MinBufferSize,
   506  			Default: DefaultReceiveBufferSize,
   507  			Max:     MaxBufferSize,
   508  		},
   509  		congestionControl:          ccReno,
   510  		availableCongestionControl: []string{ccReno, ccCubic},
   511  		lingerTimeout:              DefaultTCPLingerTimeout,
   512  		timeWaitTimeout:            DefaultTCPTimeWaitTimeout,
   513  		timeWaitReuse:              tcpip.TCPTimeWaitReuseLoopbackOnly,
   514  		synRetries:                 DefaultSynRetries,
   515  		minRTO:                     MinRTO,
   516  		maxRTO:                     MaxRTO,
   517  		maxRetries:                 MaxRetries,
   518  		recovery:                   tcpip.TCPRACKLossDetection,
   519  		seqnumSecret:               s.Rand().Uint32(),
   520  		portOffsetSecret:           s.Rand().Uint32(),
   521  		tsOffsetSecret:             s.Rand().Uint32(),
   522  	}
   523  	p.dispatcher.init(s.Rand(), runtime.GOMAXPROCS(0))
   524  	return &p
   525  }
   526  
   527  // protocolFromStack retrieves the tcp.protocol instance from stack s.
   528  func protocolFromStack(s *stack.Stack) *protocol {
   529  	return s.TransportProtocolInstance(ProtocolNumber).(*protocol)
   530  }