github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/transport/tcp/protocol.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tcp contains the implementation of the TCP transport protocol.
    16  package tcp
    17  
    18  import (
    19  	"runtime"
    20  	"strings"
    21  	"time"
    22  
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/hash/jenkins"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header/parse"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/internal/tcp"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/seqnum"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack"
    31  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/raw"
    32  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    33  )
    34  
    35  const (
    36  	// ProtocolNumber is the tcp protocol number.
    37  	ProtocolNumber = header.TCPProtocolNumber
    38  
    39  	// MinBufferSize is the smallest size of a receive or send buffer.
    40  	MinBufferSize = 4 << 10 // 4096 bytes.
    41  
    42  	// DefaultSendBufferSize is the default size of the send buffer for
    43  	// an endpoint.
    44  	DefaultSendBufferSize = 1 << 20 // 1MB
    45  
    46  	// DefaultReceiveBufferSize is the default size of the receive buffer
    47  	// for an endpoint.
    48  	DefaultReceiveBufferSize = 1 << 20 // 1MB
    49  
    50  	// MaxBufferSize is the largest size a receive/send buffer can grow to.
    51  	MaxBufferSize = 4 << 20 // 4MB
    52  
    53  	// DefaultTCPLingerTimeout is the amount of time that sockets linger in
    54  	// FIN_WAIT_2 state before being marked closed.
    55  	DefaultTCPLingerTimeout = 60 * time.Second
    56  
    57  	// MaxTCPLingerTimeout is the maximum amount of time that sockets
    58  	// linger in FIN_WAIT_2 state before being marked closed.
    59  	MaxTCPLingerTimeout = 120 * time.Second
    60  
    61  	// DefaultTCPTimeWaitTimeout is the amount of time that sockets linger
    62  	// in TIME_WAIT state before being marked closed.
    63  	DefaultTCPTimeWaitTimeout = 60 * time.Second
    64  
    65  	// DefaultSynRetries is the default value for the number of SYN retransmits
    66  	// before a connect is aborted.
    67  	DefaultSynRetries = 6
    68  
    69  	// DefaultKeepaliveIdle is the idle time for a connection before keep-alive
    70  	// probes are sent.
    71  	DefaultKeepaliveIdle = 2 * time.Hour
    72  
    73  	// DefaultKeepaliveInterval is the time between two successive keep-alive
    74  	// probes.
    75  	DefaultKeepaliveInterval = 75 * time.Second
    76  
    77  	// DefaultKeepaliveCount is the number of keep-alive probes that are sent
    78  	// before declaring the connection dead.
    79  	DefaultKeepaliveCount = 9
    80  )
    81  
// Names of the congestion control algorithms. NewProtocol lists both as
// available and selects ccReno as the initial algorithm.
const (
	ccReno  = "reno"
	ccCubic = "cubic"
)
    86  
// protocol is the TCP transport protocol state built by NewProtocol. It
// carries the protocol-wide options read by Option and written by SetOption,
// the packet dispatcher used by QueuePacket, and per-instance random secrets.
type protocol struct {
	// stack is the stack this protocol instance was created for.
	stack *stack.Stack

	// mu is taken by SetOption (write lock) and Option (read lock) around
	// their accesses to the option fields that follow.
	//
	// NOTE(review): SetOption iterates availableCongestionControl without
	// holding mu, and dispatcher is used without mu in QueuePacket, Close,
	// Wait, Pause and Resume; presumably neither needs mu — confirm.
	mu                         sync.RWMutex
	sackEnabled                bool
	recovery                   tcpip.TCPRecovery
	delayEnabled               bool
	alwaysUseSynCookies        bool
	sendBufferSize             tcpip.TCPSendBufferSizeRangeOption
	recvBufferSize             tcpip.TCPReceiveBufferSizeRangeOption
	congestionControl          string
	availableCongestionControl []string
	moderateReceiveBuffer      bool
	lingerTimeout              time.Duration
	timeWaitTimeout            time.Duration
	timeWaitReuse              tcpip.TCPTimeWaitReuseOption
	minRTO                     time.Duration
	maxRTO                     time.Duration
	maxRetries                 uint32
	synRetries                 uint8
	dispatcher                 dispatcher

	// The following secrets are initialized once and stay unchanged after.
	// NewProtocol draws each of them from the stack's random source;
	// tsOffsetSecret seeds the hash computed by tsOffset.
	seqnumSecret     uint32
	portOffsetSecret uint32
	tsOffsetSecret   uint32
}
   114  
// Number returns the TCP transport protocol number, i.e. the package-level
// ProtocolNumber constant (header.TCPProtocolNumber).
func (*protocol) Number() tcpip.TransportProtocolNumber {
	return ProtocolNumber
}
   119  
   120  // NewEndpoint creates a new tcp endpoint.
   121  func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
   122  	return newEndpoint(p.stack, p, netProto, waiterQueue), nil
   123  }
   124  
   125  // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently
   126  // unsupported. It implements stack.TransportProtocol.NewRawEndpoint.
   127  func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
   128  	return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue)
   129  }
   130  
// MinimumPacketSize returns the minimum valid TCP packet size, which is
// header.TCPMinimumSize.
func (*protocol) MinimumPacketSize() int {
	return header.TCPMinimumSize
}
   135  
   136  // ParsePorts returns the source and destination ports stored in the given tcp
   137  // packet.
   138  func (*protocol) ParsePorts(v []byte) (src, dst uint16, err tcpip.Error) {
   139  	h := header.TCP(v)
   140  	return h.SourcePort(), h.DestinationPort(), nil
   141  }
   142  
   143  // QueuePacket queues packets targeted at an endpoint after hashing the packet
   144  // to a specific processing queue. Each queue is serviced by its own processor
   145  // goroutine which is responsible for dequeuing and doing full TCP dispatch of
   146  // the packet.
   147  func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBufferPtr) {
   148  	p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt)
   149  }
   150  
   151  // HandleUnknownDestinationPacket handles packets targeted at this protocol but
   152  // that don't match any existing endpoint.
   153  //
   154  // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then
   155  // a reset is sent in response to any incoming segment except another reset. In
   156  // particular, SYNs addressed to a non-existent connection are rejected by this
   157  // means."
   158  func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt stack.PacketBufferPtr) stack.UnknownDestinationPacketDisposition {
   159  	s, err := newIncomingSegment(id, p.stack.Clock(), pkt)
   160  	if err != nil {
   161  		return stack.UnknownDestinationPacketMalformed
   162  	}
   163  	defer s.DecRef()
   164  	if !s.csumValid {
   165  		return stack.UnknownDestinationPacketMalformed
   166  	}
   167  
   168  	if !s.flags.Contains(header.TCPFlagRst) {
   169  		replyWithReset(p.stack, s, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit)
   170  	}
   171  
   172  	return stack.UnknownDestinationPacketHandled
   173  }
   174  
   175  func (p *protocol) tsOffset(src, dst tcpip.Address) tcp.TSOffset {
   176  	// Initialize a random tsOffset that will be added to the recentTS
   177  	// everytime the timestamp is sent when the Timestamp option is enabled.
   178  	//
   179  	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
   180  	// why this is required.
   181  	//
   182  	// TODO(https://gvisor.dev/issues/6473): This is not really secure as
   183  	// it does not use the recommended algorithm linked above.
   184  	h := jenkins.Sum32(p.tsOffsetSecret)
   185  	// Per hash.Hash.Writer:
   186  	//
   187  	// It never returns an error.
   188  	_, _ = h.Write(src.AsSlice())
   189  	_, _ = h.Write(dst.AsSlice())
   190  	return tcp.NewTSOffset(h.Sum32())
   191  }
   192  
   193  // replyWithReset replies to the given segment with a reset segment.
   194  //
   195  // If the relevant TTL has its reset value (0 for ipv4TTL, -1 for ipv6HopLimit),
   196  // then the route's default TTL will be used.
   197  func replyWithReset(st *stack.Stack, s *segment, tos, ipv4TTL uint8, ipv6HopLimit int16) tcpip.Error {
   198  	net := s.pkt.Network()
   199  	route, err := st.FindRoute(s.pkt.NICID, net.DestinationAddress(), net.SourceAddress(), s.pkt.NetworkProtocolNumber, false /* multicastLoop */)
   200  	if err != nil {
   201  		return err
   202  	}
   203  	defer route.Release()
   204  
   205  	ttl := calculateTTL(route, ipv4TTL, ipv6HopLimit)
   206  
   207  	// Get the seqnum from the packet if the ack flag is set.
   208  	seq := seqnum.Value(0)
   209  	ack := seqnum.Value(0)
   210  	flags := header.TCPFlagRst
   211  	// As per RFC 793 page 35 (Reset Generation)
   212  	//   1.  If the connection does not exist (CLOSED) then a reset is sent
   213  	//   in response to any incoming segment except another reset.  In
   214  	//   particular, SYNs addressed to a non-existent connection are rejected
   215  	//   by this means.
   216  
   217  	//   If the incoming segment has an ACK field, the reset takes its
   218  	//   sequence number from the ACK field of the segment, otherwise the
   219  	//   reset has sequence number zero and the ACK field is set to the sum
   220  	//   of the sequence number and segment length of the incoming segment.
   221  	//   The connection remains in the CLOSED state.
   222  	if s.flags.Contains(header.TCPFlagAck) {
   223  		seq = s.ackNumber
   224  	} else {
   225  		flags |= header.TCPFlagAck
   226  		ack = s.sequenceNumber.Add(s.logicalLen())
   227  	}
   228  
   229  	p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(route.MaxHeaderLength())})
   230  	defer p.DecRef()
   231  	return sendTCP(route, tcpFields{
   232  		id:     s.id,
   233  		ttl:    ttl,
   234  		tos:    tos,
   235  		flags:  flags,
   236  		seq:    seq,
   237  		ack:    ack,
   238  		rcvWnd: 0,
   239  	}, p, stack.GSO{}, nil /* PacketOwner */)
   240  }
   241  
   242  // SetOption implements stack.TransportProtocol.SetOption.
   243  func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error {
   244  	switch v := option.(type) {
   245  	case *tcpip.TCPSACKEnabled:
   246  		p.mu.Lock()
   247  		p.sackEnabled = bool(*v)
   248  		p.mu.Unlock()
   249  		return nil
   250  
   251  	case *tcpip.TCPRecovery:
   252  		p.mu.Lock()
   253  		p.recovery = *v
   254  		p.mu.Unlock()
   255  		return nil
   256  
   257  	case *tcpip.TCPDelayEnabled:
   258  		p.mu.Lock()
   259  		p.delayEnabled = bool(*v)
   260  		p.mu.Unlock()
   261  		return nil
   262  
   263  	case *tcpip.TCPSendBufferSizeRangeOption:
   264  		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
   265  			return &tcpip.ErrInvalidOptionValue{}
   266  		}
   267  		p.mu.Lock()
   268  		p.sendBufferSize = *v
   269  		p.mu.Unlock()
   270  		return nil
   271  
   272  	case *tcpip.TCPReceiveBufferSizeRangeOption:
   273  		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
   274  			return &tcpip.ErrInvalidOptionValue{}
   275  		}
   276  		p.mu.Lock()
   277  		p.recvBufferSize = *v
   278  		p.mu.Unlock()
   279  		return nil
   280  
   281  	case *tcpip.CongestionControlOption:
   282  		for _, c := range p.availableCongestionControl {
   283  			if string(*v) == c {
   284  				p.mu.Lock()
   285  				p.congestionControl = string(*v)
   286  				p.mu.Unlock()
   287  				return nil
   288  			}
   289  		}
   290  		// linux returns ENOENT when an invalid congestion control
   291  		// is specified.
   292  		return &tcpip.ErrNoSuchFile{}
   293  
   294  	case *tcpip.TCPModerateReceiveBufferOption:
   295  		p.mu.Lock()
   296  		p.moderateReceiveBuffer = bool(*v)
   297  		p.mu.Unlock()
   298  		return nil
   299  
   300  	case *tcpip.TCPLingerTimeoutOption:
   301  		p.mu.Lock()
   302  		if *v < 0 {
   303  			p.lingerTimeout = 0
   304  		} else {
   305  			p.lingerTimeout = time.Duration(*v)
   306  		}
   307  		p.mu.Unlock()
   308  		return nil
   309  
   310  	case *tcpip.TCPTimeWaitTimeoutOption:
   311  		p.mu.Lock()
   312  		if *v < 0 {
   313  			p.timeWaitTimeout = 0
   314  		} else {
   315  			p.timeWaitTimeout = time.Duration(*v)
   316  		}
   317  		p.mu.Unlock()
   318  		return nil
   319  
   320  	case *tcpip.TCPTimeWaitReuseOption:
   321  		if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly {
   322  			return &tcpip.ErrInvalidOptionValue{}
   323  		}
   324  		p.mu.Lock()
   325  		p.timeWaitReuse = *v
   326  		p.mu.Unlock()
   327  		return nil
   328  
   329  	case *tcpip.TCPMinRTOOption:
   330  		p.mu.Lock()
   331  		defer p.mu.Unlock()
   332  		if *v < 0 {
   333  			p.minRTO = MinRTO
   334  		} else if minRTO := time.Duration(*v); minRTO <= p.maxRTO {
   335  			p.minRTO = minRTO
   336  		} else {
   337  			return &tcpip.ErrInvalidOptionValue{}
   338  		}
   339  		return nil
   340  
   341  	case *tcpip.TCPMaxRTOOption:
   342  		p.mu.Lock()
   343  		defer p.mu.Unlock()
   344  		if *v < 0 {
   345  			p.maxRTO = MaxRTO
   346  		} else if maxRTO := time.Duration(*v); maxRTO >= p.minRTO {
   347  			p.maxRTO = maxRTO
   348  		} else {
   349  			return &tcpip.ErrInvalidOptionValue{}
   350  		}
   351  		return nil
   352  
   353  	case *tcpip.TCPMaxRetriesOption:
   354  		p.mu.Lock()
   355  		p.maxRetries = uint32(*v)
   356  		p.mu.Unlock()
   357  		return nil
   358  
   359  	case *tcpip.TCPAlwaysUseSynCookies:
   360  		p.mu.Lock()
   361  		p.alwaysUseSynCookies = bool(*v)
   362  		p.mu.Unlock()
   363  		return nil
   364  
   365  	case *tcpip.TCPSynRetriesOption:
   366  		if *v < 1 || *v > 255 {
   367  			return &tcpip.ErrInvalidOptionValue{}
   368  		}
   369  		p.mu.Lock()
   370  		p.synRetries = uint8(*v)
   371  		p.mu.Unlock()
   372  		return nil
   373  
   374  	default:
   375  		return &tcpip.ErrUnknownProtocolOption{}
   376  	}
   377  }
   378  
   379  // Option implements stack.TransportProtocol.Option.
   380  func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error {
   381  	switch v := option.(type) {
   382  	case *tcpip.TCPSACKEnabled:
   383  		p.mu.RLock()
   384  		*v = tcpip.TCPSACKEnabled(p.sackEnabled)
   385  		p.mu.RUnlock()
   386  		return nil
   387  
   388  	case *tcpip.TCPRecovery:
   389  		p.mu.RLock()
   390  		*v = p.recovery
   391  		p.mu.RUnlock()
   392  		return nil
   393  
   394  	case *tcpip.TCPDelayEnabled:
   395  		p.mu.RLock()
   396  		*v = tcpip.TCPDelayEnabled(p.delayEnabled)
   397  		p.mu.RUnlock()
   398  		return nil
   399  
   400  	case *tcpip.TCPSendBufferSizeRangeOption:
   401  		p.mu.RLock()
   402  		*v = p.sendBufferSize
   403  		p.mu.RUnlock()
   404  		return nil
   405  
   406  	case *tcpip.TCPReceiveBufferSizeRangeOption:
   407  		p.mu.RLock()
   408  		*v = p.recvBufferSize
   409  		p.mu.RUnlock()
   410  		return nil
   411  
   412  	case *tcpip.CongestionControlOption:
   413  		p.mu.RLock()
   414  		*v = tcpip.CongestionControlOption(p.congestionControl)
   415  		p.mu.RUnlock()
   416  		return nil
   417  
   418  	case *tcpip.TCPAvailableCongestionControlOption:
   419  		p.mu.RLock()
   420  		*v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
   421  		p.mu.RUnlock()
   422  		return nil
   423  
   424  	case *tcpip.TCPModerateReceiveBufferOption:
   425  		p.mu.RLock()
   426  		*v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer)
   427  		p.mu.RUnlock()
   428  		return nil
   429  
   430  	case *tcpip.TCPLingerTimeoutOption:
   431  		p.mu.RLock()
   432  		*v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout)
   433  		p.mu.RUnlock()
   434  		return nil
   435  
   436  	case *tcpip.TCPTimeWaitTimeoutOption:
   437  		p.mu.RLock()
   438  		*v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout)
   439  		p.mu.RUnlock()
   440  		return nil
   441  
   442  	case *tcpip.TCPTimeWaitReuseOption:
   443  		p.mu.RLock()
   444  		*v = p.timeWaitReuse
   445  		p.mu.RUnlock()
   446  		return nil
   447  
   448  	case *tcpip.TCPMinRTOOption:
   449  		p.mu.RLock()
   450  		*v = tcpip.TCPMinRTOOption(p.minRTO)
   451  		p.mu.RUnlock()
   452  		return nil
   453  
   454  	case *tcpip.TCPMaxRTOOption:
   455  		p.mu.RLock()
   456  		*v = tcpip.TCPMaxRTOOption(p.maxRTO)
   457  		p.mu.RUnlock()
   458  		return nil
   459  
   460  	case *tcpip.TCPMaxRetriesOption:
   461  		p.mu.RLock()
   462  		*v = tcpip.TCPMaxRetriesOption(p.maxRetries)
   463  		p.mu.RUnlock()
   464  		return nil
   465  
   466  	case *tcpip.TCPAlwaysUseSynCookies:
   467  		p.mu.RLock()
   468  		*v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies)
   469  		p.mu.RUnlock()
   470  		return nil
   471  
   472  	case *tcpip.TCPSynRetriesOption:
   473  		p.mu.RLock()
   474  		*v = tcpip.TCPSynRetriesOption(p.synRetries)
   475  		p.mu.RUnlock()
   476  		return nil
   477  
   478  	default:
   479  		return &tcpip.ErrUnknownProtocolOption{}
   480  	}
   481  }
   482  
// Close implements stack.TransportProtocol.Close.
//
// It forwards to the dispatcher's close method and does nothing else.
func (p *protocol) Close() {
	p.dispatcher.close()
}
   487  
// Wait implements stack.TransportProtocol.Wait.
//
// It forwards to the dispatcher's wait method and does nothing else.
func (p *protocol) Wait() {
	p.dispatcher.wait()
}
   492  
// Pause implements stack.TransportProtocol.Pause.
//
// It forwards to the dispatcher's pause method and does nothing else.
func (p *protocol) Pause() {
	p.dispatcher.pause()
}
   497  
// Resume implements stack.TransportProtocol.Resume.
//
// It forwards to the dispatcher's resume method and does nothing else.
func (p *protocol) Resume() {
	p.dispatcher.resume()
}
   502  
// Parse implements stack.TransportProtocol.Parse.
//
// It passes pkt to parse.TCP and returns that function's result unchanged.
func (*protocol) Parse(pkt stack.PacketBufferPtr) bool {
	return parse.TCP(pkt)
}
   507  
   508  // NewProtocol returns a TCP transport protocol.
   509  func NewProtocol(s *stack.Stack) stack.TransportProtocol {
   510  	p := protocol{
   511  		stack: s,
   512  		sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{
   513  			Min:     MinBufferSize,
   514  			Default: DefaultSendBufferSize,
   515  			Max:     MaxBufferSize,
   516  		},
   517  		recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{
   518  			Min:     MinBufferSize,
   519  			Default: DefaultReceiveBufferSize,
   520  			Max:     MaxBufferSize,
   521  		},
   522  		congestionControl:          ccReno,
   523  		availableCongestionControl: []string{ccReno, ccCubic},
   524  		moderateReceiveBuffer:      true,
   525  		lingerTimeout:              DefaultTCPLingerTimeout,
   526  		timeWaitTimeout:            DefaultTCPTimeWaitTimeout,
   527  		timeWaitReuse:              tcpip.TCPTimeWaitReuseLoopbackOnly,
   528  		synRetries:                 DefaultSynRetries,
   529  		minRTO:                     MinRTO,
   530  		maxRTO:                     MaxRTO,
   531  		maxRetries:                 MaxRetries,
   532  		recovery:                   tcpip.TCPRACKLossDetection,
   533  		seqnumSecret:               s.Rand().Uint32(),
   534  		portOffsetSecret:           s.Rand().Uint32(),
   535  		tsOffsetSecret:             s.Rand().Uint32(),
   536  	}
   537  	p.dispatcher.init(s.Rand(), runtime.GOMAXPROCS(0))
   538  	return &p
   539  }
   540  
   541  // protocolFromStack retrieves the tcp.protocol instance from stack s.
   542  func protocolFromStack(s *stack.Stack) *protocol {
   543  	return s.TransportProtocolInstance(ProtocolNumber).(*protocol)
   544  }