github.com/polevpn/netstack@v1.10.9/tcpip/stack/stack.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package stack provides the glue between networking protocols and the
    16  // consumers of the networking stack.
    17  //
    18  // For consumers, the only function of interest is New(), everything else is
    19  // provided by the tcpip/public package.
    20  package stack
    21  
    22  import (
    23  	"encoding/binary"
    24  	"sync"
    25  	"sync/atomic"
    26  	"time"
    27  
    28  	"github.com/polevpn/netstack/rand"
    29  	"github.com/polevpn/netstack/sleep"
    30  	"github.com/polevpn/netstack/tcpip"
    31  	"github.com/polevpn/netstack/tcpip/buffer"
    32  	"github.com/polevpn/netstack/tcpip/header"
    33  	"github.com/polevpn/netstack/tcpip/iptables"
    34  	"github.com/polevpn/netstack/tcpip/ports"
    35  	"github.com/polevpn/netstack/tcpip/seqnum"
    36  	"github.com/polevpn/netstack/waiter"
    37  	"golang.org/x/time/rate"
    38  )
    39  
    40  const (
    41  	// ageLimit is set to the same cache stale time used in Linux.
    42  	ageLimit = 1 * time.Minute
    43  	// resolutionTimeout is set to the same ARP timeout used in Linux.
    44  	resolutionTimeout = 1 * time.Second
    45  	// resolutionAttempts is set to the same ARP retries used in Linux.
    46  	resolutionAttempts = 3
    47  
    48  	// DefaultTOS is the default type of service value for network endpoints.
    49  	DefaultTOS = 0
    50  )
    51  
    52  type transportProtocolState struct {
    53  	proto          TransportProtocol
    54  	defaultHandler func(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
    55  }
    56  
    57  // TCPProbeFunc is the expected function type for a TCP probe function to be
    58  // passed to stack.AddTCPProbe.
    59  type TCPProbeFunc func(s TCPEndpointState)
    60  
    61  // TCPCubicState is used to hold a copy of the internal cubic state when the
    62  // TCPProbeFunc is invoked.
    63  type TCPCubicState struct {
    64  	WLastMax                float64
    65  	WMax                    float64
    66  	T                       time.Time
    67  	TimeSinceLastCongestion time.Duration
    68  	C                       float64
    69  	K                       float64
    70  	Beta                    float64
    71  	WC                      float64
    72  	WEst                    float64
    73  }
    74  
    75  // TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
    76  type TCPEndpointID struct {
    77  	// LocalPort is the local port associated with the endpoint.
    78  	LocalPort uint16
    79  
    80  	// LocalAddress is the local [network layer] address associated with
    81  	// the endpoint.
    82  	LocalAddress tcpip.Address
    83  
    84  	// RemotePort is the remote port associated with the endpoint.
    85  	RemotePort uint16
    86  
    87  	// RemoteAddress it the remote [network layer] address associated with
    88  	// the endpoint.
    89  	RemoteAddress tcpip.Address
    90  }
    91  
    92  // TCPFastRecoveryState holds a copy of the internal fast recovery state of a
    93  // TCP endpoint.
    94  type TCPFastRecoveryState struct {
    95  	// Active if true indicates the endpoint is in fast recovery.
    96  	Active bool
    97  
    98  	// First is the first unacknowledged sequence number being recovered.
    99  	First seqnum.Value
   100  
   101  	// Last is the 'recover' sequence number that indicates the point at
   102  	// which we should exit recovery barring any timeouts etc.
   103  	Last seqnum.Value
   104  
   105  	// MaxCwnd is the maximum value we are permitted to grow the congestion
   106  	// window during recovery. This is set at the time we enter recovery.
   107  	MaxCwnd int
   108  
   109  	// HighRxt is the highest sequence number which has been retransmitted
   110  	// during the current loss recovery phase.
   111  	// See: RFC 6675 Section 2 for details.
   112  	HighRxt seqnum.Value
   113  
   114  	// RescueRxt is the highest sequence number which has been
   115  	// optimistically retransmitted to prevent stalling of the ACK clock
   116  	// when there is loss at the end of the window and no new data is
   117  	// available for transmission.
   118  	// See: RFC 6675 Section 2 for details.
   119  	RescueRxt seqnum.Value
   120  }
   121  
   122  // TCPReceiverState holds a copy of the internal state of the receiver for
   123  // a given TCP endpoint.
   124  type TCPReceiverState struct {
   125  	// RcvNxt is the TCP variable RCV.NXT.
   126  	RcvNxt seqnum.Value
   127  
   128  	// RcvAcc is the TCP variable RCV.ACC.
   129  	RcvAcc seqnum.Value
   130  
   131  	// RcvWndScale is the window scaling to use for inbound segments.
   132  	RcvWndScale uint8
   133  
   134  	// PendingBufUsed is the number of bytes pending in the receive
   135  	// queue.
   136  	PendingBufUsed seqnum.Size
   137  
   138  	// PendingBufSize is the size of the socket receive buffer.
   139  	PendingBufSize seqnum.Size
   140  }
   141  
   142  // TCPSenderState holds a copy of the internal state of the sender for
   143  // a given TCP Endpoint.
   144  type TCPSenderState struct {
   145  	// LastSendTime is the time at which we sent the last segment.
   146  	LastSendTime time.Time
   147  
   148  	// DupAckCount is the number of Duplicate ACK's received.
   149  	DupAckCount int
   150  
   151  	// SndCwnd is the size of the sending congestion window in packets.
   152  	SndCwnd int
   153  
   154  	// Ssthresh is the slow start threshold in packets.
   155  	Ssthresh int
   156  
   157  	// SndCAAckCount is the number of packets consumed in congestion
   158  	// avoidance mode.
   159  	SndCAAckCount int
   160  
   161  	// Outstanding is the number of packets in flight.
   162  	Outstanding int
   163  
   164  	// SndWnd is the send window size in bytes.
   165  	SndWnd seqnum.Size
   166  
   167  	// SndUna is the next unacknowledged sequence number.
   168  	SndUna seqnum.Value
   169  
   170  	// SndNxt is the sequence number of the next segment to be sent.
   171  	SndNxt seqnum.Value
   172  
   173  	// RTTMeasureSeqNum is the sequence number being used for the latest RTT
   174  	// measurement.
   175  	RTTMeasureSeqNum seqnum.Value
   176  
   177  	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
   178  	RTTMeasureTime time.Time
   179  
   180  	// Closed indicates that the caller has closed the endpoint for sending.
   181  	Closed bool
   182  
   183  	// SRTT is the smoothed round-trip time as defined in section 2 of
   184  	// RFC 6298.
   185  	SRTT time.Duration
   186  
   187  	// RTO is the retransmit timeout as defined in section of 2 of RFC 6298.
   188  	RTO time.Duration
   189  
   190  	// RTTVar is the round-trip time variation as defined in section 2 of
   191  	// RFC 6298.
   192  	RTTVar time.Duration
   193  
   194  	// SRTTInited if true indicates take a valid RTT measurement has been
   195  	// completed.
   196  	SRTTInited bool
   197  
   198  	// MaxPayloadSize is the maximum size of the payload of a given segment.
   199  	// It is initialized on demand.
   200  	MaxPayloadSize int
   201  
   202  	// SndWndScale is the number of bits to shift left when reading the send
   203  	// window size from a segment.
   204  	SndWndScale uint8
   205  
   206  	// MaxSentAck is the highest acknowledgement number sent till now.
   207  	MaxSentAck seqnum.Value
   208  
   209  	// FastRecovery holds the fast recovery state for the endpoint.
   210  	FastRecovery TCPFastRecoveryState
   211  
   212  	// Cubic holds the state related to CUBIC congestion control.
   213  	Cubic TCPCubicState
   214  }
   215  
   216  // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
   217  type TCPSACKInfo struct {
   218  	// Blocks is the list of SACK Blocks that identify the out of order segments
   219  	// held by a given TCP endpoint.
   220  	Blocks []header.SACKBlock
   221  
   222  	// ReceivedBlocks are the SACK blocks received by this endpoint
   223  	// from the peer endpoint.
   224  	ReceivedBlocks []header.SACKBlock
   225  
   226  	// MaxSACKED is the highest sequence number that has been SACKED
   227  	// by the peer.
   228  	MaxSACKED seqnum.Value
   229  }
   230  
   231  // RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
   232  type RcvBufAutoTuneParams struct {
   233  	// MeasureTime is the time at which the current measurement
   234  	// was started.
   235  	MeasureTime time.Time
   236  
   237  	// CopiedBytes is the number of bytes copied to user space since
   238  	// this measure began.
   239  	CopiedBytes int
   240  
   241  	// PrevCopiedBytes is the number of bytes copied to user space in
   242  	// the previous RTT period.
   243  	PrevCopiedBytes int
   244  
   245  	// RcvBufSize is the auto tuned receive buffer size.
   246  	RcvBufSize int
   247  
   248  	// RTT is the smoothed RTT as measured by observing the time between
   249  	// when a byte is first acknowledged and the receipt of data that is at
   250  	// least one window beyond the sequence number that was acknowledged.
   251  	RTT time.Duration
   252  
   253  	// RTTVar is the "round-trip time variation" as defined in section 2
   254  	// of RFC6298.
   255  	RTTVar time.Duration
   256  
   257  	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
   258  	// time this RTT measurement period began.
   259  	RTTMeasureSeqNumber seqnum.Value
   260  
   261  	// RTTMeasureTime is the absolute time at which the current RTT
   262  	// measurement period began.
   263  	RTTMeasureTime time.Time
   264  
   265  	// Disabled is true if an explicit receive buffer is set for the
   266  	// endpoint.
   267  	Disabled bool
   268  }
   269  
   270  // TCPEndpointState is a copy of the internal state of a TCP endpoint.
   271  type TCPEndpointState struct {
   272  	// ID is a copy of the TransportEndpointID for the endpoint.
   273  	ID TCPEndpointID
   274  
   275  	// SegTime denotes the absolute time when this segment was received.
   276  	SegTime time.Time
   277  
   278  	// RcvBufSize is the size of the receive socket buffer for the endpoint.
   279  	RcvBufSize int
   280  
   281  	// RcvBufUsed is the amount of bytes actually held in the receive socket
   282  	// buffer for the endpoint.
   283  	RcvBufUsed int
   284  
   285  	// RcvBufAutoTuneParams is used to hold state variables to compute
   286  	// the auto tuned receive buffer size.
   287  	RcvAutoParams RcvBufAutoTuneParams
   288  
   289  	// RcvClosed if true, indicates the endpoint has been closed for reading.
   290  	RcvClosed bool
   291  
   292  	// SendTSOk is used to indicate when the TS Option has been negotiated.
   293  	// When sendTSOk is true every non-RST segment should carry a TS as per
   294  	// RFC7323#section-1.1.
   295  	SendTSOk bool
   296  
   297  	// RecentTS is the timestamp that should be sent in the TSEcr field of
   298  	// the timestamp for future segments sent by the endpoint. This field is
   299  	// updated if required when a new segment is received by this endpoint.
   300  	RecentTS uint32
   301  
   302  	// TSOffset is a randomized offset added to the value of the TSVal field
   303  	// in the timestamp option.
   304  	TSOffset uint32
   305  
   306  	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
   307  	// option in the SYN/SYN-ACK.
   308  	SACKPermitted bool
   309  
   310  	// SACK holds TCP SACK related information for this endpoint.
   311  	SACK TCPSACKInfo
   312  
   313  	// SndBufSize is the size of the socket send buffer.
   314  	SndBufSize int
   315  
   316  	// SndBufUsed is the number of bytes held in the socket send buffer.
   317  	SndBufUsed int
   318  
   319  	// SndClosed indicates that the endpoint has been closed for sends.
   320  	SndClosed bool
   321  
   322  	// SndBufInQueue is the number of bytes in the send queue.
   323  	SndBufInQueue seqnum.Size
   324  
   325  	// PacketTooBigCount is used to notify the main protocol routine how
   326  	// many times a "packet too big" control packet is received.
   327  	PacketTooBigCount int
   328  
   329  	// SndMTU is the smallest MTU seen in the control packets received.
   330  	SndMTU int
   331  
   332  	// Receiver holds variables related to the TCP receiver for the endpoint.
   333  	Receiver TCPReceiverState
   334  
   335  	// Sender holds state related to the TCP Sender for the endpoint.
   336  	Sender TCPSenderState
   337  }
   338  
   339  // ResumableEndpoint is an endpoint that needs to be resumed after restore.
   340  type ResumableEndpoint interface {
   341  	// Resume resumes an endpoint after restore. This can be used to restart
   342  	// background workers such as protocol goroutines. This must be called after
   343  	// all indirect dependencies of the endpoint has been restored, which
   344  	// generally implies at the end of the restore process.
   345  	Resume(*Stack)
   346  }
   347  
   348  // uniqueIDGenerator is a default unique ID generator.
   349  type uniqueIDGenerator uint64
   350  
   351  func (u *uniqueIDGenerator) UniqueID() uint64 {
   352  	return atomic.AddUint64((*uint64)(u), 1)
   353  }
   354  
   355  // Stack is a networking stack, with all supported protocols, NICs, and route
   356  // table.
   357  type Stack struct {
   358  	transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState
   359  	networkProtocols   map[tcpip.NetworkProtocolNumber]NetworkProtocol
   360  	linkAddrResolvers  map[tcpip.NetworkProtocolNumber]LinkAddressResolver
   361  
   362  	// rawFactory creates raw endpoints. If nil, raw endpoints are
   363  	// disabled. It is set during Stack creation and is immutable.
   364  	rawFactory RawFactory
   365  
   366  	demux *transportDemuxer
   367  
   368  	stats tcpip.Stats
   369  
   370  	linkAddrCache *linkAddrCache
   371  
   372  	mu               sync.RWMutex
   373  	nics             map[tcpip.NICID]*NIC
   374  	forwarding       bool
   375  	cleanupEndpoints map[TransportEndpoint]struct{}
   376  
   377  	// route is the route table passed in by the user via SetRouteTable(),
   378  	// it is used by FindRoute() to build a route for a specific
   379  	// destination.
   380  	routeTable []tcpip.Route
   381  
   382  	*ports.PortManager
   383  
   384  	// If not nil, then any new endpoints will have this probe function
   385  	// invoked everytime they receive a TCP segment.
   386  	tcpProbeFunc TCPProbeFunc
   387  
   388  	// clock is used to generate user-visible times.
   389  	clock tcpip.Clock
   390  
   391  	// handleLocal allows non-loopback interfaces to loop packets.
   392  	handleLocal bool
   393  
   394  	// tables are the iptables packet filtering and manipulation rules.
   395  	tables iptables.IPTables
   396  
   397  	// resumableEndpoints is a list of endpoints that need to be resumed if the
   398  	// stack is being restored.
   399  	resumableEndpoints []ResumableEndpoint
   400  
   401  	// icmpRateLimiter is a global rate limiter for all ICMP messages generated
   402  	// by the stack.
   403  	icmpRateLimiter *ICMPRateLimiter
   404  
   405  	// seed is a one-time random value initialized at stack startup
   406  	// and is used to seed the TCP port picking on active connections
   407  	//
   408  	// TODO(gvisor.dev/issue/940): S/R this field.
   409  	seed uint32
   410  
   411  	// ndpConfigs is the default NDP configurations used by interfaces.
   412  	ndpConfigs NDPConfigurations
   413  
   414  	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
   415  	// to auto-generate an IPv6 link-local address for newly enabled NICs.
   416  	// See the AutoGenIPv6LinkLocal field of Options for more details.
   417  	autoGenIPv6LinkLocal bool
   418  
   419  	// ndpDisp is the NDP event dispatcher that is used to send the netstack
   420  	// integrator NDP related events.
   421  	ndpDisp NDPDispatcher
   422  
   423  	// uniqueIDGenerator is a generator of unique identifiers.
   424  	uniqueIDGenerator UniqueID
   425  }
   426  
   427  // UniqueID is an abstract generator of unique identifiers.
   428  type UniqueID interface {
   429  	UniqueID() uint64
   430  }
   431  
   432  // Options contains optional Stack configuration.
   433  type Options struct {
   434  	// NetworkProtocols lists the network protocols to enable.
   435  	NetworkProtocols []NetworkProtocol
   436  
   437  	// TransportProtocols lists the transport protocols to enable.
   438  	TransportProtocols []TransportProtocol
   439  
   440  	// Clock is an optional clock source used for timestampping packets.
   441  	//
   442  	// If no Clock is specified, the clock source will be time.Now.
   443  	Clock tcpip.Clock
   444  
   445  	// Stats are optional statistic counters.
   446  	Stats tcpip.Stats
   447  
   448  	// HandleLocal indicates whether packets destined to their source
   449  	// should be handled by the stack internally (true) or outside the
   450  	// stack (false).
   451  	HandleLocal bool
   452  
   453  	// UniqueID is an optional generator of unique identifiers.
   454  	UniqueID UniqueID
   455  
   456  	// NDPConfigs is the default NDP configurations used by interfaces.
   457  	//
   458  	// By default, NDPConfigs will have a zero value for its
   459  	// DupAddrDetectTransmits field, implying that DAD will not be performed
   460  	// before assigning an address to a NIC.
   461  	NDPConfigs NDPConfigurations
   462  
   463  	// AutoGenIPv6LinkLocal determins whether or not the stack will attempt
   464  	// to auto-generate an IPv6 link-local address for newly enabled NICs.
   465  	// Note, setting this to true does not mean that a link-local address
   466  	// will be assigned right away, or at all. If Duplicate Address
   467  	// Detection is enabled, an address will only be assigned if it
   468  	// successfully resolves. If it fails, no further attempt will be made
   469  	// to auto-generate an IPv6 link-local address.
   470  	//
   471  	// The generated link-local address will follow RFC 4291 Appendix A
   472  	// guidelines.
   473  	AutoGenIPv6LinkLocal bool
   474  
   475  	// NDPDisp is the NDP event dispatcher that an integrator can provide to
   476  	// receive NDP related events.
   477  	NDPDisp NDPDispatcher
   478  
   479  	// RawFactory produces raw endpoints. Raw endpoints are enabled only if
   480  	// this is non-nil.
   481  	RawFactory RawFactory
   482  }
   483  
   484  // TransportEndpointInfo holds useful information about a transport endpoint
   485  // which can be queried by monitoring tools.
   486  //
   487  // +stateify savable
   488  type TransportEndpointInfo struct {
   489  	// The following fields are initialized at creation time and are
   490  	// immutable.
   491  
   492  	NetProto   tcpip.NetworkProtocolNumber
   493  	TransProto tcpip.TransportProtocolNumber
   494  
   495  	// The following fields are protected by endpoint mu.
   496  
   497  	ID TransportEndpointID
   498  	// BindNICID and bindAddr are set via calls to Bind(). They are used to
   499  	// reject attempts to send data or connect via a different NIC or
   500  	// address
   501  	BindNICID tcpip.NICID
   502  	BindAddr  tcpip.Address
   503  	// RegisterNICID is the default NICID registered as a side-effect of
   504  	// connect or datagram write.
   505  	RegisterNICID tcpip.NICID
   506  }
   507  
   508  // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
   509  // marker interface.
   510  func (*TransportEndpointInfo) IsEndpointInfo() {}
   511  
   512  // New allocates a new networking stack with only the requested networking and
   513  // transport protocols configured with default options.
   514  //
   515  // Note, NDPConfigurations will be fixed before being used by the Stack. That
   516  // is, if an invalid value was provided, it will be reset to the default value.
   517  //
   518  // Protocol options can be changed by calling the
   519  // SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the
   520  // stack. Please refer to individual protocol implementations as to what options
   521  // are supported.
   522  func New(opts Options) *Stack {
   523  	clock := opts.Clock
   524  	if clock == nil {
   525  		clock = &tcpip.StdClock{}
   526  	}
   527  
   528  	if opts.UniqueID == nil {
   529  		opts.UniqueID = new(uniqueIDGenerator)
   530  	}
   531  
   532  	// Make sure opts.NDPConfigs contains valid values only.
   533  	opts.NDPConfigs.validate()
   534  
   535  	s := &Stack{
   536  		transportProtocols:   make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
   537  		networkProtocols:     make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
   538  		linkAddrResolvers:    make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
   539  		nics:                 make(map[tcpip.NICID]*NIC),
   540  		cleanupEndpoints:     make(map[TransportEndpoint]struct{}),
   541  		linkAddrCache:        newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
   542  		PortManager:          ports.NewPortManager(),
   543  		clock:                clock,
   544  		stats:                opts.Stats.FillIn(),
   545  		handleLocal:          opts.HandleLocal,
   546  		icmpRateLimiter:      NewICMPRateLimiter(),
   547  		seed:                 generateRandUint32(),
   548  		ndpConfigs:           opts.NDPConfigs,
   549  		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
   550  		uniqueIDGenerator:    opts.UniqueID,
   551  		ndpDisp:              opts.NDPDisp,
   552  	}
   553  
   554  	// Add specified network protocols.
   555  	for _, netProto := range opts.NetworkProtocols {
   556  		s.networkProtocols[netProto.Number()] = netProto
   557  		if r, ok := netProto.(LinkAddressResolver); ok {
   558  			s.linkAddrResolvers[r.LinkAddressProtocol()] = r
   559  		}
   560  	}
   561  
   562  	// Add specified transport protocols.
   563  	for _, transProto := range opts.TransportProtocols {
   564  		s.transportProtocols[transProto.Number()] = &transportProtocolState{
   565  			proto: transProto,
   566  		}
   567  	}
   568  
   569  	// Add the factory for raw endpoints, if present.
   570  	s.rawFactory = opts.RawFactory
   571  
   572  	// Create the global transport demuxer.
   573  	s.demux = newTransportDemuxer(s)
   574  
   575  	return s
   576  }
   577  
   578  // UniqueID returns a unique identifier.
   579  func (s *Stack) UniqueID() uint64 {
   580  	return s.uniqueIDGenerator.UniqueID()
   581  }
   582  
   583  // SetNetworkProtocolOption allows configuring individual protocol level
   584  // options. This method returns an error if the protocol is not supported or
   585  // option is not supported by the protocol implementation or the provided value
   586  // is incorrect.
   587  func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
   588  	netProto, ok := s.networkProtocols[network]
   589  	if !ok {
   590  		return tcpip.ErrUnknownProtocol
   591  	}
   592  	return netProto.SetOption(option)
   593  }
   594  
   595  // NetworkProtocolOption allows retrieving individual protocol level option
   596  // values. This method returns an error if the protocol is not supported or
   597  // option is not supported by the protocol implementation.
   598  // e.g.
   599  // var v ipv4.MyOption
   600  // err := s.NetworkProtocolOption(tcpip.IPv4ProtocolNumber, &v)
   601  // if err != nil {
   602  //   ...
   603  // }
   604  func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
   605  	netProto, ok := s.networkProtocols[network]
   606  	if !ok {
   607  		return tcpip.ErrUnknownProtocol
   608  	}
   609  	return netProto.Option(option)
   610  }
   611  
   612  // SetTransportProtocolOption allows configuring individual protocol level
   613  // options. This method returns an error if the protocol is not supported or
   614  // option is not supported by the protocol implementation or the provided value
   615  // is incorrect.
   616  func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
   617  	transProtoState, ok := s.transportProtocols[transport]
   618  	if !ok {
   619  		return tcpip.ErrUnknownProtocol
   620  	}
   621  	return transProtoState.proto.SetOption(option)
   622  }
   623  
   624  // TransportProtocolOption allows retrieving individual protocol level option
   625  // values. This method returns an error if the protocol is not supported or
   626  // option is not supported by the protocol implementation.
   627  // var v tcp.SACKEnabled
   628  // if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil {
   629  //   ...
   630  // }
   631  func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
   632  	transProtoState, ok := s.transportProtocols[transport]
   633  	if !ok {
   634  		return tcpip.ErrUnknownProtocol
   635  	}
   636  	return transProtoState.proto.Option(option)
   637  }
   638  
   639  // SetTransportProtocolHandler sets the per-stack default handler for the given
   640  // protocol.
   641  //
   642  // It must be called only during initialization of the stack. Changing it as the
   643  // stack is operating is not supported.
   644  func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, tcpip.PacketBuffer) bool) {
   645  	state := s.transportProtocols[p]
   646  	if state != nil {
   647  		state.defaultHandler = h
   648  	}
   649  }
   650  
   651  // NowNanoseconds implements tcpip.Clock.NowNanoseconds.
   652  func (s *Stack) NowNanoseconds() int64 {
   653  	return s.clock.NowNanoseconds()
   654  }
   655  
   656  // Stats returns a mutable copy of the current stats.
   657  //
   658  // This is not generally exported via the public interface, but is available
   659  // internally.
   660  func (s *Stack) Stats() tcpip.Stats {
   661  	return s.stats
   662  }
   663  
   664  // SetForwarding enables or disables the packet forwarding between NICs.
   665  func (s *Stack) SetForwarding(enable bool) {
   666  	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
   667  	s.mu.Lock()
   668  	s.forwarding = enable
   669  	s.mu.Unlock()
   670  }
   671  
   672  // Forwarding returns if the packet forwarding between NICs is enabled.
   673  func (s *Stack) Forwarding() bool {
   674  	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
   675  	s.mu.RLock()
   676  	defer s.mu.RUnlock()
   677  	return s.forwarding
   678  }
   679  
   680  // SetRouteTable assigns the route table to be used by this stack. It
   681  // specifies which NIC to use for given destination address ranges.
   682  func (s *Stack) SetRouteTable(table []tcpip.Route) {
   683  	s.mu.Lock()
   684  	defer s.mu.Unlock()
   685  
   686  	s.routeTable = table
   687  }
   688  
   689  // GetRouteTable returns the route table which is currently in use.
   690  func (s *Stack) GetRouteTable() []tcpip.Route {
   691  	s.mu.Lock()
   692  	defer s.mu.Unlock()
   693  	return append([]tcpip.Route(nil), s.routeTable...)
   694  }
   695  
   696  // NewEndpoint creates a new transport layer endpoint of the given protocol.
   697  func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
   698  	t, ok := s.transportProtocols[transport]
   699  	if !ok {
   700  		return nil, tcpip.ErrUnknownProtocol
   701  	}
   702  
   703  	return t.proto.NewEndpoint(s, network, waiterQueue)
   704  }
   705  
   706  // NewRawEndpoint creates a new raw transport layer endpoint of the given
   707  // protocol. Raw endpoints receive all traffic for a given protocol regardless
   708  // of address.
   709  func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) {
   710  	if s.rawFactory == nil {
   711  		return nil, tcpip.ErrNotPermitted
   712  	}
   713  
   714  	if !associated {
   715  		return s.rawFactory.NewUnassociatedEndpoint(s, network, transport, waiterQueue)
   716  	}
   717  
   718  	t, ok := s.transportProtocols[transport]
   719  	if !ok {
   720  		return nil, tcpip.ErrUnknownProtocol
   721  	}
   722  
   723  	return t.proto.NewRawEndpoint(s, network, waiterQueue)
   724  }
   725  
   726  // NewPacketEndpoint creates a new packet endpoint listening for the given
   727  // netProto.
   728  func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
   729  	if s.rawFactory == nil {
   730  		return nil, tcpip.ErrNotPermitted
   731  	}
   732  
   733  	return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue)
   734  }
   735  
   736  // createNIC creates a NIC with the provided id and link-layer endpoint, and
   737  // optionally enable it.
   738  func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error {
   739  	s.mu.Lock()
   740  	defer s.mu.Unlock()
   741  
   742  	// Make sure id is unique.
   743  	if _, ok := s.nics[id]; ok {
   744  		return tcpip.ErrDuplicateNICID
   745  	}
   746  
   747  	n := newNIC(s, id, name, ep, loopback)
   748  
   749  	s.nics[id] = n
   750  	if enabled {
   751  		return n.enable()
   752  	}
   753  
   754  	return nil
   755  }
   756  
   757  // CreateNIC creates a NIC with the provided id and link-layer endpoint.
   758  func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
   759  	return s.createNIC(id, "", ep, true, false)
   760  }
   761  
   762  // CreateNamedNIC creates a NIC with the provided id and link-layer endpoint,
   763  // and a human-readable name.
   764  func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
   765  	return s.createNIC(id, name, ep, true, false)
   766  }
   767  
   768  // CreateNamedLoopbackNIC creates a NIC with the provided id and link-layer
   769  // endpoint, and a human-readable name.
   770  func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
   771  	return s.createNIC(id, name, ep, true, true)
   772  }
   773  
   774  // CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint,
   775  // but leave it disable. Stack.EnableNIC must be called before the link-layer
   776  // endpoint starts delivering packets to it.
   777  func (s *Stack) CreateDisabledNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
   778  	return s.createNIC(id, "", ep, false, false)
   779  }
   780  
   781  // CreateDisabledNamedNIC is a combination of CreateNamedNIC and
   782  // CreateDisabledNIC.
   783  func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
   784  	return s.createNIC(id, name, ep, false, false)
   785  }
   786  
   787  // EnableNIC enables the given NIC so that the link-layer endpoint can start
   788  // delivering packets to it.
   789  func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
   790  	s.mu.RLock()
   791  	defer s.mu.RUnlock()
   792  
   793  	nic := s.nics[id]
   794  	if nic == nil {
   795  		return tcpip.ErrUnknownNICID
   796  	}
   797  
   798  	return nic.enable()
   799  }
   800  
   801  // CheckNIC checks if a NIC is usable.
   802  func (s *Stack) CheckNIC(id tcpip.NICID) bool {
   803  	s.mu.RLock()
   804  	nic, ok := s.nics[id]
   805  	s.mu.RUnlock()
   806  	if ok {
   807  		return nic.linkEP.IsAttached()
   808  	}
   809  	return false
   810  }
   811  
   812  // NICSubnets returns a map of NICIDs to their associated subnets.
   813  func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
   814  	s.mu.RLock()
   815  	defer s.mu.RUnlock()
   816  
   817  	nics := map[tcpip.NICID][]tcpip.Subnet{}
   818  
   819  	for id, nic := range s.nics {
   820  		nics[id] = append(nics[id], nic.AddressRanges()...)
   821  	}
   822  	return nics
   823  }
   824  
   825  // NICInfo captures the name, addresses, state flags, MTU and statistics of
        // a NIC, as reported by Stack.NICInfo.
   826  type NICInfo struct {
        	// Name is the NIC's name.
   827  	Name              string
        	// LinkAddress is the link address reported by the NIC's link endpoint.
   828  	LinkAddress       tcpip.LinkAddress
        	// ProtocolAddresses holds the NIC's primary protocol addresses.
   829  	ProtocolAddresses []tcpip.ProtocolAddress
   830  
   831  	// Flags indicate the state of the NIC.
   832  	Flags NICStateFlags
   833  
   834  	// MTU is the maximum transmission unit reported by the NIC's link
        	// endpoint.
   835  	MTU uint32
   836  
        	// Stats holds the NIC's statistics.
   837  	Stats NICStats
   838  }
   839  
   840  // NICInfo returns a map of NICIDs to their associated information.
   841  func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
   842  	s.mu.RLock()
   843  	defer s.mu.RUnlock()
   844  
   845  	nics := make(map[tcpip.NICID]NICInfo)
   846  	for id, nic := range s.nics {
   847  		flags := NICStateFlags{
   848  			Up:          true, // Netstack interfaces are always up.
   849  			Running:     nic.linkEP.IsAttached(),
   850  			Promiscuous: nic.isPromiscuousMode(),
   851  			Loopback:    nic.linkEP.Capabilities()&CapabilityLoopback != 0,
   852  		}
   853  		nics[id] = NICInfo{
   854  			Name:              nic.name,
   855  			LinkAddress:       nic.linkEP.LinkAddress(),
   856  			ProtocolAddresses: nic.PrimaryAddresses(),
   857  			Flags:             flags,
   858  			MTU:               nic.linkEP.MTU(),
   859  			Stats:             nic.stats,
   860  		}
   861  	}
   862  	return nics
   863  }
   864  
   865  // NICStateFlags holds information about the state of a NIC. It is populated
        // by Stack.NICInfo.
   866  type NICStateFlags struct {
   867  	// Up indicates whether the interface is running. Stack.NICInfo always
        	// reports true here.
   868  	Up bool
   869  
   870  	// Running indicates whether resources are allocated. Stack.NICInfo sets
        	// it from the link endpoint's IsAttached result.
   871  	Running bool
   872  
   873  	// Promiscuous indicates whether the interface is in promiscuous mode.
   874  	Promiscuous bool
   875  
   876  	// Loopback indicates whether the interface is a loopback, i.e. whether
        	// the link endpoint's capabilities include CapabilityLoopback.
   877  	Loopback bool
   878  }
   879  
   880  // AddAddress adds a new network-layer address to the specified NIC.
   881  func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
   882  	return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint)
   883  }
   884  
   885  // AddProtocolAddress adds a new network-layer protocol address to the
   886  // specified NIC.
   887  func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress) *tcpip.Error {
   888  	return s.AddProtocolAddressWithOptions(id, protocolAddress, CanBePrimaryEndpoint)
   889  }
   890  
   891  // AddAddressWithOptions is the same as AddAddress, but allows you to specify
   892  // whether the new endpoint can be primary or not.
   893  func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior) *tcpip.Error {
   894  	netProto, ok := s.networkProtocols[protocol]
   895  	if !ok {
   896  		return tcpip.ErrUnknownProtocol
   897  	}
   898  	return s.AddProtocolAddressWithOptions(id, tcpip.ProtocolAddress{
   899  		Protocol: protocol,
   900  		AddressWithPrefix: tcpip.AddressWithPrefix{
   901  			Address:   addr,
   902  			PrefixLen: netProto.DefaultPrefixLen(),
   903  		},
   904  	}, peb)
   905  }
   906  
   907  // AddProtocolAddressWithOptions is the same as AddProtocolAddress, but allows
   908  // you to specify whether the new endpoint can be primary or not.
   909  func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error {
   910  	s.mu.RLock()
   911  	defer s.mu.RUnlock()
   912  
   913  	nic := s.nics[id]
   914  	if nic == nil {
   915  		return tcpip.ErrUnknownNICID
   916  	}
   917  
   918  	return nic.AddAddress(protocolAddress, peb)
   919  }
   920  
   921  // AddAddressRange adds a range of addresses to the specified NIC. The range is
   922  // given by a subnet address, and all addresses contained in the subnet are
   923  // used except for the subnet address itself and the subnet's broadcast
   924  // address.
   925  func (s *Stack) AddAddressRange(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error {
   926  	s.mu.RLock()
   927  	defer s.mu.RUnlock()
   928  
   929  	if nic, ok := s.nics[id]; ok {
   930  		nic.AddAddressRange(protocol, subnet)
   931  		return nil
   932  	}
   933  
   934  	return tcpip.ErrUnknownNICID
   935  }
   936  
   937  // RemoveAddressRange removes the range of addresses from the specified NIC.
   938  func (s *Stack) RemoveAddressRange(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error {
   939  	s.mu.RLock()
   940  	defer s.mu.RUnlock()
   941  
   942  	if nic, ok := s.nics[id]; ok {
   943  		nic.RemoveAddressRange(subnet)
   944  		return nil
   945  	}
   946  
   947  	return tcpip.ErrUnknownNICID
   948  }
   949  
   950  // RemoveAddress removes an existing network-layer address from the specified
   951  // NIC.
   952  func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
   953  	s.mu.RLock()
   954  	defer s.mu.RUnlock()
   955  
   956  	if nic, ok := s.nics[id]; ok {
   957  		return nic.RemoveAddress(addr)
   958  	}
   959  
   960  	return tcpip.ErrUnknownNICID
   961  }
   962  
   963  // AllAddresses returns a map of NICIDs to their protocol addresses (primary
   964  // and non-primary).
   965  func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress {
   966  	s.mu.RLock()
   967  	defer s.mu.RUnlock()
   968  
   969  	nics := make(map[tcpip.NICID][]tcpip.ProtocolAddress)
   970  	for id, nic := range s.nics {
   971  		nics[id] = nic.AllAddresses()
   972  	}
   973  	return nics
   974  }
   975  
   976  // GetMainNICAddress returns the first primary address and prefix for the given
   977  // NIC and protocol. Returns an error if the NIC doesn't exist and an empty
   978  // value if the NIC doesn't have a primary address for the given protocol.
   979  func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, *tcpip.Error) {
   980  	s.mu.RLock()
   981  	defer s.mu.RUnlock()
   982  
   983  	nic, ok := s.nics[id]
   984  	if !ok {
   985  		return tcpip.AddressWithPrefix{}, tcpip.ErrUnknownNICID
   986  	}
   987  
   988  	for _, a := range nic.PrimaryAddresses() {
   989  		if a.Protocol == protocol {
   990  			return a.AddressWithPrefix, nil
   991  		}
   992  	}
   993  	return tcpip.AddressWithPrefix{}, nil
   994  }
   995  
   996  func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
   997  	if len(localAddr) == 0 {
   998  		return nic.primaryEndpoint(netProto)
   999  	}
  1000  	return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint)
  1001  }
  1002  
  1003  // FindRoute creates a route to the given destination address, leaving through
  1004  // the given nic and local address (if provided).
  1005  func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) {
  1006  	s.mu.RLock()
  1007  	defer s.mu.RUnlock()
  1008  
  1009  	isBroadcast := remoteAddr == header.IPv4Broadcast
  1010  	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
  1011  	needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
  1012  	if id != 0 && !needRoute {
  1013  		if nic, ok := s.nics[id]; ok {
  1014  			if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
  1015  				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback), nil
  1016  			}
  1017  		}
  1018  	} else {
  1019  		for _, route := range s.routeTable {
  1020  			if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr)) {
  1021  				continue
  1022  			}
  1023  			if nic, ok := s.nics[route.NIC]; ok {
  1024  				if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
  1025  					if len(remoteAddr) == 0 {
  1026  						// If no remote address was provided, then the route
  1027  						// provided will refer to the link local address.
  1028  						remoteAddr = ref.ep.ID().LocalAddress
  1029  					}
  1030  
  1031  					r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback)
  1032  					if needRoute {
  1033  						r.NextHop = route.Gateway
  1034  					}
  1035  					return r, nil
  1036  				}
  1037  			}
  1038  		}
  1039  	}
  1040  
  1041  	if !needRoute {
  1042  		return Route{}, tcpip.ErrNetworkUnreachable
  1043  	}
  1044  
  1045  	return Route{}, tcpip.ErrNoRoute
  1046  }
  1047  
  1048  // CheckNetworkProtocol checks if a given network protocol is enabled in the
  1049  // stack.
  1050  func (s *Stack) CheckNetworkProtocol(protocol tcpip.NetworkProtocolNumber) bool {
  1051  	_, ok := s.networkProtocols[protocol]
  1052  	return ok
  1053  }
  1054  
  1055  // CheckLocalAddress determines if the given local address exists, and if it
  1056  // does, returns the id of the NIC it's bound to. Returns 0 if the address
  1057  // does not exist.
  1058  func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID {
  1059  	s.mu.RLock()
  1060  	defer s.mu.RUnlock()
  1061  
  1062  	// If a NIC is specified, we try to find the address there only.
  1063  	if nicID != 0 {
  1064  		nic := s.nics[nicID]
  1065  		if nic == nil {
  1066  			return 0
  1067  		}
  1068  
  1069  		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
  1070  		if ref == nil {
  1071  			return 0
  1072  		}
  1073  
  1074  		ref.decRef()
  1075  
  1076  		return nic.id
  1077  	}
  1078  
  1079  	// Go through all the NICs.
  1080  	for _, nic := range s.nics {
  1081  		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
  1082  		if ref != nil {
  1083  			ref.decRef()
  1084  			return nic.id
  1085  		}
  1086  	}
  1087  
  1088  	return 0
  1089  }
  1090  
  1091  // SetPromiscuousMode enables or disables promiscuous mode in the given NIC.
  1092  func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) *tcpip.Error {
  1093  	s.mu.RLock()
  1094  	defer s.mu.RUnlock()
  1095  
  1096  	nic := s.nics[nicID]
  1097  	if nic == nil {
  1098  		return tcpip.ErrUnknownNICID
  1099  	}
  1100  
  1101  	nic.setPromiscuousMode(enable)
  1102  
  1103  	return nil
  1104  }
  1105  
  1106  // SetSpoofing enables or disables address spoofing in the given NIC, allowing
  1107  // endpoints to bind to any address in the NIC.
  1108  func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error {
  1109  	s.mu.RLock()
  1110  	defer s.mu.RUnlock()
  1111  
  1112  	nic := s.nics[nicID]
  1113  	if nic == nil {
  1114  		return tcpip.ErrUnknownNICID
  1115  	}
  1116  
  1117  	nic.setSpoofing(enable)
  1118  
  1119  	return nil
  1120  }
  1121  
  1122  // AddLinkAddress adds a link address to the stack link cache.
  1123  func (s *Stack) AddLinkAddress(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) {
  1124  	fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
  1125  	s.linkAddrCache.add(fullAddr, linkAddr)
  1126  	// TODO: provide a way for a transport endpoint to receive a signal
  1127  	// that AddLinkAddress for a particular address has been called.
  1128  }
  1129  
  1130  // GetLinkAddress implements LinkAddressCache.GetLinkAddress.
  1131  func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
  1132  	s.mu.RLock()
  1133  	nic := s.nics[nicID]
  1134  	if nic == nil {
  1135  		s.mu.RUnlock()
  1136  		return "", nil, tcpip.ErrUnknownNICID
  1137  	}
  1138  	s.mu.RUnlock()
  1139  
  1140  	fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
  1141  	linkRes := s.linkAddrResolvers[protocol]
  1142  	return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, waker)
  1143  }
  1144  
  1145  // RemoveWaker implements LinkAddressCache.RemoveWaker.
  1146  func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) {
  1147  	s.mu.RLock()
  1148  	defer s.mu.RUnlock()
  1149  
  1150  	if nic := s.nics[nicID]; nic == nil {
  1151  		fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
  1152  		s.linkAddrCache.removeWaker(fullAddr, waker)
  1153  	}
  1154  }
  1155  
  1156  // RegisterTransportEndpoint registers the given endpoint with the stack
  1157  // transport dispatcher. Received packets that match the provided id will be
  1158  // delivered to the given endpoint; specifying a nic is optional, but
  1159  // nic-specific IDs have precedence over global ones.
  1160  func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
  1161  	return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort, bindToDevice)
  1162  }
  1163  
  1164  // UnregisterTransportEndpoint removes the endpoint with the given id from the
  1165  // stack transport dispatcher.
  1166  func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
  1167  	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice)
  1168  }
  1169  
  1170  // StartTransportEndpointCleanup removes the endpoint with the given id from
  1171  // the stack transport dispatcher. It also transitions it to the cleanup stage.
  1172  func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
  1173  	s.mu.Lock()
  1174  	defer s.mu.Unlock()
  1175  
  1176  	s.cleanupEndpoints[ep] = struct{}{}
  1177  
  1178  	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice)
  1179  }
  1180  
  1181  // CompleteTransportEndpointCleanup removes the endpoint from the cleanup
  1182  // stage.
  1183  func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) {
  1184  	s.mu.Lock()
  1185  	delete(s.cleanupEndpoints, ep)
  1186  	s.mu.Unlock()
  1187  }
  1188  
  1189  // FindTransportEndpoint finds an endpoint that most closely matches the provided
  1190  // id. If no endpoint is found it returns nil.
  1191  func (s *Stack) FindTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
  1192  	return s.demux.findTransportEndpoint(netProto, transProto, id, r)
  1193  }
  1194  
  1195  // RegisterRawTransportEndpoint registers the given endpoint with the stack
  1196  // transport dispatcher. Received packets that match the provided transport
  1197  // protocol will be delivered to the given endpoint.
  1198  func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error {
  1199  	return s.demux.registerRawEndpoint(netProto, transProto, ep)
  1200  }
  1201  
  1202  // UnregisterRawTransportEndpoint removes the endpoint for the transport
  1203  // protocol from the stack transport dispatcher.
  1204  func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) {
  1205  	s.demux.unregisterRawEndpoint(netProto, transProto, ep)
  1206  }
  1207  
  1208  // RegisterRestoredEndpoint records e as an endpoint that has been restored on
  1209  // this stack.
  1210  func (s *Stack) RegisterRestoredEndpoint(e ResumableEndpoint) {
  1211  	s.mu.Lock()
  1212  	s.resumableEndpoints = append(s.resumableEndpoints, e)
  1213  	s.mu.Unlock()
  1214  }
  1215  
  1216  // RegisteredEndpoints returns all endpoints which are currently registered.
  1217  func (s *Stack) RegisteredEndpoints() []TransportEndpoint {
  1218  	s.mu.Lock()
  1219  	defer s.mu.Unlock()
  1220  	var es []TransportEndpoint
  1221  	for _, e := range s.demux.protocol {
  1222  		es = append(es, e.transportEndpoints()...)
  1223  	}
  1224  	return es
  1225  }
  1226  
  1227  // CleanupEndpoints returns endpoints currently in the cleanup state.
  1228  func (s *Stack) CleanupEndpoints() []TransportEndpoint {
  1229  	s.mu.Lock()
  1230  	es := make([]TransportEndpoint, 0, len(s.cleanupEndpoints))
  1231  	for e := range s.cleanupEndpoints {
  1232  		es = append(es, e)
  1233  	}
  1234  	s.mu.Unlock()
  1235  	return es
  1236  }
  1237  
  1238  // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
  1239  // for restoring a stack after a save.
  1240  func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
  1241  	s.mu.Lock()
  1242  	for _, e := range es {
  1243  		s.cleanupEndpoints[e] = struct{}{}
  1244  	}
  1245  	s.mu.Unlock()
  1246  }
  1247  
  1248  // Close closes all currently registered transport endpoints.
  1249  //
  1250  // Endpoints created or modified during this call may not get closed.
  1251  func (s *Stack) Close() {
  1252  	for _, e := range s.RegisteredEndpoints() {
  1253  		e.Close()
  1254  	}
  1255  }
  1256  
  1257  // Wait waits for all transport and link endpoints to halt their worker
  1258  // goroutines.
  1259  //
  1260  // Endpoints created or modified during this call may not get waited on.
  1261  //
  1262  // Note that link endpoints must be stopped via an implementation specific
  1263  // mechanism.
  1264  func (s *Stack) Wait() {
  1265  	for _, e := range s.RegisteredEndpoints() {
  1266  		e.Wait()
  1267  	}
  1268  	for _, e := range s.CleanupEndpoints() {
  1269  		e.Wait()
  1270  	}
  1271  
  1272  	s.mu.RLock()
  1273  	defer s.mu.RUnlock()
  1274  	for _, n := range s.nics {
  1275  		n.linkEP.Wait()
  1276  	}
  1277  }
  1278  
  1279  // Resume restarts the stack after a restore. This must be called after the
  1280  // entire system has been restored.
  1281  func (s *Stack) Resume() {
  1282  	// ResumableEndpoint.Resume() may call other methods on s, so we can't hold
  1283  	// s.mu while resuming the endpoints.
  1284  	s.mu.Lock()
  1285  	eps := s.resumableEndpoints
  1286  	s.resumableEndpoints = nil
  1287  	s.mu.Unlock()
  1288  	for _, e := range eps {
  1289  		e.Resume(s)
  1290  	}
  1291  }
  1292  
  1293  // RegisterPacketEndpoint registers ep with the stack, causing it to receive
  1294  // all traffic of the specified netProto on the given NIC. If nicID is 0, it
  1295  // receives traffic from every NIC.
  1296  func (s *Stack) RegisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
  1297  	s.mu.Lock()
  1298  	defer s.mu.Unlock()
  1299  
  1300  	// If no NIC is specified, capture on all devices.
  1301  	if nicID == 0 {
  1302  		// Register with each NIC.
  1303  		for _, nic := range s.nics {
  1304  			if err := nic.registerPacketEndpoint(netProto, ep); err != nil {
  1305  				s.unregisterPacketEndpointLocked(0, netProto, ep)
  1306  				return err
  1307  			}
  1308  		}
  1309  		return nil
  1310  	}
  1311  
  1312  	// Capture on a specific device.
  1313  	nic, ok := s.nics[nicID]
  1314  	if !ok {
  1315  		return tcpip.ErrUnknownNICID
  1316  	}
  1317  	if err := nic.registerPacketEndpoint(netProto, ep); err != nil {
  1318  		return err
  1319  	}
  1320  
  1321  	return nil
  1322  }
  1323  
  1324  // UnregisterPacketEndpoint unregisters ep for packets of the specified
  1325  // netProto from the specified NIC. If nicID is 0, ep is unregistered from all
  1326  // NICs.
  1327  func (s *Stack) UnregisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
  1328  	s.mu.Lock()
  1329  	defer s.mu.Unlock()
  1330  	s.unregisterPacketEndpointLocked(nicID, netProto, ep)
  1331  }
  1332  
  1333  func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
  1334  	// If no NIC is specified, unregister on all devices.
  1335  	if nicID == 0 {
  1336  		// Unregister with each NIC.
  1337  		for _, nic := range s.nics {
  1338  			nic.unregisterPacketEndpoint(netProto, ep)
  1339  		}
  1340  		return
  1341  	}
  1342  
  1343  	// Unregister in a single device.
  1344  	nic, ok := s.nics[nicID]
  1345  	if !ok {
  1346  		return
  1347  	}
  1348  	nic.unregisterPacketEndpoint(netProto, ep)
  1349  }
  1350  
  1351  // WritePacket writes data directly to the specified NIC. It adds an ethernet
  1352  // header based on the arguments.
  1353  func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error {
  1354  	s.mu.Lock()
  1355  	nic, ok := s.nics[nicID]
  1356  	s.mu.Unlock()
  1357  	if !ok {
  1358  		return tcpip.ErrUnknownDevice
  1359  	}
  1360  
  1361  	// Add our own fake ethernet header.
  1362  	ethFields := header.EthernetFields{
  1363  		SrcAddr: nic.linkEP.LinkAddress(),
  1364  		DstAddr: dst,
  1365  		Type:    netProto,
  1366  	}
  1367  	fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
  1368  	fakeHeader.Encode(&ethFields)
  1369  	vv := buffer.View(fakeHeader).ToVectorisedView()
  1370  	vv.Append(payload)
  1371  
  1372  	if err := nic.linkEP.WriteRawPacket(vv); err != nil {
  1373  		return err
  1374  	}
  1375  
  1376  	return nil
  1377  }
  1378  
  1379  // WriteRawPacket writes data directly to the specified NIC without adding any
  1380  // headers.
  1381  func (s *Stack) WriteRawPacket(nicID tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error {
  1382  	s.mu.Lock()
  1383  	nic, ok := s.nics[nicID]
  1384  	s.mu.Unlock()
  1385  	if !ok {
  1386  		return tcpip.ErrUnknownDevice
  1387  	}
  1388  
  1389  	if err := nic.linkEP.WriteRawPacket(payload); err != nil {
  1390  		return err
  1391  	}
  1392  
  1393  	return nil
  1394  }
  1395  
  1396  // NetworkProtocolInstance returns the protocol instance in the stack for the
  1397  // specified network protocol. This method is public for protocol implementers
  1398  // and tests to use.
  1399  func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol {
  1400  	if p, ok := s.networkProtocols[num]; ok {
  1401  		return p
  1402  	}
  1403  	return nil
  1404  }
  1405  
  1406  // TransportProtocolInstance returns the protocol instance in the stack for the
  1407  // specified transport protocol. This method is public for protocol implementers
  1408  // and tests to use.
  1409  func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol {
  1410  	if pState, ok := s.transportProtocols[num]; ok {
  1411  		return pState.proto
  1412  	}
  1413  	return nil
  1414  }
  1415  
  1416  // AddTCPProbe installs a probe function that will be invoked on every segment
  1417  // received by a given TCP endpoint. The probe function is passed a copy of the
  1418  // TCP endpoint state before and after processing of the segment.
  1419  //
  1420  // NOTE: TCPProbe is added only to endpoints created after this call. Endpoints
  1421  // created prior to this call will not call the probe function.
  1422  //
  1423  // Further, installing two different probes back to back can result in some
  1424  // endpoints calling the first one and some the second one. There is no
  1425  // guarantee provided on which probe will be invoked. Ideally this should only
  1426  // be called once per stack.
  1427  func (s *Stack) AddTCPProbe(probe TCPProbeFunc) {
  1428  	s.mu.Lock()
  1429  	s.tcpProbeFunc = probe
  1430  	s.mu.Unlock()
  1431  }
  1432  
  1433  // GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil
  1434  // otherwise.
  1435  func (s *Stack) GetTCPProbe() TCPProbeFunc {
  1436  	s.mu.Lock()
  1437  	p := s.tcpProbeFunc
  1438  	s.mu.Unlock()
  1439  	return p
  1440  }
  1441  
  1442  // RemoveTCPProbe removes an installed TCP probe.
  1443  //
  1444  // NOTE: This only ensures that endpoints created after this call do not
  1445  // have a probe attached. Endpoints already created will continue to invoke
  1446  // TCP probe.
  1447  func (s *Stack) RemoveTCPProbe() {
  1448  	s.mu.Lock()
  1449  	s.tcpProbeFunc = nil
  1450  	s.mu.Unlock()
  1451  }
  1452  
  1453  // JoinGroup joins the given multicast group on the given NIC.
  1454  func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error {
  1455  	// TODO: notify network of subscription via igmp protocol.
  1456  	s.mu.RLock()
  1457  	defer s.mu.RUnlock()
  1458  
  1459  	if nic, ok := s.nics[nicID]; ok {
  1460  		return nic.joinGroup(protocol, multicastAddr)
  1461  	}
  1462  	return tcpip.ErrUnknownNICID
  1463  }
  1464  
  1465  // LeaveGroup leaves the given multicast group on the given NIC.
  1466  func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error {
  1467  	s.mu.RLock()
  1468  	defer s.mu.RUnlock()
  1469  
  1470  	if nic, ok := s.nics[nicID]; ok {
  1471  		return nic.leaveGroup(multicastAddr)
  1472  	}
  1473  	return tcpip.ErrUnknownNICID
  1474  }
  1475  
  1476  // IPTables returns the stack's iptables. The value is read without taking
        // the stack lock.
  1477  func (s *Stack) IPTables() iptables.IPTables {
  1478  	return s.tables
  1479  }
  1480  
  1481  // SetIPTables sets the stack's iptables to ipt. The value is written
        // without taking the stack lock.
  1482  func (s *Stack) SetIPTables(ipt iptables.IPTables) {
  1483  	s.tables = ipt
  1484  }
  1485  
  1486  // ICMPLimit returns the maximum number of ICMP messages that can be sent
  1487  // in one second, as configured on the stack's ICMP rate limiter.
  1488  func (s *Stack) ICMPLimit() rate.Limit {
  1489  	return s.icmpRateLimiter.Limit()
  1490  }
  1491  
  1492  // SetICMPLimit sets the maximum number of ICMP messages that can be sent
  1493  // in one second.
  1494  func (s *Stack) SetICMPLimit(newLimit rate.Limit) {
  1495  	s.icmpRateLimiter.SetLimit(newLimit)
  1496  }
  1497  
  1498  // ICMPBurst returns the maximum number of ICMP messages that can be sent
  1499  // in a single burst, as configured on the stack's ICMP rate limiter.
  1500  func (s *Stack) ICMPBurst() int {
  1501  	return s.icmpRateLimiter.Burst()
  1502  }
  1503  
  1504  // SetICMPBurst sets the maximum number of ICMP messages that can be sent
  1505  // in a single burst to burst.
  1506  func (s *Stack) SetICMPBurst(burst int) {
  1507  	s.icmpRateLimiter.SetBurst(burst)
  1508  }
  1509  
  1510  // AllowICMPMessage returns true if the rate limiter allows at least one
  1511  // ICMP message to be sent at this instant.
  1512  func (s *Stack) AllowICMPMessage() bool {
  1513  	return s.icmpRateLimiter.Allow()
  1514  }
  1515  
  1516  // IsAddrTentative returns true if addr is tentative on the NIC with ID id.
  1517  //
  1518  // Note that if addr is not associated with a NIC with id ID, then this
  1519  // function will return false. It will only return true if the address is
  1520  // associated with the NIC AND it is tentative.
  1521  func (s *Stack) IsAddrTentative(id tcpip.NICID, addr tcpip.Address) (bool, *tcpip.Error) {
  1522  	s.mu.RLock()
  1523  	defer s.mu.RUnlock()
  1524  
  1525  	nic, ok := s.nics[id]
  1526  	if !ok {
  1527  		return false, tcpip.ErrUnknownNICID
  1528  	}
  1529  
  1530  	return nic.isAddrTentative(addr), nil
  1531  }
  1532  
  1533  // DupTentativeAddrDetected attempts to inform the NIC with ID id that a
  1534  // tentative addr on it is a duplicate on a link.
  1535  func (s *Stack) DupTentativeAddrDetected(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
  1536  	s.mu.Lock()
  1537  	defer s.mu.Unlock()
  1538  
  1539  	nic, ok := s.nics[id]
  1540  	if !ok {
  1541  		return tcpip.ErrUnknownNICID
  1542  	}
  1543  
  1544  	return nic.dupTentativeAddrDetected(addr)
  1545  }
  1546  
  1547  // SetNDPConfigurations sets the per-interface NDP configurations on the NIC
  1548  // with ID id to c.
  1549  //
  1550  // Note, if c contains invalid NDP configuration values, it will be fixed to
  1551  // use default values for the erroneous values.
  1552  func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip.Error {
  1553  	s.mu.Lock()
  1554  	defer s.mu.Unlock()
  1555  
  1556  	nic, ok := s.nics[id]
  1557  	if !ok {
  1558  		return tcpip.ErrUnknownNICID
  1559  	}
  1560  
  1561  	nic.setNDPConfigs(c)
  1562  
  1563  	return nil
  1564  }
  1565  
  1566  // HandleNDPRA provides a NIC with ID id a validated NDP Router Advertisement
  1567  // message that it needs to handle.
  1568  func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRouterAdvert) *tcpip.Error {
  1569  	s.mu.Lock()
  1570  	defer s.mu.Unlock()
  1571  
  1572  	nic, ok := s.nics[id]
  1573  	if !ok {
  1574  		return tcpip.ErrUnknownNICID
  1575  	}
  1576  
  1577  	nic.handleNDPRA(ip, ra)
  1578  
  1579  	return nil
  1580  }
  1581  
  1582  // Seed returns a 32 bit value that can be used as a seed value for port
  1583  // picking, ISN generation etc.
  1584  //
  1585  // NOTE: The seed is generated once during stack initialization only; this
        // method just returns the stored value.
  1586  func (s *Stack) Seed() uint32 {
  1587  	return s.seed
  1588  }
  1589  
  1590  func generateRandUint32() uint32 {
  1591  	b := make([]byte, 4)
  1592  	if _, err := rand.Read(b); err != nil {
  1593  		panic(err)
  1594  	}
  1595  	return binary.LittleEndian.Uint32(b)
  1596  }