github.com/FlowerWrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/tcpip.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tcpip provides the interfaces and related types that users of the
    16  // tcpip stack will use in order to create endpoints used to send and receive
    17  // data over the network stack.
    18  //
    19  // The starting point is the creation and configuration of a stack. A stack can
    20  // be created by calling the New() function of the tcpip/stack/stack package;
    21  // configuring a stack involves creating NICs (via calls to Stack.CreateNIC()),
    22  // adding network addresses (via calls to Stack.AddAddress()), and
    23  // setting a route table (via a call to Stack.SetRouteTable()).
    24  //
    25  // Once a stack is configured, endpoints can be created by calling
    26  // Stack.NewEndpoint(). Such endpoints can be used to send/receive data, connect
    27  // to peers, listen for connections, accept connections, etc., depending on the
    28  // transport protocol selected.
    29  package tcpip
    30  
    31  import (
    32  	"errors"
    33  	"fmt"
    34  	"math/bits"
    35  	"reflect"
    36  	"strconv"
    37  	"strings"
    38  	"sync"
    39  	"sync/atomic"
    40  	"time"
    41  
    42  	"github.com/FlowerWrong/netstack/tcpip/buffer"
    43  	"github.com/FlowerWrong/netstack/tcpip/iptables"
    44  	"github.com/FlowerWrong/netstack/waiter"
    45  )
    46  
    47  // Error represents an error in the netstack error space. Using a special type
    48  // ensures that errors outside of this space are not accidentally introduced.
    49  //
    50  // Note: to support save / restore, it is important that all tcpip errors have
    51  // distinct error messages.
    52  type Error struct {
    53  	msg string
    54  
    55  	ignoreStats bool
    56  }
    57  
    58  // String implements fmt.Stringer.String.
    59  func (e *Error) String() string {
    60  	if e == nil {
    61  		return "<nil>"
    62  	}
    63  	return e.msg
    64  }
    65  
    66  // IgnoreStats indicates whether this error type should be included in failure
    67  // counts in tcpip.Stats structs.
    68  func (e *Error) IgnoreStats() bool {
    69  	return e.ignoreStats
    70  }
    71  
// Errors that can be returned by the network stack.
//
// Each value is a distinct *Error, so callers can compare against these
// variables by pointer. Errors constructed with ignoreStats set report true
// from IgnoreStats.
var (
	ErrUnknownProtocol           = &Error{msg: "unknown protocol"}
	ErrUnknownNICID              = &Error{msg: "unknown nic id"}
	ErrUnknownDevice             = &Error{msg: "unknown device"}
	ErrUnknownProtocolOption     = &Error{msg: "unknown option for protocol"}
	ErrDuplicateNICID            = &Error{msg: "duplicate nic id"}
	ErrDuplicateAddress          = &Error{msg: "duplicate address"}
	ErrNoRoute                   = &Error{msg: "no route"}
	ErrBadLinkEndpoint           = &Error{msg: "bad link layer endpoint"}
	ErrAlreadyBound              = &Error{msg: "endpoint already bound", ignoreStats: true}
	ErrInvalidEndpointState      = &Error{msg: "endpoint is in invalid state"}
	ErrAlreadyConnecting         = &Error{msg: "endpoint is already connecting", ignoreStats: true}
	ErrAlreadyConnected          = &Error{msg: "endpoint is already connected", ignoreStats: true}
	ErrNoPortAvailable           = &Error{msg: "no ports are available"}
	ErrPortInUse                 = &Error{msg: "port is in use"}
	ErrBadLocalAddress           = &Error{msg: "bad local address"}
	ErrClosedForSend             = &Error{msg: "endpoint is closed for send"}
	ErrClosedForReceive          = &Error{msg: "endpoint is closed for receive"}
	ErrWouldBlock                = &Error{msg: "operation would block", ignoreStats: true}
	ErrConnectionRefused         = &Error{msg: "connection was refused"}
	ErrTimeout                   = &Error{msg: "operation timed out"}
	ErrAborted                   = &Error{msg: "operation aborted"}
	ErrConnectStarted            = &Error{msg: "connection attempt started", ignoreStats: true}
	ErrDestinationRequired       = &Error{msg: "destination address is required"}
	ErrNotSupported              = &Error{msg: "operation not supported"}
	ErrQueueSizeNotSupported     = &Error{msg: "queue size querying not supported"}
	ErrNotConnected              = &Error{msg: "endpoint not connected"}
	ErrConnectionReset           = &Error{msg: "connection reset by peer"}
	ErrConnectionAborted         = &Error{msg: "connection aborted"}
	ErrNoSuchFile                = &Error{msg: "no such file"}
	ErrInvalidOptionValue        = &Error{msg: "invalid option value specified"}
	ErrNoLinkAddress             = &Error{msg: "no remote link address"}
	ErrBadAddress                = &Error{msg: "bad address"}
	ErrNetworkUnreachable        = &Error{msg: "network is unreachable"}
	ErrMessageTooLong            = &Error{msg: "message too long"}
	ErrNoBufferSpace             = &Error{msg: "no buffer space available"}
	ErrBroadcastDisabled         = &Error{msg: "broadcast socket option disabled"}
	ErrNotPermitted              = &Error{msg: "operation not permitted"}
	ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"}
)
   113  
// Errors returned by NewSubnet when the address and mask it is given are
// inconsistent with each other.
var (
	errSubnetLengthMismatch = errors.New("subnet length of address and mask differ")
	errSubnetAddressMasked  = errors.New("subnet address has bits set outside the mask")
)
   119  
   120  // ErrSaveRejection indicates a failed save due to unsupported networking state.
   121  // This type of errors is only used for save logic.
   122  type ErrSaveRejection struct {
   123  	Err error
   124  }
   125  
   126  // Error returns a sensible description of the save rejection error.
   127  func (e ErrSaveRejection) Error() string {
   128  	return "save rejected due to unsupported networking state: " + e.Err.Error()
   129  }
   130  
// A Clock provides the current time.
//
// Times returned by a Clock should always be used for application-visible
// time. Only monotonic times should be used for netstack internal timekeeping.
type Clock interface {
	// NowNanoseconds returns the current real time as a number of
	// nanoseconds since the Unix epoch.
	NowNanoseconds() int64

	// NowMonotonic returns a monotonic time value.
	//
	// NOTE(review): the unit and origin of the returned value are not
	// stated here — presumably nanoseconds; confirm against implementations.
	NowMonotonic() int64
}
   143  
// Address is a byte slice cast as a string that represents the address of a
// network node or, in the case of unix endpoints, a path.
type Address string
   147  
   148  // AddressMask is a bitmask for an address.
   149  type AddressMask string
   150  
   151  // String implements Stringer.
   152  func (m AddressMask) String() string {
   153  	return Address(m).String()
   154  }
   155  
   156  // Prefix returns the number of bits before the first host bit.
   157  func (m AddressMask) Prefix() int {
   158  	p := 0
   159  	for _, b := range []byte(m) {
   160  		p += bits.LeadingZeros8(^b)
   161  	}
   162  	return p
   163  }
   164  
   165  // Subnet is a subnet defined by its address and mask.
   166  type Subnet struct {
   167  	address Address
   168  	mask    AddressMask
   169  }
   170  
   171  // NewSubnet creates a new Subnet, checking that the address and mask are the same length.
   172  func NewSubnet(a Address, m AddressMask) (Subnet, error) {
   173  	if len(a) != len(m) {
   174  		return Subnet{}, errSubnetLengthMismatch
   175  	}
   176  	for i := 0; i < len(a); i++ {
   177  		if a[i]&^m[i] != 0 {
   178  			return Subnet{}, errSubnetAddressMasked
   179  		}
   180  	}
   181  	return Subnet{a, m}, nil
   182  }
   183  
   184  // String implements Stringer.
   185  func (s Subnet) String() string {
   186  	return fmt.Sprintf("%s/%d", s.ID(), s.Prefix())
   187  }
   188  
   189  // Contains returns true iff the address is of the same length and matches the
   190  // subnet address and mask.
   191  func (s *Subnet) Contains(a Address) bool {
   192  	if len(a) != len(s.address) {
   193  		return false
   194  	}
   195  	for i := 0; i < len(a); i++ {
   196  		if a[i]&s.mask[i] != s.address[i] {
   197  			return false
   198  		}
   199  	}
   200  	return true
   201  }
   202  
   203  // ID returns the subnet ID.
   204  func (s *Subnet) ID() Address {
   205  	return s.address
   206  }
   207  
   208  // Bits returns the number of ones (network bits) and zeros (host bits) in the
   209  // subnet mask.
   210  func (s *Subnet) Bits() (ones int, zeros int) {
   211  	ones = s.mask.Prefix()
   212  	return ones, len(s.mask)*8 - ones
   213  }
   214  
   215  // Prefix returns the number of bits before the first host bit.
   216  func (s *Subnet) Prefix() int {
   217  	return s.mask.Prefix()
   218  }
   219  
   220  // Mask returns the subnet mask.
   221  func (s *Subnet) Mask() AddressMask {
   222  	return s.mask
   223  }
   224  
   225  // Broadcast returns the subnet's broadcast address.
   226  func (s *Subnet) Broadcast() Address {
   227  	addr := []byte(s.address)
   228  	for i := range addr {
   229  		addr[i] |= ^s.mask[i]
   230  	}
   231  	return Address(addr)
   232  }
   233  
// NICID is a number that uniquely identifies a NIC.
type NICID int32

// ShutdownFlags represents flags that can be passed to the Shutdown() method
// of the Endpoint interface.
type ShutdownFlags int

// Values of the flags that can be passed to the Shutdown() method. They can
// be OR'ed together: each flag occupies its own bit (ShutdownRead is 1 and
// ShutdownWrite is 2).
const (
	ShutdownRead ShutdownFlags = 1 << iota
	ShutdownWrite
)

// FullAddress represents a full transport node address, as required by the
// Connect() and Bind() methods.
//
// +stateify savable
type FullAddress struct {
	// NIC is the ID of the NIC this address refers to.
	//
	// This may not be used by all endpoint types.
	NIC NICID

	// Addr is the network address.
	Addr Address

	// Port is the transport port.
	//
	// This may not be used by all endpoint types.
	Port uint16
}
   266  
// Payloader is an interface that provides data.
//
// This interface allows the endpoint to request the amount of data it needs
// based on internal buffers without exposing them.
type Payloader interface {
	// FullPayload returns all available bytes, or an error if they cannot
	// be provided.
	FullPayload() ([]byte, *Error)

	// Payload returns a slice containing at most size bytes, or an error
	// if the data cannot be provided.
	Payload(size int) ([]byte, *Error)
}
   278  
   279  // SlicePayload implements Payloader for slices.
   280  //
   281  // This is typically used for tests.
   282  type SlicePayload []byte
   283  
   284  // FullPayload implements Payloader.FullPayload.
   285  func (s SlicePayload) FullPayload() ([]byte, *Error) {
   286  	return s, nil
   287  }
   288  
   289  // Payload implements Payloader.Payload.
   290  func (s SlicePayload) Payload(size int) ([]byte, *Error) {
   291  	if size > len(s) {
   292  		size = len(s)
   293  	}
   294  	return s[:size], nil
   295  }
   296  
// A ControlMessages contains socket control messages for IP sockets.
//
// +stateify savable
type ControlMessages struct {
	// HasTimestamp indicates whether Timestamp is valid/set.
	HasTimestamp bool

	// Timestamp is the time (in ns) that the last packet used to create
	// the read data was received.
	Timestamp int64

	// HasInq indicates whether Inq is valid/set.
	HasInq bool

	// Inq is the number of bytes ready to be received.
	Inq int32
}
   314  
// Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
// that exposes functionality like read, write, connect, etc. to users of the
// networking stack.
type Endpoint interface {
	// Close puts the endpoint in a closed state and frees all resources
	// associated with it.
	Close()

	// Read reads data from the endpoint and optionally returns the sender.
	//
	// This method does not block if there is no data pending. It will also
	// either return an error or data, never both.
	Read(*FullAddress) (buffer.View, ControlMessages, *Error)

	// Write writes data to the endpoint's peer. This method does not block if
	// the data cannot be written.
	//
	// Unlike io.Writer.Write, Endpoint.Write transfers ownership of any bytes
	// successfully written to the Endpoint. That is, if a call to
	// Write(SlicePayload{data}) returns (n, err), it may retain data[:n], and
	// the caller should not use data[:n] after Write returns.
	//
	// Note that unlike io.Writer.Write, it is not an error for Write to
	// perform a partial write (if n > 0, no error may be returned). Only
	// stream (TCP) Endpoints may return partial writes, and even then only
	// in the case where writing additional data would block. Other Endpoints
	// will either write the entire message or return an error.
	//
	// For UDP and Ping sockets if address resolution is required,
	// ErrNoLinkAddress and a notification channel are returned for the caller
	// to block. Channel is closed once address resolution is complete (success
	// or not). The channel is only non-nil in this case.
	Write(Payloader, WriteOptions) (int64, <-chan struct{}, *Error)

	// Peek reads data without consuming it from the endpoint.
	//
	// This method does not block if there is no data pending.
	Peek([][]byte) (int64, ControlMessages, *Error)

	// Connect connects the endpoint to its peer. Specifying a NIC is
	// optional.
	//
	// There are three classes of return values:
	//	nil -- the attempt to connect succeeded.
	//	ErrConnectStarted/ErrAlreadyConnecting -- the connect attempt started
	//		but hasn't completed yet. In this case, the caller must call Connect
	//		or GetSockOpt(ErrorOption) when the endpoint becomes writable to
	//		get the actual result. The first call to Connect after the socket has
	//		connected returns nil. Calling connect again results in ErrAlreadyConnected.
	//	Anything else -- the attempt to connect failed.
	//
	// If address.Addr is empty, this means that Endpoint has to be
	// disconnected if this is supported, otherwise
	// ErrAddressFamilyNotSupported must be returned.
	Connect(address FullAddress) *Error

	// Disconnect disconnects the endpoint from its peer.
	Disconnect() *Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags ShutdownFlags) *Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(backlog int) *Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// The returned Queue is the wait queue for the newly created endpoint.
	Accept() (Endpoint, *waiter.Queue, *Error)

	// Bind binds the endpoint to a specific local address and port.
	// Specifying a NIC is optional.
	Bind(address FullAddress) *Error

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (FullAddress, *Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (FullAddress, *Error)

	// Readiness returns the current readiness of the endpoint. For example,
	// if waiter.EventIn is set, the endpoint is immediately readable.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt sets a socket option. opt should be one of the *Option types.
	SetSockOpt(opt interface{}) *Error

	// SetSockOptInt sets a socket option, for simple cases where a value
	// has the int type.
	SetSockOptInt(opt SockOpt, v int) *Error

	// GetSockOpt gets a socket option. opt should be a pointer to one of the
	// *Option types.
	GetSockOpt(opt interface{}) *Error

	// GetSockOptInt gets a socket option for simple cases where a return
	// value has the int type.
	GetSockOptInt(SockOpt) (int, *Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// ModerateRecvBuf should be called every time data is copied to the user
	// space. This allows for dynamic tuning of recv buffer space for a
	// given socket.
	//
	// NOTE: This method is a no-op for sockets other than TCP.
	ModerateRecvBuf(copied int)

	// IPTables returns the iptables for this endpoint's stack.
	IPTables() (iptables.IPTables, error)
}
   433  
// WriteOptions contains options for Endpoint.Write.
type WriteOptions struct {
	// If To is not nil, write to the given address instead of the endpoint's
	// peer.
	To *FullAddress

	// More has the same semantics as Linux's MSG_MORE.
	More bool

	// EndOfRecord has the same semantics as Linux's MSG_EOR.
	EndOfRecord bool

	// Atomic means that all data fetched from Payloader must be written to the
	// endpoint. If Atomic is false, then data fetched from the Payloader may be
	// discarded if available endpoint buffer space is insufficient.
	Atomic bool
}
   451  
// SockOpt represents socket options whose values have the int type. Values
// of this type are passed to Endpoint.SetSockOptInt and
// Endpoint.GetSockOptInt.
type SockOpt int

const (
	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
	// number of unread bytes in the input buffer should be returned.
	ReceiveQueueSizeOption SockOpt = iota

	// SendBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
	// specify the send buffer size option.
	SendBufferSizeOption

	// ReceiveBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
	// specify the receive buffer size option.
	ReceiveBufferSizeOption

	// SendQueueSizeOption is used in GetSockOptInt to specify that the
	// number of unread bytes in the output buffer should be returned.
	SendQueueSizeOption

	// TODO(b/137664753): convert all int socket options to be handled via
	// GetSockOptInt.
)
   475  
// ErrorOption is used in GetSockOpt to specify that the last error reported by
// the endpoint should be cleared and returned.
type ErrorOption struct{}

// V6OnlyOption is used by SetSockOpt/GetSockOpt to specify whether an IPv6
// socket is to be restricted to sending and receiving IPv6 packets only.
type V6OnlyOption int

// DelayOption is used by SetSockOpt/GetSockOpt to specify if data should be
// sent out immediately by the transport protocol. For TCP, it determines if the
// Nagle algorithm is on or off.
type DelayOption int

// CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be
// held until segments are full by the TCP transport protocol.
type CorkOption int

// ReuseAddressOption is used by SetSockOpt/GetSockOpt to specify whether Bind()
// should allow reuse of local address.
type ReuseAddressOption int

// ReusePortOption is used by SetSockOpt/GetSockOpt to permit multiple sockets
// to be bound to an identical socket address.
type ReusePortOption int

// BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
// should bind only on a specific NIC.
type BindToDeviceOption string

// QuickAckOption is stubbed out in SetSockOpt/GetSockOpt.
type QuickAckOption int

// PasscredOption is used by SetSockOpt/GetSockOpt to specify whether
// SCM_CREDENTIALS socket control messages are enabled.
//
// Only supported on Unix sockets.
type PasscredOption int

// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
//
// TODO(b/64800844): Add and populate stat fields.
type TCPInfoOption struct {
	// NOTE(review): presumably the connection's round-trip time estimate
	// and its variance — confirm against the TCP endpoint that fills
	// these in.
	RTT    time.Duration
	RTTVar time.Duration
}
   521  
// KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether
// TCP keepalive is enabled for this socket.
type KeepaliveEnabledOption int

// KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
// connection must remain idle before the first TCP keepalive packet is sent.
// Once this time is reached, KeepaliveIntervalOption is used instead.
type KeepaliveIdleOption time.Duration

// KeepaliveIntervalOption is used by SetSockOpt/GetSockOpt to specify the
// interval between sending TCP keepalive packets.
type KeepaliveIntervalOption time.Duration

// KeepaliveCountOption is used by SetSockOpt/GetSockOpt to specify the number
// of un-ACKed TCP keepalives that will be sent before the connection is
// closed.
type KeepaliveCountOption int

// CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
// the current congestion control algorithm.
type CongestionControlOption string

// AvailableCongestionControlOption is used to query the supported congestion
// control algorithms.
type AvailableCongestionControlOption string

// ModerateReceiveBufferOption allows the caller to enable/disable TCP receive
// buffer moderation.
type ModerateReceiveBufferOption bool

// MaxSegOption is used by SetSockOpt/GetSockOpt to set/get the current
// Maximum Segment Size (MSS) value as specified using the TCP_MAXSEG option.
type MaxSegOption int

// TTLOption is used by SetSockOpt/GetSockOpt to control the default TTL/hop
// limit value for unicast messages. The default is protocol specific.
//
// A zero value indicates the default.
type TTLOption uint8

// MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
// TTL value for multicast messages. The default is 1.
type MulticastTTLOption uint8
   565  
// MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
// default interface for multicast.
type MulticastInterfaceOption struct {
	// NOTE(review): presumably the interface may be selected either by
	// NIC ID or by one of its addresses — confirm against the endpoint
	// implementations.
	NIC           NICID
	InterfaceAddr Address
}

// MulticastLoopOption is used by SetSockOpt/GetSockOpt to specify whether
// multicast packets sent over a non-loopback interface will be looped back.
type MulticastLoopOption bool

// MembershipOption is used by SetSockOpt/GetSockOpt as an argument to
// AddMembershipOption and RemoveMembershipOption.
type MembershipOption struct {
	// NIC is the ID of a NIC.
	// NOTE(review): presumably an alternative to InterfaceAddr for
	// selecting the interface — confirm.
	NIC NICID
	// InterfaceAddr is the interface address used to pick the interface
	// on which the group is joined or left.
	InterfaceAddr Address
	// MulticastAddr is the multicast address identifying the group.
	MulticastAddr Address
}

// AddMembershipOption is used by SetSockOpt/GetSockOpt to join a multicast
// group identified by the given multicast address, on the interface matching
// the given interface address.
type AddMembershipOption MembershipOption

// RemoveMembershipOption is used by SetSockOpt/GetSockOpt to leave a multicast
// group identified by the given multicast address, on the interface matching
// the given interface address.
type RemoveMembershipOption MembershipOption

// OutOfBandInlineOption is used by SetSockOpt/GetSockOpt to specify whether
// TCP out-of-band data is delivered along with the normal in-band data.
type OutOfBandInlineOption int

// BroadcastOption is used by SetSockOpt/GetSockOpt to specify whether
// datagram sockets are allowed to send packets to a broadcast address.
type BroadcastOption int

// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
// a default TTL.
type DefaultTTLOption uint8
   606  
   607  // Route is a row in the routing table. It specifies through which NIC (and
   608  // gateway) sets of packets should be routed. A row is considered viable if the
   609  // masked target address matches the destination address in the row.
   610  type Route struct {
   611  	// Destination must contain the target address for this row to be viable.
   612  	Destination Subnet
   613  
   614  	// Gateway is the gateway to be used if this row is viable.
   615  	Gateway Address
   616  
   617  	// NIC is the id of the nic to be used if this row is viable.
   618  	NIC NICID
   619  }
   620  
   621  // String implements the fmt.Stringer interface.
   622  func (r Route) String() string {
   623  	var out strings.Builder
   624  	fmt.Fprintf(&out, "%s", r.Destination)
   625  	if len(r.Gateway) > 0 {
   626  		fmt.Fprintf(&out, " via %s", r.Gateway)
   627  	}
   628  	fmt.Fprintf(&out, " nic %d", r.NIC)
   629  	return out.String()
   630  }
   631  
// TransportProtocolNumber is the number of a transport protocol. It is a
// distinct named type, so it cannot be mixed with NetworkProtocolNumber
// without an explicit conversion.
type TransportProtocolNumber uint32

// NetworkProtocolNumber is the number of a network protocol. It is a
// distinct named type, so it cannot be mixed with TransportProtocolNumber
// without an explicit conversion.
type NetworkProtocolNumber uint32
   637  
   638  // A StatCounter keeps track of a statistic.
   639  type StatCounter struct {
   640  	count uint64
   641  }
   642  
   643  // Increment adds one to the counter.
   644  func (s *StatCounter) Increment() {
   645  	s.IncrementBy(1)
   646  }
   647  
   648  // Value returns the current value of the counter.
   649  func (s *StatCounter) Value() uint64 {
   650  	return atomic.LoadUint64(&s.count)
   651  }
   652  
   653  // IncrementBy increments the counter by v.
   654  func (s *StatCounter) IncrementBy(v uint64) {
   655  	atomic.AddUint64(&s.count, v)
   656  }
   657  
   658  func (s *StatCounter) String() string {
   659  	return strconv.FormatUint(s.Value(), 10)
   660  }
   661  
// ICMPv4PacketStats enumerates counts for all ICMPv4 packet types.
//
// Every field is a *StatCounter, so the zero value of this struct holds nil
// counters. StatCounter's methods dereference their receiver, so the
// counters must be set before they are used.
type ICMPv4PacketStats struct {
	// Echo is the total number of ICMPv4 echo packets counted.
	Echo *StatCounter

	// EchoReply is the total number of ICMPv4 echo reply packets counted.
	EchoReply *StatCounter

	// DstUnreachable is the total number of ICMPv4 destination unreachable
	// packets counted.
	DstUnreachable *StatCounter

	// SrcQuench is the total number of ICMPv4 source quench packets
	// counted.
	SrcQuench *StatCounter

	// Redirect is the total number of ICMPv4 redirect packets counted.
	Redirect *StatCounter

	// TimeExceeded is the total number of ICMPv4 time exceeded packets
	// counted.
	TimeExceeded *StatCounter

	// ParamProblem is the total number of ICMPv4 parameter problem packets
	// counted.
	ParamProblem *StatCounter

	// Timestamp is the total number of ICMPv4 timestamp packets counted.
	Timestamp *StatCounter

	// TimestampReply is the total number of ICMPv4 timestamp reply packets
	// counted.
	TimestampReply *StatCounter

	// InfoRequest is the total number of ICMPv4 information request
	// packets counted.
	InfoRequest *StatCounter

	// InfoReply is the total number of ICMPv4 information reply packets
	// counted.
	InfoReply *StatCounter
}
   704  
// ICMPv6PacketStats enumerates counts for all ICMPv6 packet types.
//
// Every field is a *StatCounter, so the zero value of this struct holds nil
// counters. StatCounter's methods dereference their receiver, so the
// counters must be set before they are used.
type ICMPv6PacketStats struct {
	// EchoRequest is the total number of ICMPv6 echo request packets
	// counted.
	EchoRequest *StatCounter

	// EchoReply is the total number of ICMPv6 echo reply packets counted.
	EchoReply *StatCounter

	// DstUnreachable is the total number of ICMPv6 destination unreachable
	// packets counted.
	DstUnreachable *StatCounter

	// PacketTooBig is the total number of ICMPv6 packet too big packets
	// counted.
	PacketTooBig *StatCounter

	// TimeExceeded is the total number of ICMPv6 time exceeded packets
	// counted.
	TimeExceeded *StatCounter

	// ParamProblem is the total number of ICMPv6 parameter problem packets
	// counted.
	ParamProblem *StatCounter

	// RouterSolicit is the total number of ICMPv6 router solicit packets
	// counted.
	RouterSolicit *StatCounter

	// RouterAdvert is the total number of ICMPv6 router advert packets
	// counted.
	RouterAdvert *StatCounter

	// NeighborSolicit is the total number of ICMPv6 neighbor solicit
	// packets counted.
	NeighborSolicit *StatCounter

	// NeighborAdvert is the total number of ICMPv6 neighbor advert packets
	// counted.
	NeighborAdvert *StatCounter

	// RedirectMsg is the total number of ICMPv6 redirect message packets
	// counted.
	RedirectMsg *StatCounter
}
   750  
// ICMPv4SentPacketStats collects outbound ICMPv4-specific stats.
//
// ICMPv4PacketStats is embedded, so its counters are promoted into this
// struct alongside the send-only counters below.
type ICMPv4SentPacketStats struct {
	ICMPv4PacketStats

	// Dropped is the total number of ICMPv4 packets dropped due to link
	// layer errors.
	Dropped *StatCounter

	// RateLimited is the total number of ICMPv4 packets dropped due to
	// rate limit being exceeded.
	RateLimited *StatCounter
}
   763  
// ICMPv4ReceivedPacketStats collects inbound ICMPv4-specific stats.
//
// ICMPv4PacketStats is embedded, so its counters are promoted into this
// struct alongside the receive-only counter below.
type ICMPv4ReceivedPacketStats struct {
	ICMPv4PacketStats

	// Invalid is the total number of ICMPv4 packets received that the
	// transport layer could not parse.
	Invalid *StatCounter
}
   772  
// ICMPv6SentPacketStats collects outbound ICMPv6-specific stats.
//
// The embedded ICMPv6PacketStats supplies the per-message-type counters
// (echo request, echo reply, ...); here they count packets sent.
type ICMPv6SentPacketStats struct {
	ICMPv6PacketStats

	// Dropped is the total number of ICMPv6 packets dropped due to link
	// layer errors.
	Dropped *StatCounter

	// RateLimited is the total number of ICMPv6 packets dropped due to
	// rate limit being exceeded.
	RateLimited *StatCounter
}
   785  
// ICMPv6ReceivedPacketStats collects inbound ICMPv6-specific stats.
//
// The embedded ICMPv6PacketStats supplies the per-message-type counters
// (echo request, echo reply, ...); here they count packets received.
type ICMPv6ReceivedPacketStats struct {
	ICMPv6PacketStats

	// Invalid is the total number of ICMPv6 packets received that the
	// transport layer could not parse.
	Invalid *StatCounter
}
   794  
// ICMPStats collects ICMP-specific stats (both v4 and v6).
type ICMPStats struct {
	// V4PacketsSent contains counts of sent packets by ICMPv4 packet type,
	// plus counts of packets dropped due to link layer errors and due to
	// rate limiting.
	V4PacketsSent ICMPv4SentPacketStats

	// V4PacketsReceived contains counts of received packets by ICMPv4
	// packet type and a single count of invalid packets received.
	V4PacketsReceived ICMPv4ReceivedPacketStats

	// V6PacketsSent contains counts of sent packets by ICMPv6 packet type,
	// plus counts of packets dropped due to link layer errors and due to
	// rate limiting.
	V6PacketsSent ICMPv6SentPacketStats

	// V6PacketsReceived contains counts of received packets by ICMPv6
	// packet type and a single count of invalid packets received.
	V6PacketsReceived ICMPv6ReceivedPacketStats
}
   815  
// IPStats collects IP-specific stats (both v4 and v6).
//
// Every field is a counter pointer; when an IPStats is reached through
// Stats, counters left nil are populated by Stats.FillIn.
type IPStats struct {
	// PacketsReceived is the total number of IP packets received from the
	// link layer in nic.DeliverNetworkPacket.
	PacketsReceived *StatCounter

	// InvalidAddressesReceived is the total number of IP packets received
	// with an unknown or invalid destination address.
	InvalidAddressesReceived *StatCounter

	// PacketsDelivered is the total number of incoming IP packets that
	// are successfully delivered to the transport layer via HandlePacket.
	PacketsDelivered *StatCounter

	// PacketsSent is the total number of IP packets sent via WritePacket.
	PacketsSent *StatCounter

	// OutgoingPacketErrors is the total number of IP packets which failed
	// to write to a link-layer endpoint.
	OutgoingPacketErrors *StatCounter
}
   837  
// TCPStats collects TCP-specific stats.
//
// Every field is a counter pointer; when a TCPStats is reached through
// Stats, counters left nil are populated by Stats.FillIn.
type TCPStats struct {
	// ActiveConnectionOpenings is the number of connections opened
	// successfully via Connect.
	ActiveConnectionOpenings *StatCounter

	// PassiveConnectionOpenings is the number of connections opened
	// successfully via Listen.
	PassiveConnectionOpenings *StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop *StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop *StatCounter

	// ListenOverflowSynCookieSent is the number of times a SYN cookie was
	// sent.
	ListenOverflowSynCookieSent *StatCounter

	// ListenOverflowSynCookieRcvd is the number of times a valid SYN
	// cookie was received.
	ListenOverflowSynCookieRcvd *StatCounter

	// ListenOverflowInvalidSynCookieRcvd is the number of times an invalid SYN cookie
	// was received.
	ListenOverflowInvalidSynCookieRcvd *StatCounter

	// FailedConnectionAttempts is the number of calls to Connect or Listen
	// (active and passive openings, respectively) that end in an error.
	FailedConnectionAttempts *StatCounter

	// ValidSegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	ValidSegmentsReceived *StatCounter

	// InvalidSegmentsReceived is the number of TCP segments received that
	// the transport layer could not parse.
	InvalidSegmentsReceived *StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent *StatCounter

	// ResetsSent is the number of TCP resets sent.
	ResetsSent *StatCounter

	// ResetsReceived is the number of TCP resets received.
	ResetsReceived *StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits *StatCounter

	// FastRecovery is the number of times Fast Recovery was used to
	// recover from packet loss.
	FastRecovery *StatCounter

	// SACKRecovery is the number of times SACK Recovery was used to
	// recover from packet loss.
	SACKRecovery *StatCounter

	// SlowStartRetransmits is the number of segments retransmitted in slow
	// start.
	SlowStartRetransmits *StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit *StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts *StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors *StatCounter
}
   913  
// UDPStats collects UDP-specific stats.
//
// Every field is a counter pointer; when a UDPStats is reached through
// Stats, counters left nil are populated by Stats.FillIn.
type UDPStats struct {
	// PacketsReceived is the number of UDP datagrams received via
	// HandlePacket.
	PacketsReceived *StatCounter

	// UnknownPortErrors is the number of incoming UDP datagrams dropped
	// because they did not have a known destination port.
	UnknownPortErrors *StatCounter

	// ReceiveBufferErrors is the number of incoming UDP datagrams dropped
	// due to the receiving buffer being in an invalid state.
	ReceiveBufferErrors *StatCounter

	// MalformedPacketsReceived is the number of incoming UDP datagrams
	// dropped due to the UDP header being in a malformed state.
	MalformedPacketsReceived *StatCounter

	// PacketsSent is the number of UDP datagrams sent via sendUDP.
	PacketsSent *StatCounter
}
   935  
// Stats holds statistics about the networking stack.
//
// All fields are optional. Counters left nil can be populated with FillIn,
// which walks this struct and the nested ones by reflection; it panics on
// any field that is neither a *StatCounter nor a struct, so new fields must
// stick to those two shapes.
type Stats struct {
	// UnknownProtocolRcvdPackets is the number of packets received by the
	// stack that were for an unknown or unsupported protocol.
	UnknownProtocolRcvdPackets *StatCounter

	// MalformedRcvdPackets is the number of packets received by the stack
	// that were deemed malformed.
	MalformedRcvdPackets *StatCounter

	// DroppedPackets is the number of packets dropped due to full queues.
	DroppedPackets *StatCounter

	// ICMP breaks out ICMP-specific stats (both v4 and v6).
	ICMP ICMPStats

	// IP breaks out IP-specific stats (both v4 and v6).
	IP IPStats

	// TCP breaks out TCP-specific stats.
	TCP TCPStats

	// UDP breaks out UDP-specific stats.
	UDP UDPStats
}
   963  
   964  func fillIn(v reflect.Value) {
   965  	for i := 0; i < v.NumField(); i++ {
   966  		v := v.Field(i)
   967  		switch v.Kind() {
   968  		case reflect.Ptr:
   969  			if s := v.Addr().Interface().(**StatCounter); *s == nil {
   970  				*s = &StatCounter{}
   971  			}
   972  		case reflect.Struct:
   973  			fillIn(v)
   974  		default:
   975  			panic(fmt.Sprintf("unexpected type %s", v.Type()))
   976  		}
   977  	}
   978  }
   979  
   980  // FillIn returns a copy of s with nil fields initialized to new StatCounters.
   981  func (s Stats) FillIn() Stats {
   982  	fillIn(reflect.ValueOf(&s).Elem())
   983  	return s
   984  }
   985  
   986  // String implements the fmt.Stringer interface.
   987  func (a Address) String() string {
   988  	switch len(a) {
   989  	case 4:
   990  		return fmt.Sprintf("%d.%d.%d.%d", int(a[0]), int(a[1]), int(a[2]), int(a[3]))
   991  	case 16:
   992  		// Find the longest subsequence of hexadecimal zeros.
   993  		start, end := -1, -1
   994  		for i := 0; i < len(a); i += 2 {
   995  			j := i
   996  			for j < len(a) && a[j] == 0 && a[j+1] == 0 {
   997  				j += 2
   998  			}
   999  			if j > i+2 && j-i > end-start {
  1000  				start, end = i, j
  1001  			}
  1002  		}
  1003  
  1004  		var b strings.Builder
  1005  		for i := 0; i < len(a); i += 2 {
  1006  			if i == start {
  1007  				b.WriteString("::")
  1008  				i = end
  1009  				if end >= len(a) {
  1010  					break
  1011  				}
  1012  			} else if i > 0 {
  1013  				b.WriteByte(':')
  1014  			}
  1015  			v := uint16(a[i+0])<<8 | uint16(a[i+1])
  1016  			if v == 0 {
  1017  				b.WriteByte('0')
  1018  			} else {
  1019  				const digits = "0123456789abcdef"
  1020  				for i := uint(3); i < 4; i-- {
  1021  					if v := v >> (i * 4); v != 0 {
  1022  						b.WriteByte(digits[v&0xf])
  1023  					}
  1024  				}
  1025  			}
  1026  		}
  1027  		return b.String()
  1028  	default:
  1029  		return fmt.Sprintf("%x", []byte(a))
  1030  	}
  1031  }
  1032  
  1033  // To4 converts the IPv4 address to a 4-byte representation.
  1034  // If the address is not an IPv4 address, To4 returns "".
  1035  func (a Address) To4() Address {
  1036  	const (
  1037  		ipv4len = 4
  1038  		ipv6len = 16
  1039  	)
  1040  	if len(a) == ipv4len {
  1041  		return a
  1042  	}
  1043  	if len(a) == ipv6len &&
  1044  		isZeros(a[0:10]) &&
  1045  		a[10] == 0xff &&
  1046  		a[11] == 0xff {
  1047  		return a[12:16]
  1048  	}
  1049  	return ""
  1050  }
  1051  
  1052  // isZeros reports whether a is all zeros.
  1053  func isZeros(a Address) bool {
  1054  	for i := 0; i < len(a); i++ {
  1055  		if a[i] != 0 {
  1056  			return false
  1057  		}
  1058  	}
  1059  	return true
  1060  }
  1061  
  1062  // LinkAddress is a byte slice cast as a string that represents a link address.
  1063  // It is typically a 6-byte MAC address.
  1064  type LinkAddress string
  1065  
  1066  // String implements the fmt.Stringer interface.
  1067  func (a LinkAddress) String() string {
  1068  	switch len(a) {
  1069  	case 6:
  1070  		return fmt.Sprintf("%02x:%02x:%02x:%02x:%02x:%02x", a[0], a[1], a[2], a[3], a[4], a[5])
  1071  	default:
  1072  		return fmt.Sprintf("%x", []byte(a))
  1073  	}
  1074  }
  1075  
  1076  // ParseMACAddress parses an IEEE 802 address.
  1077  //
  1078  // It must be in the format aa:bb:cc:dd:ee:ff or aa-bb-cc-dd-ee-ff.
  1079  func ParseMACAddress(s string) (LinkAddress, error) {
  1080  	parts := strings.FieldsFunc(s, func(c rune) bool {
  1081  		return c == ':' || c == '-'
  1082  	})
  1083  	if len(parts) != 6 {
  1084  		return "", fmt.Errorf("inconsistent parts: %s", s)
  1085  	}
  1086  	addr := make([]byte, 0, len(parts))
  1087  	for _, part := range parts {
  1088  		u, err := strconv.ParseUint(part, 16, 8)
  1089  		if err != nil {
  1090  			return "", fmt.Errorf("invalid hex digits: %s", s)
  1091  		}
  1092  		addr = append(addr, byte(u))
  1093  	}
  1094  	return LinkAddress(addr), nil
  1095  }
  1096  
// AddressWithPrefix is an address with its subnet prefix length.
type AddressWithPrefix struct {
	// Address is a network address.
	Address Address

	// PrefixLen is the subnet prefix length, in bits. Subnet treats values
	// <= 0 as an all-zero mask and values >= 8*len(Address) as an all-ones
	// mask.
	PrefixLen int
}
  1105  
  1106  // String implements the fmt.Stringer interface.
  1107  func (a AddressWithPrefix) String() string {
  1108  	return fmt.Sprintf("%s/%d", a.Address, a.PrefixLen)
  1109  }
  1110  
  1111  // Subnet converts the address and prefix into a Subnet value and returns it.
  1112  func (a AddressWithPrefix) Subnet() Subnet {
  1113  	addrLen := len(a.Address)
  1114  	if a.PrefixLen <= 0 {
  1115  		return Subnet{
  1116  			address: Address(strings.Repeat("\x00", addrLen)),
  1117  			mask:    AddressMask(strings.Repeat("\x00", addrLen)),
  1118  		}
  1119  	}
  1120  	if a.PrefixLen >= addrLen*8 {
  1121  		return Subnet{
  1122  			address: a.Address,
  1123  			mask:    AddressMask(strings.Repeat("\xff", addrLen)),
  1124  		}
  1125  	}
  1126  
  1127  	sa := make([]byte, addrLen)
  1128  	sm := make([]byte, addrLen)
  1129  	n := uint(a.PrefixLen)
  1130  	for i := 0; i < addrLen; i++ {
  1131  		if n >= 8 {
  1132  			sa[i] = a.Address[i]
  1133  			sm[i] = 0xff
  1134  			n -= 8
  1135  			continue
  1136  		}
  1137  		sm[i] = ^byte(0xff >> n)
  1138  		sa[i] = a.Address[i] & sm[i]
  1139  		n = 0
  1140  	}
  1141  
  1142  	// For extra caution, call NewSubnet rather than directly creating the Subnet
  1143  	// value. If that fails it indicates a serious bug in this code, so panic is
  1144  	// in order.
  1145  	s, err := NewSubnet(Address(sa), AddressMask(sm))
  1146  	if err != nil {
  1147  		panic("invalid subnet: " + err.Error())
  1148  	}
  1149  	return s
  1150  }
  1151  
// ProtocolAddress is an address and the network protocol it is associated
// with.
type ProtocolAddress struct {
	// Protocol is the number of the network protocol the address belongs
	// to.
	Protocol NetworkProtocolNumber

	// AddressWithPrefix is a network address with its subnet prefix length.
	AddressWithPrefix AddressWithPrefix
}
  1161  
var (
	// danglingEndpointsMu protects access to danglingEndpoints.
	danglingEndpointsMu sync.Mutex

	// danglingEndpoints tracks all dangling endpoints no longer owned by the app.
	// It is used as a set: only the keys carry information. Entries are
	// added by AddDanglingEndpoint, removed by DeleteDanglingEndpoint and
	// listed by GetDanglingEndpoints.
	danglingEndpoints = make(map[Endpoint]struct{})
)
  1169  
  1170  // GetDanglingEndpoints returns all dangling endpoints.
  1171  func GetDanglingEndpoints() []Endpoint {
  1172  	es := make([]Endpoint, 0, len(danglingEndpoints))
  1173  	danglingEndpointsMu.Lock()
  1174  	for e := range danglingEndpoints {
  1175  		es = append(es, e)
  1176  	}
  1177  	danglingEndpointsMu.Unlock()
  1178  	return es
  1179  }
  1180  
  1181  // AddDanglingEndpoint adds a dangling endpoint.
  1182  func AddDanglingEndpoint(e Endpoint) {
  1183  	danglingEndpointsMu.Lock()
  1184  	danglingEndpoints[e] = struct{}{}
  1185  	danglingEndpointsMu.Unlock()
  1186  }
  1187  
  1188  // DeleteDanglingEndpoint removes a dangling endpoint.
  1189  func DeleteDanglingEndpoint(e Endpoint) {
  1190  	danglingEndpointsMu.Lock()
  1191  	delete(danglingEndpoints, e)
  1192  	danglingEndpointsMu.Unlock()
  1193  }
  1194  
// AsyncLoading is the global barrier for asynchronous endpoint loading
// activities.
//
// NOTE(review): presumably loaders call Add/Done around their work and the
// code that needs loading to be finished calls Wait; none of the users are
// visible in this part of the file, so confirm against the callers.
var AsyncLoading sync.WaitGroup