gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/network/ipv4/ipv4.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package ipv4 contains the implementation of the ipv4 network protocol.
    16  package ipv4
    17  
    18  import (
    19  	"fmt"
    20  	"math"
    21  	"reflect"
    22  	"time"
    23  
    24  	"gvisor.dev/gvisor/pkg/atomicbitops"
    25  	"gvisor.dev/gvisor/pkg/buffer"
    26  	"gvisor.dev/gvisor/pkg/sync"
    27  	"gvisor.dev/gvisor/pkg/tcpip"
    28  	"gvisor.dev/gvisor/pkg/tcpip/header"
    29  	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
    30  	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
    31  	"gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation"
    32  	"gvisor.dev/gvisor/pkg/tcpip/network/internal/ip"
    33  	"gvisor.dev/gvisor/pkg/tcpip/network/internal/multicast"
    34  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    35  )
    36  
const (
	// ReassembleTimeout is the time a packet stays in the reassembly
	// system before being evicted.
	// As per RFC 791 section 3.2:
	//   The current recommendation for the initial timer setting is 15 seconds.
	//   This may be changed as experience with this protocol accumulates.
	//
	// Considering that it is an old recommendation, we use the same reassembly
	// timeout that linux defines, which is 30 seconds:
	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138
	ReassembleTimeout = 30 * time.Second

	// ProtocolNumber is the ipv4 protocol number.
	ProtocolNumber = header.IPv4ProtocolNumber

	// MaxTotalSize is the maximum size that can be encoded in the 16-bit
	// TotalLength field of the ipv4 header.
	MaxTotalSize = 0xffff

	// DefaultTTL is the default time-to-live value for this endpoint.
	DefaultTTL = 64

	// buckets is the number of identifier buckets.
	buckets = 2048

	// fragmentblockSize is the size of a fragment block, in bytes, as per
	// RFC 791 section 3.1, page 14.
	fragmentblockSize = 8
)
    66  
// Values stored in endpoint.forwarding and endpoint.multicastForwarding.
const (
	// forwardingDisabled marks forwarding as turned off.
	forwardingDisabled = 0
	// forwardingEnabled marks forwarding as turned on.
	forwardingEnabled  = 1
)
    71  
// ipv4BroadcastAddr is header.IPv4Broadcast together with its prefix. It is
// added as a permanent address when an endpoint is enabled (enableLocked) and
// removed again when the endpoint is disabled (disableLocked).
var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix()
    73  
// Compile-time checks that *endpoint satisfies every interface it is expected
// to implement.
var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil)
var _ stack.ForwardingNetworkEndpoint = (*endpoint)(nil)
var _ stack.MulticastForwardingNetworkEndpoint = (*endpoint)(nil)
var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
var _ stack.AddressableEndpoint = (*endpoint)(nil)
var _ stack.NetworkEndpoint = (*endpoint)(nil)
var _ IGMPEndpoint = (*endpoint)(nil)
    81  
// endpoint is the IPv4 network endpoint that protocol.NewEndpoint creates for
// a NIC.
type endpoint struct {
	// nic, dispatcher and protocol are populated by NewEndpoint. stats holds
	// the endpoint's IP, ICMP and IGMP counters, which NewEndpoint also
	// initializes.
	nic        stack.NetworkInterface
	dispatcher stack.TransportDispatcher
	protocol   *protocol
	stats      sharedStats

	// enabled is set to 1 when the endpoint is enabled and 0 when it is
	// disabled.
	enabled atomicbitops.Uint32

	// forwarding is set to forwardingEnabled when the endpoint has forwarding
	// enabled and forwardingDisabled when it is disabled.
	forwarding atomicbitops.Uint32

	// multicastForwarding is set to forwardingEnabled when the endpoint has
	// multicast forwarding enabled and forwardingDisabled when it is disabled.
	//
	// TODO(https://gvisor.dev/issue/7338): Implement support for multicast
	// forwarding. Currently, setting this value to true is a no-op.
	multicastForwarding atomicbitops.Uint32

	// mu protects below.
	mu sync.RWMutex

	// addressableEndpointState tracks the addresses assigned to this endpoint,
	// including the broadcast address added while the endpoint is enabled.
	//
	// +checklocks:mu
	addressableEndpointState stack.AddressableEndpointState

	// igmp is the endpoint's IGMP state.
	//
	// +checklocks:mu
	igmp igmpState
}
   112  
   113  // SetIGMPVersion implements IGMPEndpoint.
   114  func (e *endpoint) SetIGMPVersion(v IGMPVersion) IGMPVersion {
   115  	e.mu.Lock()
   116  	defer e.mu.Unlock()
   117  	return e.setIGMPVersionLocked(v)
   118  }
   119  
   120  // GetIGMPVersion implements IGMPEndpoint.
   121  func (e *endpoint) GetIGMPVersion() IGMPVersion {
   122  	e.mu.RLock()
   123  	defer e.mu.RUnlock()
   124  	return e.getIGMPVersionLocked()
   125  }
   126  
   127  // +checklocks:e.mu
   128  // +checklocksalias:e.igmp.ep.mu=e.mu
   129  func (e *endpoint) setIGMPVersionLocked(v IGMPVersion) IGMPVersion {
   130  	return e.igmp.setVersion(v)
   131  }
   132  
   133  // +checklocksread:e.mu
   134  // +checklocksalias:e.igmp.ep.mu=e.mu
   135  func (e *endpoint) getIGMPVersionLocked() IGMPVersion {
   136  	return e.igmp.getVersion()
   137  }
   138  
   139  // HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint.
   140  func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) {
   141  	// If we are operating as a router, return an ICMP error to the original
   142  	// packet's sender.
   143  	if pkt.NetworkPacketInfo.IsForwardedPacket {
   144  		// TODO(gvisor.dev/issue/6005): Propagate asynchronously generated ICMP
   145  		// errors to local endpoints.
   146  		e.protocol.returnError(&icmpReasonHostUnreachable{}, pkt, false /* deliveredLocally */)
   147  		e.stats.ip.Forwarding.Errors.Increment()
   148  		e.stats.ip.Forwarding.HostUnreachable.Increment()
   149  		return
   150  	}
   151  	// handleControl expects the entire offending packet to be in the packet
   152  	// buffer's data field.
   153  	pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{
   154  		Payload: pkt.ToBuffer(),
   155  	})
   156  	defer pkt.DecRef()
   157  	pkt.NICID = e.nic.ID()
   158  	pkt.NetworkProtocolNumber = ProtocolNumber
   159  	// Use the same control type as an ICMPv4 destination host unreachable error
   160  	// since the host is considered unreachable if we cannot resolve the link
   161  	// address to the next hop.
   162  	e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt)
   163  }
   164  
   165  // NewEndpoint creates a new ipv4 endpoint.
   166  func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
   167  	e := &endpoint{
   168  		nic:        nic,
   169  		dispatcher: dispatcher,
   170  		protocol:   p,
   171  	}
   172  	e.mu.Lock()
   173  	e.addressableEndpointState.Init(e, stack.AddressableEndpointStateOptions{HiddenWhileDisabled: false})
   174  	e.igmp.init(e)
   175  	e.mu.Unlock()
   176  
   177  	tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem())
   178  
   179  	stackStats := p.stack.Stats()
   180  	e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP)
   181  	e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V4)
   182  	e.stats.igmp.init(&e.stats.localStats.IGMP, &stackStats.IGMP)
   183  
   184  	p.mu.Lock()
   185  	p.eps[nic.ID()] = e
   186  	p.mu.Unlock()
   187  
   188  	return e
   189  }
   190  
   191  func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint {
   192  	p.mu.RLock()
   193  	defer p.mu.RUnlock()
   194  
   195  	for _, e := range p.eps {
   196  		if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil {
   197  			return e
   198  		}
   199  	}
   200  
   201  	return nil
   202  }
   203  
   204  func (p *protocol) getEndpointForNIC(id tcpip.NICID) (*endpoint, bool) {
   205  	p.mu.RLock()
   206  	defer p.mu.RUnlock()
   207  	ep, ok := p.eps[id]
   208  	return ep, ok
   209  }
   210  
   211  func (p *protocol) forgetEndpoint(nicID tcpip.NICID) {
   212  	p.mu.Lock()
   213  	defer p.mu.Unlock()
   214  	delete(p.eps, nicID)
   215  }
   216  
   217  // Forwarding implements stack.ForwardingNetworkEndpoint.
   218  func (e *endpoint) Forwarding() bool {
   219  	return e.forwarding.Load() == forwardingEnabled
   220  }
   221  
   222  // setForwarding sets the forwarding status for the endpoint.
   223  //
   224  // Returns the previous forwarding status.
   225  func (e *endpoint) setForwarding(v bool) bool {
   226  	forwarding := uint32(forwardingDisabled)
   227  	if v {
   228  		forwarding = forwardingEnabled
   229  	}
   230  
   231  	return e.forwarding.Swap(forwarding) != forwardingDisabled
   232  }
   233  
   234  // SetForwarding implements stack.ForwardingNetworkEndpoint.
   235  func (e *endpoint) SetForwarding(forwarding bool) bool {
   236  	e.mu.Lock()
   237  	defer e.mu.Unlock()
   238  
   239  	prevForwarding := e.setForwarding(forwarding)
   240  	if prevForwarding == forwarding {
   241  		return prevForwarding
   242  	}
   243  
   244  	if forwarding {
   245  		// There does not seem to be an RFC requirement for a node to join the all
   246  		// routers multicast address but
   247  		// https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml
   248  		// specifies the address as a group for all routers on a subnet so we join
   249  		// the group here.
   250  		if err := e.joinGroupLocked(header.IPv4AllRoutersGroup); err != nil {
   251  			// joinGroupLocked only returns an error if the group address is not a
   252  			// valid IPv4 multicast address.
   253  			panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err))
   254  		}
   255  
   256  		return prevForwarding
   257  	}
   258  
   259  	switch err := e.leaveGroupLocked(header.IPv4AllRoutersGroup).(type) {
   260  	case nil:
   261  	case *tcpip.ErrBadLocalAddress:
   262  		// The endpoint may have already left the multicast group.
   263  	default:
   264  		panic(fmt.Sprintf("e.leaveGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err))
   265  	}
   266  
   267  	return prevForwarding
   268  }
   269  
   270  // MulticastForwarding implements stack.MulticastForwardingNetworkEndpoint.
   271  func (e *endpoint) MulticastForwarding() bool {
   272  	return e.multicastForwarding.Load() == forwardingEnabled
   273  }
   274  
   275  // SetMulticastForwarding implements stack.MulticastForwardingNetworkEndpoint.
   276  func (e *endpoint) SetMulticastForwarding(forwarding bool) bool {
   277  	updatedForwarding := uint32(forwardingDisabled)
   278  	if forwarding {
   279  		updatedForwarding = forwardingEnabled
   280  	}
   281  
   282  	return e.multicastForwarding.Swap(updatedForwarding) != forwardingDisabled
   283  }
   284  
   285  // Enable implements stack.NetworkEndpoint.
   286  func (e *endpoint) Enable() tcpip.Error {
   287  	e.mu.Lock()
   288  	defer e.mu.Unlock()
   289  	return e.enableLocked()
   290  }
   291  
   292  // +checklocks:e.mu
   293  // +checklocksalias:e.igmp.ep.mu=e.mu
   294  func (e *endpoint) enableLocked() tcpip.Error {
   295  	// If the NIC is not enabled, the endpoint can't do anything meaningful so
   296  	// don't enable the endpoint.
   297  	if !e.nic.Enabled() {
   298  		return &tcpip.ErrNotPermitted{}
   299  	}
   300  
   301  	// If the endpoint is already enabled, there is nothing for it to do.
   302  	if !e.setEnabled(true) {
   303  		return nil
   304  	}
   305  
   306  	// Must be called after Enabled has already been set.
   307  	e.addressableEndpointState.OnNetworkEndpointEnabledChanged()
   308  
   309  	// Create an endpoint to receive broadcast packets on this interface.
   310  	ep, err := e.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.AddressProperties{PEB: stack.NeverPrimaryEndpoint})
   311  	if err != nil {
   312  		return err
   313  	}
   314  	// We have no need for the address endpoint.
   315  	ep.DecRef()
   316  
   317  	// Groups may have been joined while the endpoint was disabled, or the
   318  	// endpoint may have left groups from the perspective of IGMP when the
   319  	// endpoint was disabled. Either way, we need to let routers know to
   320  	// send us multicast traffic.
   321  	e.igmp.initializeAll()
   322  
   323  	// As per RFC 1122 section 3.3.7, all hosts should join the all-hosts
   324  	// multicast group. Note, the IANA calls the all-hosts multicast group the
   325  	// all-systems multicast group.
   326  	if err := e.joinGroupLocked(header.IPv4AllSystems); err != nil {
   327  		// joinGroupLocked only returns an error if the group address is not a valid
   328  		// IPv4 multicast address.
   329  		panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, err))
   330  	}
   331  
   332  	return nil
   333  }
   334  
   335  // Enabled implements stack.NetworkEndpoint.
   336  func (e *endpoint) Enabled() bool {
   337  	return e.nic.Enabled() && e.isEnabled()
   338  }
   339  
   340  // isEnabled returns true if the endpoint is enabled, regardless of the
   341  // enabled status of the NIC.
   342  func (e *endpoint) isEnabled() bool {
   343  	return e.enabled.Load() == 1
   344  }
   345  
   346  // setEnabled sets the enabled status for the endpoint.
   347  //
   348  // Returns true if the enabled status was updated.
   349  func (e *endpoint) setEnabled(v bool) bool {
   350  	if v {
   351  		return e.enabled.Swap(1) == 0
   352  	}
   353  	return e.enabled.Swap(0) == 1
   354  }
   355  
// Disable implements stack.NetworkEndpoint.
//
// It runs disableLocked with the endpoint's write lock held. The unlock is
// deferred so the lock is released even if disableLocked panics.
func (e *endpoint) Disable() {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.disableLocked()
}
   362  
   363  // +checklocks:e.mu
   364  // +checklocksalias:e.igmp.ep.mu=e.mu
   365  func (e *endpoint) disableLocked() {
   366  	if !e.isEnabled() {
   367  		return
   368  	}
   369  
   370  	// The endpoint may have already left the multicast group.
   371  	switch err := e.leaveGroupLocked(header.IPv4AllSystems).(type) {
   372  	case nil, *tcpip.ErrBadLocalAddress:
   373  	default:
   374  		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err))
   375  	}
   376  
   377  	// Leave groups from the perspective of IGMP so that routers know that
   378  	// we are no longer interested in the group.
   379  	e.igmp.softLeaveAll()
   380  
   381  	// The address may have already been removed.
   382  	switch err := e.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err.(type) {
   383  	case nil, *tcpip.ErrBadLocalAddress:
   384  	default:
   385  		panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err))
   386  	}
   387  
   388  	// Reset the IGMP V1 present flag.
   389  	//
   390  	// If the node comes back up on the same network, it will re-learn that it
   391  	// needs to perform IGMPv1.
   392  	e.igmp.resetV1Present()
   393  
   394  	if !e.setEnabled(false) {
   395  		panic("should have only done work to disable the endpoint if it was enabled")
   396  	}
   397  
   398  	// Must be called after Enabled has been set.
   399  	e.addressableEndpointState.OnNetworkEndpointEnabledChanged()
   400  }
   401  
   402  // emitMulticastEvent emits a multicast forwarding event using the provided
   403  // generator if a valid event dispatcher exists.
   404  func (e *endpoint) emitMulticastEvent(eventGenerator func(stack.MulticastForwardingEventDispatcher)) {
   405  	e.protocol.mu.RLock()
   406  	defer e.protocol.mu.RUnlock()
   407  
   408  	if mcastDisp := e.protocol.multicastForwardingDisp; mcastDisp != nil {
   409  		eventGenerator(mcastDisp)
   410  	}
   411  }
   412  
   413  // DefaultTTL is the default time-to-live value for this endpoint.
   414  func (e *endpoint) DefaultTTL() uint8 {
   415  	return e.protocol.DefaultTTL()
   416  }
   417  
   418  // MTU implements stack.NetworkEndpoint. It returns the link-layer MTU minus the
   419  // network layer max header length.
   420  func (e *endpoint) MTU() uint32 {
   421  	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv4MinimumSize)
   422  	if err != nil {
   423  		return 0
   424  	}
   425  	return networkMTU
   426  }
   427  
   428  // MaxHeaderLength returns the maximum length needed by ipv4 headers (and
   429  // underlying protocols).
   430  func (e *endpoint) MaxHeaderLength() uint16 {
   431  	return e.nic.MaxHeaderLength() + header.IPv4MaximumHeaderSize
   432  }
   433  
   434  // NetworkProtocolNumber implements stack.NetworkEndpoint.
   435  func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
   436  	return e.protocol.Number()
   437  }
   438  
   439  // getID returns a random uint16 number (other than zero) to be used as ID in
   440  // the IPv4 header.
   441  func (e *endpoint) getID() uint16 {
   442  	rng := e.protocol.stack.SecureRNG()
   443  	id := rng.Uint16()
   444  	for id == 0 {
   445  		id = rng.Uint16()
   446  	}
   447  	return id
   448  }
   449  
   450  func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) tcpip.Error {
   451  	hdrLen := header.IPv4MinimumSize
   452  	var optLen int
   453  	if options != nil {
   454  		optLen = int(options.Length())
   455  	}
   456  	hdrLen += optLen
   457  	if hdrLen > header.IPv4MaximumHeaderSize {
   458  		return &tcpip.ErrMessageTooLong{}
   459  	}
   460  	ipH := header.IPv4(pkt.NetworkHeader().Push(hdrLen))
   461  	length := pkt.Size()
   462  	if length > math.MaxUint16 {
   463  		return &tcpip.ErrMessageTooLong{}
   464  	}
   465  
   466  	fields := header.IPv4Fields{
   467  		TotalLength: uint16(length),
   468  		TTL:         params.TTL,
   469  		TOS:         params.TOS,
   470  		Protocol:    uint8(params.Protocol),
   471  		SrcAddr:     srcAddr,
   472  		DstAddr:     dstAddr,
   473  		Options:     options,
   474  	}
   475  	if params.DF {
   476  		// Treat want and do the same.
   477  		fields.Flags = header.IPv4FlagDontFragment
   478  	} else {
   479  		// RFC 6864 section 4.3 mandates uniqueness of ID values for
   480  		// non-atomic datagrams.
   481  		fields.ID = e.getID()
   482  	}
   483  	ipH.Encode(&fields)
   484  
   485  	ipH.SetChecksum(^ipH.CalculateChecksum())
   486  	pkt.NetworkProtocolNumber = ProtocolNumber
   487  	return nil
   488  }
   489  
   490  // handleFragments fragments pkt and calls the handler function on each
   491  // fragment. It returns the number of fragments handled and the number of
   492  // fragments left to be processed. The IP header must already be present in the
   493  // original packet.
   494  func (e *endpoint) handleFragments(_ *stack.Route, networkMTU uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) {
   495  	// Round the MTU down to align to 8 bytes.
   496  	fragmentPayloadSize := networkMTU &^ 7
   497  	networkHeader := header.IPv4(pkt.NetworkHeader().Slice())
   498  	pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadSize, pkt.AvailableHeaderBytes()+len(networkHeader))
   499  	defer pf.Release()
   500  
   501  	var n int
   502  	for {
   503  		fragPkt, more := buildNextFragment(&pf, networkHeader)
   504  		err := handler(fragPkt)
   505  		fragPkt.DecRef()
   506  		if err != nil {
   507  			return n, pf.RemainingFragmentCount() + 1, err
   508  		}
   509  		n++
   510  		if !more {
   511  			return n, pf.RemainingFragmentCount(), nil
   512  		}
   513  	}
   514  }
   515  
   516  // WritePacket writes a packet to the given destination address and protocol.
   517  func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error {
   518  	if err := e.addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* options */); err != nil {
   519  		return err
   520  	}
   521  
   522  	return e.writePacket(r, pkt)
   523  }
   524  
   525  func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error {
   526  	netHeader := header.IPv4(pkt.NetworkHeader().Slice())
   527  	dstAddr := netHeader.DestinationAddress()
   528  
   529  	// iptables filtering. All packets that reach here are locally
   530  	// generated.
   531  	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
   532  	if ok := e.protocol.stack.IPTables().CheckOutput(pkt, r, outNicName); !ok {
   533  		// iptables is telling us to drop the packet.
   534  		e.stats.ip.IPTablesOutputDropped.Increment()
   535  		return nil
   536  	}
   537  
   538  	// If the packet is manipulated as per DNAT Output rules, handle packet
   539  	// based on destination address and do not send the packet to link
   540  	// layer.
   541  	//
   542  	// We should do this for every packet, rather than only DNATted packets, but
   543  	// removing this check short circuits broadcasts before they are sent out to
   544  	// other hosts.
   545  	if newDstAddr := netHeader.DestinationAddress(); dstAddr != newDstAddr {
   546  		if ep := e.protocol.findEndpointWithAddress(newDstAddr); ep != nil {
   547  			// Since we rewrote the packet but it is being routed back to us, we
   548  			// can safely assume the checksum is valid.
   549  			ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
   550  			return nil
   551  		}
   552  	}
   553  
   554  	return e.writePacketPostRouting(r, pkt, false /* headerIncluded */)
   555  }
   556  
   557  func (e *endpoint) writePacketPostRouting(r *stack.Route, pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error {
   558  	if r.Loop()&stack.PacketLoop != 0 {
   559  		// If the packet was generated by the stack (not a raw/packet endpoint
   560  		// where a packet may be written with the header included), then we can
   561  		// safely assume the checksum is valid.
   562  		e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */)
   563  	}
   564  	if r.Loop()&stack.PacketOut == 0 {
   565  		return nil
   566  	}
   567  
   568  	// Postrouting NAT can only change the source address, and does not alter the
   569  	// route or outgoing interface of the packet.
   570  	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
   571  	if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, e, outNicName); !ok {
   572  		// iptables is telling us to drop the packet.
   573  		e.stats.ip.IPTablesPostroutingDropped.Increment()
   574  		return nil
   575  	}
   576  
   577  	stats := e.stats.ip
   578  
   579  	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(len(pkt.NetworkHeader().Slice())))
   580  	if err != nil {
   581  		stats.OutgoingPacketErrors.Increment()
   582  		return err
   583  	}
   584  
   585  	if packetMustBeFragmented(pkt, networkMTU) {
   586  		h := header.IPv4(pkt.NetworkHeader().Slice())
   587  		if h.Flags()&header.IPv4FlagDontFragment != 0 && pkt.NetworkPacketInfo.IsForwardedPacket {
   588  			// TODO(gvisor.dev/issue/5919): Handle error condition in which DontFragment
   589  			// is set but the packet must be fragmented for the non-forwarding case.
   590  			return &tcpip.ErrMessageTooLong{}
   591  		}
   592  		sent, remain, err := e.handleFragments(r, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error {
   593  			// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
   594  			// fragment one by one using WritePacket() (current strategy) or if we
   595  			// want to create a PacketBufferList from the fragments and feed it to
   596  			// WritePackets(). It'll be faster but cost more memory.
   597  			return e.nic.WritePacket(r, fragPkt)
   598  		})
   599  		stats.PacketsSent.IncrementBy(uint64(sent))
   600  		stats.OutgoingPacketErrors.IncrementBy(uint64(remain))
   601  		return err
   602  	}
   603  
   604  	if err := e.nic.WritePacket(r, pkt); err != nil {
   605  		stats.OutgoingPacketErrors.Increment()
   606  		return err
   607  	}
   608  	stats.PacketsSent.Increment()
   609  	return nil
   610  }
   611  
   612  // WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
   613  func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error {
   614  	// The packet already has an IP header, but there are a few required
   615  	// checks.
   616  	h, ok := pkt.Data().PullUp(header.IPv4MinimumSize)
   617  	if !ok {
   618  		return &tcpip.ErrMalformedHeader{}
   619  	}
   620  
   621  	hdrLen := header.IPv4(h).HeaderLength()
   622  	if hdrLen < header.IPv4MinimumSize {
   623  		return &tcpip.ErrMalformedHeader{}
   624  	}
   625  
   626  	h, ok = pkt.Data().PullUp(int(hdrLen))
   627  	if !ok {
   628  		return &tcpip.ErrMalformedHeader{}
   629  	}
   630  	ipH := header.IPv4(h)
   631  
   632  	// Always set the total length.
   633  	pktSize := pkt.Data().Size()
   634  	ipH.SetTotalLength(uint16(pktSize))
   635  
   636  	// Set the source address when zero.
   637  	if ipH.SourceAddress() == header.IPv4Any {
   638  		ipH.SetSourceAddress(r.LocalAddress())
   639  	}
   640  
   641  	// Set the packet ID when zero.
   642  	if ipH.ID() == 0 {
   643  		// RFC 6864 section 4.3 mandates uniqueness of ID values for
   644  		// non-atomic datagrams, so assign an ID to all such datagrams
   645  		// according to the definition given in RFC 6864 section 4.
   646  		if ipH.Flags()&header.IPv4FlagDontFragment == 0 || ipH.Flags()&header.IPv4FlagMoreFragments != 0 || ipH.FragmentOffset() > 0 {
   647  			ipH.SetID(e.getID())
   648  		}
   649  	}
   650  
   651  	// Always set the checksum.
   652  	ipH.SetChecksum(0)
   653  	ipH.SetChecksum(^ipH.CalculateChecksum())
   654  
   655  	// Populate the packet buffer's network header and don't allow an invalid
   656  	// packet to be sent.
   657  	//
   658  	// Note that parsing only makes sure that the packet is well formed as per the
   659  	// wire format. We also want to check if the header's fields are valid before
   660  	// sending the packet.
   661  	if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().Slice()).IsValid(pktSize) {
   662  		return &tcpip.ErrMalformedHeader{}
   663  	}
   664  
   665  	return e.writePacketPostRouting(r, pkt, true /* headerIncluded */)
   666  }
   667  
   668  // forwardPacketWithRoute emits the pkt using the provided route.
   669  //
   670  // If updateOptions is true, then the IP options will be updated in the copied
   671  // pkt using the outgoing endpoint. Otherwise, the caller is responsible for
   672  // updating the options.
   673  //
   674  // This method should be invoked by the endpoint that received the pkt.
   675  func (e *endpoint) forwardPacketWithRoute(route *stack.Route, pkt *stack.PacketBuffer, updateOptions bool) ip.ForwardingError {
   676  	h := header.IPv4(pkt.NetworkHeader().Slice())
   677  	stk := e.protocol.stack
   678  
   679  	inNicName := stk.FindNICNameFromID(e.nic.ID())
   680  	outNicName := stk.FindNICNameFromID(route.NICID())
   681  	if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok {
   682  		// iptables is telling us to drop the packet.
   683  		e.stats.ip.IPTablesForwardDropped.Increment()
   684  		return nil
   685  	}
   686  
   687  	// We need to do a deep copy of the IP packet because
   688  	// WriteHeaderIncludedPacket may modify the packet buffer, but we do
   689  	// not own it.
   690  	//
   691  	// TODO(https://gvisor.dev/issue/7473): For multicast, only create one deep
   692  	// copy and then clone.
   693  	newPkt := pkt.DeepCopyForForwarding(int(route.MaxHeaderLength()))
   694  	newHdr := header.IPv4(newPkt.NetworkHeader().Slice())
   695  	defer newPkt.DecRef()
   696  
   697  	forwardToEp, ok := e.protocol.getEndpointForNIC(route.NICID())
   698  	if !ok {
   699  		return &ip.ErrUnknownOutputEndpoint{}
   700  	}
   701  
   702  	if updateOptions {
   703  		if err := forwardToEp.updateOptionsForForwarding(newPkt); err != nil {
   704  			return err
   705  		}
   706  	}
   707  
   708  	ttl := h.TTL()
   709  	// As per RFC 791 page 30, Time to Live,
   710  	//
   711  	//   This field must be decreased at each point that the internet header
   712  	//   is processed to reflect the time spent processing the datagram.
   713  	//   Even if no local information is available on the time actually
   714  	//   spent, the field must be decremented by 1.
   715  	newHdr.SetTTL(ttl - 1)
   716  	// We perform a full checksum as we may have updated options above. The IP
   717  	// header is relatively small so this is not expected to be an expensive
   718  	// operation.
   719  	newHdr.SetChecksum(0)
   720  	newHdr.SetChecksum(^newHdr.CalculateChecksum())
   721  
   722  	switch err := forwardToEp.writePacketPostRouting(route, newPkt, true /* headerIncluded */); err.(type) {
   723  	case nil:
   724  		return nil
   725  	case *tcpip.ErrMessageTooLong:
   726  		// As per RFC 792, page 4, Destination Unreachable:
   727  		//
   728  		//   Another case is when a datagram must be fragmented to be forwarded by a
   729  		//   gateway yet the Don't Fragment flag is on. In this case the gateway must
   730  		//   discard the datagram and may return a destination unreachable message.
   731  		//
   732  		// WriteHeaderIncludedPacket checks for the presence of the Don't Fragment bit
   733  		// while sending the packet and returns this error iff fragmentation is
   734  		// necessary and the bit is also set.
   735  		_ = e.protocol.returnError(&icmpReasonFragmentationNeeded{}, pkt, false /* deliveredLocally */)
   736  		return &ip.ErrMessageTooLong{}
   737  	case *tcpip.ErrNoBufferSpace:
   738  		return &ip.ErrOutgoingDeviceNoBufferSpace{}
   739  	default:
   740  		return &ip.ErrOther{Err: err}
   741  	}
   742  }
   743  
   744  // forwardUnicastPacket attempts to forward a packet to its final destination.
   745  func (e *endpoint) forwardUnicastPacket(pkt *stack.PacketBuffer) ip.ForwardingError {
   746  	hView := pkt.NetworkHeader().View()
   747  	defer hView.Release()
   748  	h := header.IPv4(hView.AsSlice())
   749  
   750  	dstAddr := h.DestinationAddress()
   751  
   752  	if err := validateAddressesForForwarding(h); err != nil {
   753  		return err
   754  	}
   755  
   756  	ttl := h.TTL()
   757  	if ttl == 0 {
   758  		// As per RFC 792 page 6, Time Exceeded Message,
   759  		//
   760  		//  If the gateway processing a datagram finds the time to live field
   761  		//  is zero it must discard the datagram.  The gateway may also notify
   762  		//  the source host via the time exceeded message.
   763  		//
   764  		// We return the original error rather than the result of returning
   765  		// the ICMP packet because the original error is more relevant to
   766  		// the caller.
   767  		_ = e.protocol.returnError(&icmpReasonTTLExceeded{}, pkt, false /* deliveredLocally */)
   768  		return &ip.ErrTTLExceeded{}
   769  	}
   770  
   771  	if err := e.updateOptionsForForwarding(pkt); err != nil {
   772  		return err
   773  	}
   774  
   775  	stk := e.protocol.stack
   776  
   777  	// Check if the destination is owned by the stack.
   778  	if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil {
   779  		inNicName := stk.FindNICNameFromID(e.nic.ID())
   780  		outNicName := stk.FindNICNameFromID(ep.nic.ID())
   781  		if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok {
   782  			// iptables is telling us to drop the packet.
   783  			e.stats.ip.IPTablesForwardDropped.Increment()
   784  			return nil
   785  		}
   786  
   787  		// The packet originally arrived on e so provide its NIC as the input NIC.
   788  		ep.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */)
   789  		return nil
   790  	}
   791  
   792  	r, err := stk.FindRoute(0, tcpip.Address{}, dstAddr, ProtocolNumber, false /* multicastLoop */)
   793  	switch err.(type) {
   794  	case nil:
   795  	// TODO(https://gvisor.dev/issues/8105): We should not observe ErrHostUnreachable from route
   796  	// lookups.
   797  	case *tcpip.ErrHostUnreachable, *tcpip.ErrNetworkUnreachable:
   798  		// We return the original error rather than the result of returning
   799  		// the ICMP packet because the original error is more relevant to
   800  		// the caller.
   801  		_ = e.protocol.returnError(&icmpReasonNetworkUnreachable{}, pkt, false /* deliveredLocally */)
   802  		return &ip.ErrHostUnreachable{}
   803  	default:
   804  		return &ip.ErrOther{Err: err}
   805  	}
   806  	defer r.Release()
   807  
   808  	// TODO(https://gvisor.dev/issue/7472): Unicast IP options should be updated
   809  	// using the output endpoint (instead of the input endpoint). In particular,
   810  	// RFC 1812 section 5.2.1 states the following:
   811  	//
   812  	//	 Processing of certain IP options requires that the router insert its IP
   813  	//	 address into the option. As noted in Section [5.2.4], the address
   814  	//	 inserted MUST be the address of the logical interface on which the
   815  	//	 packet is sent or the router's router-id if the packet is sent over an
   816  	//	 unnumbered interface. Thus, processing of these options cannot be
   817  	//	 completed until after the output interface is chosen.
   818  	return e.forwardPacketWithRoute(r, pkt, false /* updateOptions */)
   819  }
   820  
   821  // HandlePacket is called by the link layer when new ipv4 packets arrive for
   822  // this endpoint.
   823  func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) {
   824  	stats := e.stats.ip
   825  
   826  	stats.PacketsReceived.Increment()
   827  
   828  	if !e.isEnabled() {
   829  		stats.DisabledPacketsReceived.Increment()
   830  		return
   831  	}
   832  
   833  	hView, ok := e.protocol.parseAndValidate(pkt)
   834  	if !ok {
   835  		stats.MalformedPacketsReceived.Increment()
   836  		return
   837  	}
   838  	h := header.IPv4(hView.AsSlice())
   839  	defer hView.Release()
   840  
   841  	if !e.nic.IsLoopback() {
   842  		if !e.protocol.options.AllowExternalLoopbackTraffic {
   843  			if header.IsV4LoopbackAddress(h.SourceAddress()) {
   844  				stats.InvalidSourceAddressesReceived.Increment()
   845  				return
   846  			}
   847  
   848  			if header.IsV4LoopbackAddress(h.DestinationAddress()) {
   849  				stats.InvalidDestinationAddressesReceived.Increment()
   850  				return
   851  			}
   852  		}
   853  
   854  		if e.protocol.stack.HandleLocal() {
   855  			addressEndpoint := e.AcquireAssignedAddress(header.IPv4(pkt.NetworkHeader().Slice()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint, true /* readOnly */)
   856  			if addressEndpoint != nil {
   857  				// The source address is one of our own, so we never should have gotten
   858  				// a packet like this unless HandleLocal is false or our NIC is the
   859  				// loopback interface.
   860  				stats.InvalidSourceAddressesReceived.Increment()
   861  				return
   862  			}
   863  		}
   864  
   865  		// Loopback traffic skips the prerouting chain.
   866  		inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
   867  		if ok := e.protocol.stack.IPTables().CheckPrerouting(pkt, e, inNicName); !ok {
   868  			// iptables is telling us to drop the packet.
   869  			stats.IPTablesPreroutingDropped.Increment()
   870  			return
   871  		}
   872  	}
   873  
   874  	e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */)
   875  }
   876  
   877  // handleLocalPacket is like HandlePacket except it does not perform the
   878  // prerouting iptables hook or check for loopback traffic that originated from
   879  // outside of the netstack (i.e. martian loopback packets).
   880  func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) {
   881  	stats := e.stats.ip
   882  	stats.PacketsReceived.Increment()
   883  
   884  	pkt = pkt.CloneToInbound()
   885  	defer pkt.DecRef()
   886  	pkt.RXChecksumValidated = canSkipRXChecksum
   887  
   888  	hView, ok := e.protocol.parseAndValidate(pkt)
   889  	if !ok {
   890  		stats.MalformedPacketsReceived.Increment()
   891  		return
   892  	}
   893  	h := header.IPv4(hView.AsSlice())
   894  	defer hView.Release()
   895  
   896  	e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */)
   897  }
   898  
   899  func validateAddressesForForwarding(h header.IPv4) ip.ForwardingError {
   900  	srcAddr := h.SourceAddress()
   901  
   902  	// As per RFC 5735 section 3,
   903  	//
   904  	//   0.0.0.0/8 - Addresses in this block refer to source hosts on "this"
   905  	//   network.  Address 0.0.0.0/32 may be used as a source address for this
   906  	//   host on this network; other addresses within 0.0.0.0/8 may be used to
   907  	//   refer to specified hosts on this network ([RFC1122], Section 3.2.1.3).
   908  	//
   909  	// And RFC 6890 section 2.2.2,
   910  	//
   911  	//                +----------------------+----------------------------+
   912  	//                | Attribute            | Value                      |
   913  	//                +----------------------+----------------------------+
   914  	//                | Address Block        | 0.0.0.0/8                  |
   915  	//                | Name                 | "This host on this network"|
   916  	//                | RFC                  | [RFC1122], Section 3.2.1.3 |
   917  	//                | Allocation Date      | September 1981             |
   918  	//                | Termination Date     | N/A                        |
   919  	//                | Source               | True                       |
   920  	//                | Destination          | False                      |
   921  	//                | Forwardable          | False                      |
   922  	//                | Global               | False                      |
   923  	//                | Reserved-by-Protocol | True                       |
   924  	//                +----------------------+----------------------------+
   925  	if header.IPv4CurrentNetworkSubnet.Contains(srcAddr) {
   926  		return &ip.ErrInitializingSourceAddress{}
   927  	}
   928  
   929  	// As per RFC 3927 section 7,
   930  	//
   931  	//   A router MUST NOT forward a packet with an IPv4 Link-Local source or
   932  	//   destination address, irrespective of the router's default route
   933  	//   configuration or routes obtained from dynamic routing protocols.
   934  	//
   935  	//   A router which receives a packet with an IPv4 Link-Local source or
   936  	//   destination address MUST NOT forward the packet.  This prevents
   937  	//   forwarding of packets back onto the network segment from which they
   938  	//   originated, or to any other segment.
   939  	if header.IsV4LinkLocalUnicastAddress(srcAddr) {
   940  		return &ip.ErrLinkLocalSourceAddress{}
   941  	}
   942  	if dstAddr := h.DestinationAddress(); header.IsV4LinkLocalUnicastAddress(dstAddr) || header.IsV4LinkLocalMulticastAddress(dstAddr) {
   943  		return &ip.ErrLinkLocalDestinationAddress{}
   944  	}
   945  	return nil
   946  }
   947  
   948  // forwardMulticastPacket validates a multicast pkt and attempts to forward it.
   949  //
   950  // This method should be invoked for incoming multicast packets using the
   951  // endpoint that received the packet.
   952  func (e *endpoint) forwardMulticastPacket(h header.IPv4, pkt *stack.PacketBuffer) ip.ForwardingError {
   953  	if err := validateAddressesForForwarding(h); err != nil {
   954  		return err
   955  	}
   956  
   957  	if opts := h.Options(); len(opts) != 0 {
   958  		// Check if the options are valid, but don't mutate them. This corresponds
   959  		// to step 3 of RFC 1812 section 5.2.1.1.
   960  		if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil {
   961  			// Per RFC 1812 section 4.3.2.7, an ICMP error message should not be
   962  			// sent for:
   963  			//
   964  			//	 A packet destined to an IP broadcast or IP multicast address.
   965  			//
   966  			// Note that protocol.returnError also enforces this requirement.
   967  			// However, we intentionally omit it here since this path is multicast
   968  			// only.
   969  			return &ip.ErrParameterProblem{}
   970  		}
   971  	}
   972  
   973  	routeKey := stack.UnicastSourceAndMulticastDestination{
   974  		Source:      h.SourceAddress(),
   975  		Destination: h.DestinationAddress(),
   976  	}
   977  
   978  	// The pkt has been validated. Consequently, if a route is not found, then
   979  	// the pkt can safely be queued.
   980  	result, hasBufferSpace := e.protocol.multicastRouteTable.GetRouteOrInsertPending(routeKey, pkt)
   981  
   982  	if !hasBufferSpace {
   983  		// Unable to queue the pkt. Silently drop it.
   984  		return &ip.ErrNoMulticastPendingQueueBufferSpace{}
   985  	}
   986  
   987  	switch result.GetRouteResultState {
   988  	case multicast.InstalledRouteFound:
   989  		// Attempt to forward the pkt using an existing route.
   990  		return e.forwardValidatedMulticastPacket(pkt, result.InstalledRoute)
   991  	case multicast.NoRouteFoundAndPendingInserted:
   992  		e.emitMulticastEvent(func(disp stack.MulticastForwardingEventDispatcher) {
   993  			disp.OnMissingRoute(stack.MulticastPacketContext{
   994  				stack.UnicastSourceAndMulticastDestination{h.SourceAddress(), h.DestinationAddress()},
   995  				e.nic.ID(),
   996  			})
   997  		})
   998  	case multicast.PacketQueuedInPendingRoute:
   999  	default:
  1000  		panic(fmt.Sprintf("unexpected GetRouteResultState: %s", result.GetRouteResultState))
  1001  	}
  1002  	return &ip.ErrHostUnreachable{}
  1003  }
  1004  
  1005  func (e *endpoint) updateOptionsForForwarding(pkt *stack.PacketBuffer) ip.ForwardingError {
  1006  	h := header.IPv4(pkt.NetworkHeader().Slice())
  1007  	if opts := h.Options(); len(opts) != 0 {
  1008  		newOpts, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageForward{})
  1009  		if optProblem != nil {
  1010  			if optProblem.NeedICMP {
  1011  				// Note that this will not emit an ICMP error if the destination is
  1012  				// multicast.
  1013  				_ = e.protocol.returnError(&icmpReasonParamProblem{
  1014  					pointer: optProblem.Pointer,
  1015  				}, pkt, false /* deliveredLocally */)
  1016  			}
  1017  			return &ip.ErrParameterProblem{}
  1018  		}
  1019  		copied := copy(opts, newOpts)
  1020  		if copied != len(newOpts) {
  1021  			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
  1022  		}
  1023  		// Since in forwarding we handle all options, including copying those we
  1024  		// do not recognise, the options region should remain the same size which
  1025  		// simplifies processing. As we MAY receive a packet with a lot of padded
  1026  		// bytes after the "end of options list" byte, make sure we copy
  1027  		// them as the legal padding value (0).
  1028  		for i := copied; i < len(opts); i++ {
  1029  			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
  1030  			opts[i] = byte(header.IPv4OptionListEndType)
  1031  		}
  1032  	}
  1033  	return nil
  1034  }
  1035  
  1036  // forwardValidatedMulticastPacket attempts to forward the pkt using the
  1037  // provided installedRoute.
  1038  //
  1039  // This method should be invoked by the endpoint that received the pkt.
  1040  func (e *endpoint) forwardValidatedMulticastPacket(pkt *stack.PacketBuffer, installedRoute *multicast.InstalledRoute) ip.ForwardingError {
  1041  	// Per RFC 1812 section 5.2.1.3,
  1042  	//
  1043  	//	 Based on the IP source and destination addresses found in the datagram
  1044  	//	 header, the router determines whether the datagram has been received
  1045  	//	 on the proper interface for forwarding.  If not, the datagram is
  1046  	//	 dropped silently.
  1047  	if e.nic.ID() != installedRoute.ExpectedInputInterface {
  1048  		h := header.IPv4(pkt.NetworkHeader().Slice())
  1049  		e.emitMulticastEvent(func(disp stack.MulticastForwardingEventDispatcher) {
  1050  			disp.OnUnexpectedInputInterface(stack.MulticastPacketContext{
  1051  				stack.UnicastSourceAndMulticastDestination{h.SourceAddress(), h.DestinationAddress()},
  1052  				e.nic.ID(),
  1053  			}, installedRoute.ExpectedInputInterface)
  1054  		})
  1055  		return &ip.ErrUnexpectedMulticastInputInterface{}
  1056  	}
  1057  
  1058  	for _, outgoingInterface := range installedRoute.OutgoingInterfaces {
  1059  		if err := e.forwardMulticastPacketForOutgoingInterface(pkt, outgoingInterface); err != nil {
  1060  			e.handleForwardingError(err)
  1061  			continue
  1062  		}
  1063  		// The pkt was successfully forwarded. Mark the route as used.
  1064  		installedRoute.SetLastUsedTimestamp(e.protocol.stack.Clock().NowMonotonic())
  1065  	}
  1066  	return nil
  1067  }
  1068  
  1069  // forwardMulticastPacketForOutgoingInterface attempts to forward the pkt out
  1070  // of the provided outgoingInterface.
  1071  //
  1072  // This method should be invoked by the endpoint that received the pkt.
  1073  func (e *endpoint) forwardMulticastPacketForOutgoingInterface(pkt *stack.PacketBuffer, outgoingInterface stack.MulticastRouteOutgoingInterface) ip.ForwardingError {
  1074  	h := header.IPv4(pkt.NetworkHeader().Slice())
  1075  
  1076  	// Per RFC 1812 section 5.2.1.3,
  1077  	//
  1078  	//	 A copy of the multicast datagram is forwarded out each outgoing
  1079  	//	 interface whose minimum TTL value is less than or equal to the TTL
  1080  	//	 value in the datagram header.
  1081  	//
  1082  	// Copying of the packet is deferred to forwardPacketWithRoute since unicast
  1083  	// and multicast both require a copy.
  1084  	if outgoingInterface.MinTTL > h.TTL() {
  1085  		return &ip.ErrTTLExceeded{}
  1086  	}
  1087  
  1088  	route := e.protocol.stack.NewRouteForMulticast(outgoingInterface.ID, h.DestinationAddress(), e.NetworkProtocolNumber())
  1089  
  1090  	if route == nil {
  1091  		// Failed to convert to a stack.Route. This likely means that the outgoing
  1092  		// endpoint no longer exists.
  1093  		return &ip.ErrHostUnreachable{}
  1094  	}
  1095  	defer route.Release()
  1096  
  1097  	return e.forwardPacketWithRoute(route, pkt, true /* updateOptions */)
  1098  }
  1099  
  1100  func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer, inNICName string) {
  1101  	pkt.NICID = e.nic.ID()
  1102  
  1103  	// Raw socket packets are delivered based solely on the transport protocol
  1104  	// number. We only require that the packet be valid IPv4, and that they not
  1105  	// be fragmented.
  1106  	if !h.More() && h.FragmentOffset() == 0 {
  1107  		e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt)
  1108  	}
  1109  
  1110  	stats := e.stats
  1111  	stats.ip.ValidPacketsReceived.Increment()
  1112  
  1113  	srcAddr := h.SourceAddress()
  1114  	dstAddr := h.DestinationAddress()
  1115  
  1116  	// As per RFC 1122 section 3.2.1.3:
  1117  	//   When a host sends any datagram, the IP source address MUST
  1118  	//   be one of its own IP addresses (but not a broadcast or
  1119  	//   multicast address).
  1120  	if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) {
  1121  		stats.ip.InvalidSourceAddressesReceived.Increment()
  1122  		return
  1123  	}
  1124  	// Make sure the source address is not a subnet-local broadcast address.
  1125  	if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil {
  1126  		subnet := addressEndpoint.Subnet()
  1127  		if subnet.IsBroadcast(srcAddr) {
  1128  			stats.ip.InvalidSourceAddressesReceived.Increment()
  1129  			return
  1130  		}
  1131  	}
  1132  
  1133  	if header.IsV4MulticastAddress(dstAddr) {
  1134  		// Handle all packets destined to a multicast address separately. Unlike
  1135  		// unicast, these packets can be both delivered locally and forwarded. See
  1136  		// RFC 1812 section 5.2.3 for details regarding the forwarding/local
  1137  		// delivery decision.
  1138  
  1139  		multicastForwarding := e.MulticastForwarding() && e.protocol.multicastForwarding()
  1140  
  1141  		if multicastForwarding {
  1142  			e.handleForwardingError(e.forwardMulticastPacket(h, pkt))
  1143  		}
  1144  
  1145  		if e.IsInGroup(dstAddr) {
  1146  			e.deliverPacketLocally(h, pkt, inNICName)
  1147  			return
  1148  		}
  1149  
  1150  		if !multicastForwarding {
  1151  			// Only consider the destination address invalid if we didn't attempt to
  1152  			// forward the pkt and it was not delivered locally.
  1153  			stats.ip.InvalidDestinationAddressesReceived.Increment()
  1154  		}
  1155  		return
  1156  	}
  1157  
  1158  	// Before we do any processing, check if the packet was received as some
  1159  	// sort of broadcast.
  1160  	//
  1161  	// If the packet is destined for this device, then it should be delivered
  1162  	// locally. Otherwise, if forwarding is enabled, it should be forwarded.
  1163  	if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint, true /* readOnly */); addressEndpoint != nil {
  1164  		subnet := addressEndpoint.AddressWithPrefix().Subnet()
  1165  		pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast
  1166  		e.deliverPacketLocally(h, pkt, inNICName)
  1167  	} else if e.Forwarding() {
  1168  		e.handleForwardingError(e.forwardUnicastPacket(pkt))
  1169  	} else {
  1170  		stats.ip.InvalidDestinationAddressesReceived.Increment()
  1171  	}
  1172  }
  1173  
  1174  // handleForwardingError processes the provided err and increments any relevant
  1175  // counters.
  1176  func (e *endpoint) handleForwardingError(err ip.ForwardingError) {
  1177  	stats := e.stats.ip
  1178  	switch err := err.(type) {
  1179  	case nil:
  1180  		return
  1181  	case *ip.ErrInitializingSourceAddress:
  1182  		stats.Forwarding.InitializingSource.Increment()
  1183  	case *ip.ErrLinkLocalSourceAddress:
  1184  		stats.Forwarding.LinkLocalSource.Increment()
  1185  	case *ip.ErrLinkLocalDestinationAddress:
  1186  		stats.Forwarding.LinkLocalDestination.Increment()
  1187  	case *ip.ErrTTLExceeded:
  1188  		stats.Forwarding.ExhaustedTTL.Increment()
  1189  	case *ip.ErrHostUnreachable:
  1190  		stats.Forwarding.Unrouteable.Increment()
  1191  	case *ip.ErrParameterProblem:
  1192  		stats.MalformedPacketsReceived.Increment()
  1193  	case *ip.ErrMessageTooLong:
  1194  		stats.Forwarding.PacketTooBig.Increment()
  1195  	case *ip.ErrNoMulticastPendingQueueBufferSpace:
  1196  		stats.Forwarding.NoMulticastPendingQueueBufferSpace.Increment()
  1197  	case *ip.ErrUnexpectedMulticastInputInterface:
  1198  		stats.Forwarding.UnexpectedMulticastInputInterface.Increment()
  1199  	case *ip.ErrUnknownOutputEndpoint:
  1200  		stats.Forwarding.UnknownOutputEndpoint.Increment()
  1201  	case *ip.ErrOutgoingDeviceNoBufferSpace:
  1202  		stats.Forwarding.OutgoingDeviceNoBufferSpace.Increment()
  1203  	default:
  1204  		panic(fmt.Sprintf("unrecognized forwarding error: %s", err))
  1205  	}
  1206  	stats.Forwarding.Errors.Increment()
  1207  }
  1208  
// deliverPacketLocally delivers a validated packet to this stack.
//
// In order, it runs the iptables input hook, reassembles the packet if it is
// a fragment, processes the IP options, and then hands the packet to the ICMP
// handler, the IGMP handler, or the transport dispatcher depending on the
// transport protocol in h. Packets dropped along the way are accounted for in
// the endpoint's IP stats.
//
// inNICName is the NIC name passed to the iptables input hook.
func (e *endpoint) deliverPacketLocally(h header.IPv4, pkt *stack.PacketBuffer, inNICName string) {
	stats := e.stats
	// iptables filtering. All packets that reach here are intended for
	// this machine and will not be forwarded.
	if ok := e.protocol.stack.IPTables().CheckInput(pkt, inNICName); !ok {
		// iptables is telling us to drop the packet.
		stats.ip.IPTablesInputDropped.Increment()
		return
	}

	// A packet with the More Fragments flag set or a non-zero fragment offset
	// is a fragment and must be reassembled before it can be delivered.
	if h.More() || h.FragmentOffset() != 0 {
		if pkt.Data().Size()+len(pkt.TransportHeader().Slice()) == 0 {
			// Drop the packet as it's marked as a fragment but has
			// no payload.
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}
		if opts := h.Options(); len(opts) != 0 {
			// If there are options we need to check them before we do assembly
			// or we could be assembling errant packets. However we do not change the
			// options as that could lead to double processing later.
			if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil {
				// The malformed-packet counter is only incremented when the
				// problem also warrants an ICMP error.
				if optProblem.NeedICMP {
					_ = e.protocol.returnError(&icmpReasonParamProblem{
						pointer: optProblem.Pointer,
					}, pkt, true /* deliveredLocally */)
					e.stats.ip.MalformedPacketsReceived.Increment()
				}
				return
			}
		}
		// The packet is a fragment, let's try to reassemble it.
		start := h.FragmentOffset()
		// Drop the fragment if the size of the reassembled payload would exceed the
		// maximum payload size.
		//
		// Note that this addition doesn't overflow even on 32bit architecture
		// because pkt.Data().Size() should not exceed 65535 (the max IP datagram
		// size). Otherwise the packet would've been rejected as invalid before
		// reaching here.
		if int(start)+pkt.Data().Size() > header.IPv4MaximumPayloadSize {
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}

		proto := h.Protocol()
		// The fragment covers the inclusive byte range
		// [start, start+size-1] of the reassembled payload.
		resPkt, transProtoNum, ready, err := e.protocol.fragmentation.Process(
			// As per RFC 791 section 2.3, the identification value is unique
			// for a source-destination pair and protocol.
			fragmentation.FragmentID{
				Source:      h.SourceAddress(),
				Destination: h.DestinationAddress(),
				ID:          uint32(h.ID()),
				Protocol:    proto,
			},
			start,
			start+uint16(pkt.Data().Size())-1,
			h.More(),
			proto,
			pkt,
		)
		if err != nil {
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}
		// Reassembly has not produced a packet yet (presumably more fragments
		// are still outstanding), so there is nothing to deliver.
		if !ready {
			return
		}
		// From here on, work with the reassembled packet and its header.
		defer resPkt.DecRef()
		pkt = resPkt
		h = header.IPv4(pkt.NetworkHeader().Slice())

		// The reassembler doesn't take care of fixing up the header, so we need
		// to do it here.
		h.SetTotalLength(uint16(pkt.Data().Size() + len(h)))
		h.SetFlagsFragmentOffset(0, 0)

		e.protocol.parseTransport(pkt, tcpip.TransportProtocolNumber(transProtoNum))

		// Now that the packet is reassembled, it can be sent to raw sockets.
		e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt)
	}
	stats.ip.PacketsDelivered.Increment()

	p := h.TransportProtocol()
	if p == header.ICMPv4ProtocolNumber {
		// TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport
		// headers, the setting of the transport number here should be
		// unnecessary and removed.
		pkt.TransportProtocolNumber = p
		e.handleICMP(pkt)
		return
	}
	// ICMP handles options itself but do it here for all remaining destinations.
	var hasRouterAlertOption bool
	if opts := h.Options(); len(opts) != 0 {
		newOpts, processedOpts, optProblem := e.processIPOptions(pkt, opts, &optionUsageReceive{})
		if optProblem != nil {
			if optProblem.NeedICMP {
				_ = e.protocol.returnError(&icmpReasonParamProblem{
					pointer: optProblem.Pointer,
				}, pkt, true /* deliveredLocally */)
				stats.ip.MalformedPacketsReceived.Increment()
			}
			return
		}
		hasRouterAlertOption = processedOpts.routerAlert
		// Write the processed options back over the originals in place; the
		// options region keeps its size and any remainder is padded below.
		copied := copy(opts, newOpts)
		if copied != len(newOpts) {
			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
		}
		for i := copied; i < len(opts); i++ {
			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
			opts[i] = byte(header.IPv4OptionListEndType)
		}
	}
	if p == header.IGMPProtocolNumber {
		// IGMP handling runs with the endpoint's lock held.
		e.mu.Lock()
		e.igmp.handleIGMP(pkt, hasRouterAlertOption) // +checklocksforce: e == e.igmp.ep.
		e.mu.Unlock()
		return
	}

	// Everything else goes to the transport dispatcher; the result tells us
	// whether an ICMP destination unreachable error should be sent back.
	switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res {
	case stack.TransportPacketHandled:
	case stack.TransportPacketDestinationPortUnreachable:
		// As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination
		//   Unreachable messages with code:
		//     3 (Port Unreachable), when the designated transport protocol
		//     (e.g., UDP) is unable to demultiplex the datagram but has no
		//     protocol mechanism to inform the sender.
		_ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt, true /* deliveredLocally */)
	case stack.TransportPacketProtocolUnreachable:
		// As per RFC: 1122 Section 3.2.2.1
		//   A host SHOULD generate Destination Unreachable messages with code:
		//     2 (Protocol Unreachable), when the designated transport protocol
		//     is not supported
		_ = e.protocol.returnError(&icmpReasonProtoUnreachable{}, pkt, true /* deliveredLocally */)
	default:
		panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
	}
}
  1354  
  1355  // Close cleans up resources associated with the endpoint.
  1356  func (e *endpoint) Close() {
  1357  	e.mu.Lock()
  1358  	e.disableLocked()
  1359  	e.addressableEndpointState.Cleanup()
  1360  	e.mu.Unlock()
  1361  
  1362  	e.protocol.forgetEndpoint(e.nic.ID())
  1363  }
  1364  
  1365  // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
  1366  func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) {
  1367  	e.mu.Lock()
  1368  	defer e.mu.Unlock()
  1369  
  1370  	ep, err := e.addressableEndpointState.AddAndAcquireAddress(addr, properties, stack.Permanent)
  1371  	if err == nil {
  1372  		e.sendQueuedReports()
  1373  	}
  1374  	return ep, err
  1375  }
  1376  
// sendQueuedReports sends queued igmp reports by delegating to the endpoint's
// IGMP state. The caller must hold e.mu, as required by the annotations
// below.
//
// +checklocks:e.mu
// +checklocksalias:e.igmp.ep.mu=e.mu
func (e *endpoint) sendQueuedReports() {
	e.igmp.sendQueuedReports()
}
  1384  
  1385  // RemovePermanentAddress implements stack.AddressableEndpoint.
  1386  func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error {
  1387  	e.mu.RLock()
  1388  	defer e.mu.RUnlock()
  1389  	return e.addressableEndpointState.RemovePermanentAddress(addr)
  1390  }
  1391  
  1392  // SetDeprecated implements stack.AddressableEndpoint.
  1393  func (e *endpoint) SetDeprecated(addr tcpip.Address, deprecated bool) tcpip.Error {
  1394  	e.mu.RLock()
  1395  	defer e.mu.RUnlock()
  1396  	return e.addressableEndpointState.SetDeprecated(addr, deprecated)
  1397  }
  1398  
  1399  // SetLifetimes implements stack.AddressableEndpoint.
  1400  func (e *endpoint) SetLifetimes(addr tcpip.Address, lifetimes stack.AddressLifetimes) tcpip.Error {
  1401  	e.mu.RLock()
  1402  	defer e.mu.RUnlock()
  1403  	return e.addressableEndpointState.SetLifetimes(addr, lifetimes)
  1404  }
  1405  
  1406  // MainAddress implements stack.AddressableEndpoint.
  1407  func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
  1408  	e.mu.RLock()
  1409  	defer e.mu.RUnlock()
  1410  	return e.addressableEndpointState.MainAddress()
  1411  }
  1412  
  1413  // AcquireAssignedAddress implements stack.AddressableEndpoint.
  1414  func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior, readOnly bool) stack.AddressEndpoint {
  1415  	e.mu.RLock()
  1416  	defer e.mu.RUnlock()
  1417  
  1418  	loopback := e.nic.IsLoopback()
  1419  	return e.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, func(addressEndpoint stack.AddressEndpoint) bool {
  1420  		subnet := addressEndpoint.Subnet()
  1421  		// IPv4 has a notion of a subnet broadcast address and considers the
  1422  		// loopback interface bound to an address's whole subnet (on linux).
  1423  		return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr))
  1424  	}, allowTemp, tempPEB, readOnly)
  1425  }
  1426  
  1427  // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint.
  1428  func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr, srcHint tcpip.Address, allowExpired bool) stack.AddressEndpoint {
  1429  	e.mu.RLock()
  1430  	defer e.mu.RUnlock()
  1431  	return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, srcHint, allowExpired)
  1432  }
  1433  
// acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress
// but with locking requirements: the caller must hold e.mu at least for
// reading, as required by the annotation below. The lookup itself is
// delegated to the addressable endpoint state.
//
// +checklocksread:e.mu
func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr, srcHint tcpip.Address, allowExpired bool) stack.AddressEndpoint {
	return e.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, srcHint, allowExpired)
}
  1441  
  1442  // PrimaryAddresses implements stack.AddressableEndpoint.
  1443  func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix {
  1444  	e.mu.RLock()
  1445  	defer e.mu.RUnlock()
  1446  	return e.addressableEndpointState.PrimaryAddresses()
  1447  }
  1448  
  1449  // PermanentAddresses implements stack.AddressableEndpoint.
  1450  func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
  1451  	e.mu.RLock()
  1452  	defer e.mu.RUnlock()
  1453  	return e.addressableEndpointState.PermanentAddresses()
  1454  }
  1455  
  1456  // JoinGroup implements stack.GroupAddressableEndpoint.
  1457  func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error {
  1458  	e.mu.Lock()
  1459  	defer e.mu.Unlock()
  1460  	return e.joinGroupLocked(addr)
  1461  }
  1462  
  1463  // joinGroupLocked is like JoinGroup but with locking requirements.
  1464  //
  1465  // +checklocks:e.mu
  1466  // +checklocksalias:e.igmp.ep.mu=e.mu
  1467  func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error {
  1468  	if !header.IsV4MulticastAddress(addr) {
  1469  		return &tcpip.ErrBadAddress{}
  1470  	}
  1471  
  1472  	e.igmp.joinGroup(addr)
  1473  	return nil
  1474  }
  1475  
  1476  // LeaveGroup implements stack.GroupAddressableEndpoint.
  1477  func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error {
  1478  	e.mu.Lock()
  1479  	defer e.mu.Unlock()
  1480  	return e.leaveGroupLocked(addr)
  1481  }
  1482  
  1483  // leaveGroupLocked is like LeaveGroup but with locking requirements.
  1484  //
  1485  // +checklocks:e.mu
  1486  // +checklocksalias:e.igmp.ep.mu=e.mu
  1487  func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error {
  1488  	return e.igmp.leaveGroup(addr)
  1489  }
  1490  
// IsInGroup implements stack.GroupAddressableEndpoint.
//
// It holds e.mu for reading and reports the answer given by the endpoint's
// IGMP state for addr.
func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.igmp.isInGroup(addr) // +checklocksforce: e.mu==e.igmp.ep.mu.
}
  1497  
  1498  // Stats implements stack.NetworkEndpoint.
  1499  func (e *endpoint) Stats() stack.NetworkEndpointStats {
  1500  	return &e.stats.localStats
  1501  }
  1502  
  1503  var _ stack.NetworkProtocol = (*protocol)(nil)
  1504  var _ stack.MulticastForwardingNetworkProtocol = (*protocol)(nil)
  1505  var _ stack.RejectIPv4WithHandler = (*protocol)(nil)
  1506  var _ fragmentation.TimeoutHandler = (*protocol)(nil)
  1507  
// protocol is the IPv4 network protocol implementation. *protocol is asserted
// at compile time to implement stack.NetworkProtocol,
// stack.MulticastForwardingNetworkProtocol, stack.RejectIPv4WithHandler and
// fragmentation.TimeoutHandler.
type protocol struct {
	// stack is the stack this protocol instance was created for.
	stack *stack.Stack

	// mu protects annotated fields below.
	mu sync.RWMutex

	// eps is keyed by NICID to allow protocol methods to retrieve an endpoint
	// when handling a packet, by looking at which NIC handled the packet.
	// +checklocks:mu
	eps map[tcpip.NICID]*endpoint

	// ICMP types for which the stack's global rate limiting must apply.
	// +checklocks:mu
	icmpRateLimitedTypes map[header.ICMPv4Type]struct{}

	// defaultTTL is the current default TTL for the protocol. Only the
	// uint8 portion of it is meaningful.
	defaultTTL atomicbitops.Uint32

	// ids and hashIV are filled with random values when the protocol factory
	// is built (see NewProtocolWithOptions).
	// NOTE(review): presumably ids backs IPv4 identification generation, with
	// hashIV seeding the route hash that selects an entry; the use site is
	// outside this section - confirm.
	ids    []atomicbitops.Uint32
	hashIV uint32
	// idTS is the unix timestamp in milliseconds 'ids' was last accessed.
	idTS atomicbitops.Int64

	// fragmentation is the fragment reassembly state. It is created together
	// with the protocol and released by Close.
	fragmentation *fragmentation.Fragmentation

	// options holds the options the protocol was created with.
	options Options

	// multicastRouteTable holds the multicast routes installed and removed
	// through AddMulticastRoute and RemoveMulticastRoute.
	multicastRouteTable multicast.RouteTable
	// multicastForwardingDisp is the multicast forwarding event dispatcher that
	// an integrator can provide to receive multicast forwarding events. Note
	// that multicast packets will only be forwarded if this is non-nil.
	// +checklocks:mu
	multicastForwardingDisp stack.MulticastForwardingEventDispatcher
}
  1543  
  1544  // Number returns the ipv4 protocol number.
  1545  func (p *protocol) Number() tcpip.NetworkProtocolNumber {
  1546  	return ProtocolNumber
  1547  }
  1548  
  1549  // MinimumPacketSize returns the minimum valid ipv4 packet size.
  1550  func (p *protocol) MinimumPacketSize() int {
  1551  	return header.IPv4MinimumSize
  1552  }
  1553  
  1554  // ParseAddresses implements stack.NetworkProtocol.
  1555  func (*protocol) ParseAddresses(v []byte) (src, dst tcpip.Address) {
  1556  	h := header.IPv4(v)
  1557  	return h.SourceAddress(), h.DestinationAddress()
  1558  }
  1559  
  1560  // SetOption implements stack.NetworkProtocol.
  1561  func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error {
  1562  	switch v := option.(type) {
  1563  	case *tcpip.DefaultTTLOption:
  1564  		p.SetDefaultTTL(uint8(*v))
  1565  		return nil
  1566  	default:
  1567  		return &tcpip.ErrUnknownProtocolOption{}
  1568  	}
  1569  }
  1570  
  1571  // Option implements stack.NetworkProtocol.
  1572  func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error {
  1573  	switch v := option.(type) {
  1574  	case *tcpip.DefaultTTLOption:
  1575  		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
  1576  		return nil
  1577  	default:
  1578  		return &tcpip.ErrUnknownProtocolOption{}
  1579  	}
  1580  }
  1581  
  1582  // SetDefaultTTL sets the default TTL for endpoints created with this protocol.
  1583  func (p *protocol) SetDefaultTTL(ttl uint8) {
  1584  	p.defaultTTL.Store(uint32(ttl))
  1585  }
  1586  
  1587  // DefaultTTL returns the default TTL for endpoints created with this protocol.
  1588  func (p *protocol) DefaultTTL() uint8 {
  1589  	return uint8(p.defaultTTL.Load())
  1590  }
  1591  
// Close implements stack.NetworkProtocol.
//
// It releases the fragment reassembly state and closes the multicast route
// table.
func (p *protocol) Close() {
	p.fragmentation.Release()
	p.multicastRouteTable.Close()
}
  1597  
// Wait implements stack.NetworkProtocol. It is a no-op.
func (*protocol) Wait() {}
  1600  
  1601  func (p *protocol) validateUnicastSourceAndMulticastDestination(addresses stack.UnicastSourceAndMulticastDestination) tcpip.Error {
  1602  	if !p.isUnicastAddress(addresses.Source) || header.IsV4LinkLocalUnicastAddress(addresses.Source) {
  1603  		return &tcpip.ErrBadAddress{}
  1604  	}
  1605  
  1606  	if !header.IsV4MulticastAddress(addresses.Destination) || header.IsV4LinkLocalMulticastAddress(addresses.Destination) {
  1607  		return &tcpip.ErrBadAddress{}
  1608  	}
  1609  
  1610  	return nil
  1611  }
  1612  
  1613  func (p *protocol) multicastForwarding() bool {
  1614  	p.mu.RLock()
  1615  	defer p.mu.RUnlock()
  1616  	return p.multicastForwardingDisp != nil
  1617  }
  1618  
  1619  func (p *protocol) newInstalledRoute(route stack.MulticastRoute) (*multicast.InstalledRoute, tcpip.Error) {
  1620  	if len(route.OutgoingInterfaces) == 0 {
  1621  		return nil, &tcpip.ErrMissingRequiredFields{}
  1622  	}
  1623  
  1624  	if !p.stack.HasNIC(route.ExpectedInputInterface) {
  1625  		return nil, &tcpip.ErrUnknownNICID{}
  1626  	}
  1627  
  1628  	for _, outgoingInterface := range route.OutgoingInterfaces {
  1629  		if route.ExpectedInputInterface == outgoingInterface.ID {
  1630  			return nil, &tcpip.ErrMulticastInputCannotBeOutput{}
  1631  		}
  1632  
  1633  		if !p.stack.HasNIC(outgoingInterface.ID) {
  1634  			return nil, &tcpip.ErrUnknownNICID{}
  1635  		}
  1636  	}
  1637  	return p.multicastRouteTable.NewInstalledRoute(route), nil
  1638  }
  1639  
  1640  // AddMulticastRoute implements stack.MulticastForwardingNetworkProtocol.
  1641  func (p *protocol) AddMulticastRoute(addresses stack.UnicastSourceAndMulticastDestination, route stack.MulticastRoute) tcpip.Error {
  1642  	if !p.multicastForwarding() {
  1643  		return &tcpip.ErrNotPermitted{}
  1644  	}
  1645  
  1646  	if err := p.validateUnicastSourceAndMulticastDestination(addresses); err != nil {
  1647  		return err
  1648  	}
  1649  
  1650  	installedRoute, err := p.newInstalledRoute(route)
  1651  	if err != nil {
  1652  		return err
  1653  	}
  1654  
  1655  	pendingPackets := p.multicastRouteTable.AddInstalledRoute(addresses, installedRoute)
  1656  
  1657  	for _, pkt := range pendingPackets {
  1658  		p.forwardPendingMulticastPacket(pkt, installedRoute)
  1659  	}
  1660  	return nil
  1661  }
  1662  
  1663  // RemoveMulticastRoute implements
  1664  // stack.MulticastForwardingNetworkProtocol.RemoveMulticastRoute.
  1665  func (p *protocol) RemoveMulticastRoute(addresses stack.UnicastSourceAndMulticastDestination) tcpip.Error {
  1666  	if err := p.validateUnicastSourceAndMulticastDestination(addresses); err != nil {
  1667  		return err
  1668  	}
  1669  
  1670  	if removed := p.multicastRouteTable.RemoveInstalledRoute(addresses); !removed {
  1671  		return &tcpip.ErrHostUnreachable{}
  1672  	}
  1673  
  1674  	return nil
  1675  }
  1676  
  1677  // EnableMulticastForwarding implements
  1678  // stack.MulticastForwardingNetworkProtocol.EnableMulticastForwarding.
  1679  func (p *protocol) EnableMulticastForwarding(disp stack.MulticastForwardingEventDispatcher) (bool, tcpip.Error) {
  1680  	p.mu.Lock()
  1681  	defer p.mu.Unlock()
  1682  
  1683  	if p.multicastForwardingDisp != nil {
  1684  		return true, nil
  1685  	}
  1686  
  1687  	if disp == nil {
  1688  		return false, &tcpip.ErrInvalidOptionValue{}
  1689  	}
  1690  
  1691  	p.multicastForwardingDisp = disp
  1692  	return false, nil
  1693  }
  1694  
// DisableMulticastForwarding implements
// stack.MulticastForwardingNetworkProtocol.DisableMulticastForwarding.
//
// While holding p.mu it clears the multicast forwarding event dispatcher and
// removes every installed route from the multicast route table.
func (p *protocol) DisableMulticastForwarding() {
	p.mu.Lock()
	defer p.mu.Unlock()

	p.multicastForwardingDisp = nil
	p.multicastRouteTable.RemoveAllInstalledRoutes()
}
  1704  
  1705  // MulticastRouteLastUsedTime implements
  1706  // stack.MulticastForwardingNetworkProtocol.
  1707  func (p *protocol) MulticastRouteLastUsedTime(addresses stack.UnicastSourceAndMulticastDestination) (tcpip.MonotonicTime, tcpip.Error) {
  1708  	if err := p.validateUnicastSourceAndMulticastDestination(addresses); err != nil {
  1709  		return tcpip.MonotonicTime{}, err
  1710  	}
  1711  
  1712  	timestamp, found := p.multicastRouteTable.GetLastUsedTimestamp(addresses)
  1713  
  1714  	if !found {
  1715  		return tcpip.MonotonicTime{}, &tcpip.ErrHostUnreachable{}
  1716  	}
  1717  
  1718  	return timestamp, nil
  1719  }
  1720  
  1721  func (p *protocol) forwardPendingMulticastPacket(pkt *stack.PacketBuffer, installedRoute *multicast.InstalledRoute) {
  1722  	defer pkt.DecRef()
  1723  
  1724  	// Attempt to forward the packet using the endpoint that it originally
  1725  	// arrived on. This ensures that the packet is only forwarded if it
  1726  	// matches the route's expected input interface (see 5a of RFC 1812 section
  1727  	// 5.2.1.3).
  1728  	ep, ok := p.getEndpointForNIC(pkt.NICID)
  1729  
  1730  	if !ok {
  1731  		// The endpoint that the packet arrived on no longer exists. Silently
  1732  		// drop the pkt.
  1733  		return
  1734  	}
  1735  
  1736  	if !ep.MulticastForwarding() {
  1737  		return
  1738  	}
  1739  
  1740  	ep.handleForwardingError(ep.forwardValidatedMulticastPacket(pkt, installedRoute))
  1741  }
  1742  
  1743  func (p *protocol) isUnicastAddress(addr tcpip.Address) bool {
  1744  	if addr.BitLen() != header.IPv4AddressSizeBits {
  1745  		return false
  1746  	}
  1747  
  1748  	if addr == header.IPv4Any || addr == header.IPv4Broadcast {
  1749  		return false
  1750  	}
  1751  
  1752  	if p.isSubnetLocalBroadcastAddress(addr) {
  1753  		return false
  1754  	}
  1755  	return !header.IsV4MulticastAddress(addr)
  1756  }
  1757  
  1758  func (p *protocol) isSubnetLocalBroadcastAddress(addr tcpip.Address) bool {
  1759  	p.mu.RLock()
  1760  	defer p.mu.RUnlock()
  1761  
  1762  	for _, e := range p.eps {
  1763  		if addressEndpoint := e.AcquireAssignedAddress(addr, false /* createTemp */, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil {
  1764  			subnet := addressEndpoint.Subnet()
  1765  			if subnet.IsBroadcast(addr) {
  1766  				return true
  1767  			}
  1768  		}
  1769  	}
  1770  	return false
  1771  }
  1772  
  1773  // parseAndValidate parses the packet (including its transport layer header) and
  1774  // returns the parsed IP header.
  1775  //
  1776  // Returns true if the IP header was successfully parsed.
  1777  func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (*buffer.View, bool) {
  1778  	transProtoNum, hasTransportHdr, ok := p.Parse(pkt)
  1779  	if !ok {
  1780  		return nil, false
  1781  	}
  1782  
  1783  	h := header.IPv4(pkt.NetworkHeader().Slice())
  1784  	// Do not include the link header's size when calculating the size of the IP
  1785  	// packet.
  1786  	if !h.IsValid(pkt.Size() - len(pkt.LinkHeader().Slice())) {
  1787  		return nil, false
  1788  	}
  1789  
  1790  	if !pkt.RXChecksumValidated && !h.IsChecksumValid() {
  1791  		return nil, false
  1792  	}
  1793  
  1794  	if hasTransportHdr {
  1795  		p.parseTransport(pkt, transProtoNum)
  1796  	}
  1797  
  1798  	return pkt.NetworkHeader().View(), true
  1799  }
  1800  
  1801  func (p *protocol) parseTransport(pkt *stack.PacketBuffer, transProtoNum tcpip.TransportProtocolNumber) {
  1802  	if transProtoNum == header.ICMPv4ProtocolNumber {
  1803  		// The transport layer will handle transport layer parsing errors.
  1804  		_ = parse.ICMPv4(pkt)
  1805  		return
  1806  	}
  1807  
  1808  	switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err {
  1809  	case stack.ParsedOK:
  1810  	case stack.UnknownTransportProtocol, stack.TransportLayerParseError:
  1811  		// The transport layer will handle unknown protocols and transport layer
  1812  		// parsing errors.
  1813  	default:
  1814  		panic(fmt.Sprintf("unexpected error parsing transport header = %d", err))
  1815  	}
  1816  }
  1817  
  1818  // Parse implements stack.NetworkProtocol.
  1819  func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
  1820  	if ok := parse.IPv4(pkt); !ok {
  1821  		return 0, false, false
  1822  	}
  1823  
  1824  	ipHdr := header.IPv4(pkt.NetworkHeader().Slice())
  1825  	return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true
  1826  }
  1827  
  1828  // allowICMPReply reports whether an ICMP reply with provided type and code may
  1829  // be sent following the rate mask options and global ICMP rate limiter.
  1830  func (p *protocol) allowICMPReply(icmpType header.ICMPv4Type, code header.ICMPv4Code) bool {
  1831  	// Mimic linux and never rate limit for PMTU discovery.
  1832  	// https://github.com/torvalds/linux/blob/9e9fb7655ed585da8f468e29221f0ba194a5f613/net/ipv4/icmp.c#L288
  1833  	if icmpType == header.ICMPv4DstUnreachable && code == header.ICMPv4FragmentationNeeded {
  1834  		return true
  1835  	}
  1836  	p.mu.RLock()
  1837  	defer p.mu.RUnlock()
  1838  
  1839  	if _, ok := p.icmpRateLimitedTypes[icmpType]; ok {
  1840  		return p.stack.AllowICMPMessage()
  1841  	}
  1842  	return true
  1843  }
  1844  
// SendRejectionError implements stack.RejectIPv4WithHandler.
//
// rejectWith selects the ICMP reason that is handed, together with pkt and
// inputHook, to p.returnError; the result of that call is returned.
//
// Panics if rejectWith is not one of the handled values.
func (p *protocol) SendRejectionError(pkt *stack.PacketBuffer, rejectWith stack.RejectIPv4WithICMPType, inputHook bool) tcpip.Error {
	switch rejectWith {
	case stack.RejectIPv4WithICMPNetUnreachable:
		return p.returnError(&icmpReasonNetworkUnreachable{}, pkt, inputHook)
	case stack.RejectIPv4WithICMPHostUnreachable:
		return p.returnError(&icmpReasonHostUnreachable{}, pkt, inputHook)
	case stack.RejectIPv4WithICMPPortUnreachable:
		return p.returnError(&icmpReasonPortUnreachable{}, pkt, inputHook)
	case stack.RejectIPv4WithICMPNetProhibited:
		return p.returnError(&icmpReasonNetworkProhibited{}, pkt, inputHook)
	case stack.RejectIPv4WithICMPHostProhibited:
		return p.returnError(&icmpReasonHostProhibited{}, pkt, inputHook)
	case stack.RejectIPv4WithICMPAdminProhibited:
		return p.returnError(&icmpReasonAdministrativelyProhibited{}, pkt, inputHook)
	default:
		// Reaching here means a rejection type was added without a matching
		// case above.
		panic(fmt.Sprintf("unhandled %[1]T = %[1]d", rejectWith))
	}
}
  1864  
  1865  // calculateNetworkMTU calculates the network-layer payload MTU based on the
  1866  // link-layer payload mtu.
  1867  func calculateNetworkMTU(linkMTU, networkHeaderSize uint32) (uint32, tcpip.Error) {
  1868  	if linkMTU < header.IPv4MinimumMTU {
  1869  		return 0, &tcpip.ErrInvalidEndpointState{}
  1870  	}
  1871  
  1872  	// As per RFC 791 section 3.1, an IPv4 header cannot exceed 60 bytes in
  1873  	// length:
  1874  	//   The maximal internet header is 60 octets, and a typical internet header
  1875  	//   is 20 octets, allowing a margin for headers of higher level protocols.
  1876  	if networkHeaderSize > header.IPv4MaximumHeaderSize {
  1877  		return 0, &tcpip.ErrMalformedHeader{}
  1878  	}
  1879  
  1880  	networkMTU := linkMTU
  1881  	if networkMTU > MaxTotalSize {
  1882  		networkMTU = MaxTotalSize
  1883  	}
  1884  
  1885  	return networkMTU - networkHeaderSize, nil
  1886  }
  1887  
  1888  func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32) bool {
  1889  	payload := len(pkt.TransportHeader().Slice()) + pkt.Data().Size()
  1890  	return pkt.GSOOptions.Type == stack.GSONone && uint32(payload) > networkMTU
  1891  }
  1892  
  1893  // addressToUint32 translates an IPv4 address into its little endian uint32
  1894  // representation.
  1895  //
  1896  // This function does the same thing as binary.LittleEndian.Uint32 but operates
  1897  // on a tcpip.Address (a string) without the need to convert it to a byte slice,
  1898  // which would cause an allocation.
  1899  func addressToUint32(addr tcpip.Address) uint32 {
  1900  	addrBytes := addr.As4()
  1901  	_ = addrBytes[3] // bounds check hint to compiler
  1902  	return uint32(addrBytes[0]) | uint32(addrBytes[1])<<8 | uint32(addrBytes[2])<<16 | uint32(addrBytes[3])<<24
  1903  }
  1904  
  1905  // hashRoute calculates a hash value for the given source/destination pair using
  1906  // the addresses, transport protocol number and a 32-bit number to generate the
  1907  // hash.
  1908  func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 {
  1909  	a := addressToUint32(srcAddr)
  1910  	b := addressToUint32(dstAddr)
  1911  	return hash.Hash3Words(a, b, uint32(protocol), hashIV)
  1912  }
  1913  
// Options holds options to configure a new protocol. The zero value is what
// NewProtocol uses.
type Options struct {
	// IGMP holds options for IGMP.
	IGMP IGMPOptions

	// AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e.
	// martian loopback packets) should be accepted.
	AllowExternalLoopbackTraffic bool
}
  1923  
  1924  // NewProtocolWithOptions returns an IPv4 network protocol.
  1925  func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory {
  1926  	ids := make([]atomicbitops.Uint32, buckets)
  1927  
  1928  	// Randomly initialize hashIV and the ids.
  1929  	r := hash.RandN32(1 + buckets)
  1930  	for i := range ids {
  1931  		ids[i] = atomicbitops.FromUint32(r[i])
  1932  	}
  1933  	hashIV := r[buckets]
  1934  
  1935  	return func(s *stack.Stack) stack.NetworkProtocol {
  1936  		p := &protocol{
  1937  			stack:      s,
  1938  			ids:        ids,
  1939  			hashIV:     hashIV,
  1940  			defaultTTL: atomicbitops.FromUint32(DefaultTTL),
  1941  			options:    opts,
  1942  		}
  1943  		p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p)
  1944  		p.eps = make(map[tcpip.NICID]*endpoint)
  1945  		// Set ICMP rate limiting to Linux defaults.
  1946  		// See https://man7.org/linux/man-pages/man7/icmp.7.html.
  1947  		p.icmpRateLimitedTypes = map[header.ICMPv4Type]struct{}{
  1948  			header.ICMPv4DstUnreachable: {},
  1949  			header.ICMPv4SrcQuench:      {},
  1950  			header.ICMPv4TimeExceeded:   {},
  1951  			header.ICMPv4ParamProblem:   {},
  1952  		}
  1953  		if err := p.multicastRouteTable.Init(multicast.DefaultConfig(s.Clock())); err != nil {
  1954  			panic(fmt.Sprintf("p.multicastRouteTable.Init(_): %s", err))
  1955  		}
  1956  		return p
  1957  	}
  1958  }
  1959  
  1960  // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options.
  1961  func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
  1962  	return NewProtocolWithOptions(Options{})(s)
  1963  }
  1964  
  1965  func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) {
  1966  	fragPkt, offset, copied, more := pf.BuildNextFragment()
  1967  	fragPkt.NetworkProtocolNumber = ProtocolNumber
  1968  
  1969  	originalIPHeaderLength := len(originalIPHeader)
  1970  	nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength))
  1971  	fragPkt.NetworkProtocolNumber = ProtocolNumber
  1972  
  1973  	if copied := copy(nextFragIPHeader, originalIPHeader); copied != len(originalIPHeader) {
  1974  		panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", copied, originalIPHeaderLength))
  1975  	}
  1976  
  1977  	flags := originalIPHeader.Flags()
  1978  	if more {
  1979  		flags |= header.IPv4FlagMoreFragments
  1980  	}
  1981  	nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset))
  1982  	nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied))
  1983  	nextFragIPHeader.SetChecksum(0)
  1984  	nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum())
  1985  
  1986  	return fragPkt, more
  1987  }
  1988  
// optionAction describes possible actions that may be taken on an option
// while processing it.
type optionAction uint8

const (
	// optionRemove says that the option should not be in the output option set.
	// It is the zero value of optionAction.
	optionRemove optionAction = iota

	// optionProcess says that the option should be fully processed.
	optionProcess

	// optionVerify says the option should be checked and passed unchanged.
	optionVerify

	// optionPass says to pass the output set without checking.
	optionPass
)
  2006  
// optionActions lists what to do for each option in a given scenario. Values
// of this type are produced by the actions method of an optionsUsage.
type optionActions struct {
	// timestamp controls what to do with a Timestamp option.
	timestamp optionAction

	// recordRoute controls what to do with a Record Route option.
	recordRoute optionAction

	// routerAlert controls what to do with a Router Alert option.
	routerAlert optionAction

	// unknown controls what to do with an unknown option.
	unknown optionAction
}
  2021  
// optionsUsage specifies the ways options may be operated upon for a given
// scenario during packet processing.
type optionsUsage interface {
	// actions returns the action to take for each kind of option in the
	// scenario the implementation represents.
	actions() optionActions
}
  2027  
  2028  // optionUsageVerify implements optionsUsage for when we just want to check
  2029  // fragments. Don't change anything, just check and reject if bad. No
  2030  // replacement options are generated.
  2031  type optionUsageVerify struct{}
  2032  
  2033  // actions implements optionsUsage.
  2034  func (*optionUsageVerify) actions() optionActions {
  2035  	return optionActions{
  2036  		timestamp:   optionVerify,
  2037  		recordRoute: optionVerify,
  2038  		routerAlert: optionVerify,
  2039  		unknown:     optionRemove,
  2040  	}
  2041  }
  2042  
  2043  // optionUsageReceive implements optionsUsage for packets we will pass
  2044  // to the transport layer (with the exception of Echo requests).
  2045  type optionUsageReceive struct{}
  2046  
  2047  // actions implements optionsUsage.
  2048  func (*optionUsageReceive) actions() optionActions {
  2049  	return optionActions{
  2050  		timestamp:   optionProcess,
  2051  		recordRoute: optionProcess,
  2052  		routerAlert: optionVerify,
  2053  		unknown:     optionPass,
  2054  	}
  2055  }
  2056  
  2057  // optionUsageForward implements optionsUsage for packets about to be forwarded.
  2058  // All options are passed on regardless of whether we recognise them, however
  2059  // we do process the Timestamp and Record Route options.
  2060  type optionUsageForward struct{}
  2061  
  2062  // actions implements optionsUsage.
  2063  func (*optionUsageForward) actions() optionActions {
  2064  	return optionActions{
  2065  		timestamp:   optionProcess,
  2066  		recordRoute: optionProcess,
  2067  		routerAlert: optionVerify,
  2068  		unknown:     optionPass,
  2069  	}
  2070  }
  2071  
  2072  // optionUsageEcho implements optionsUsage for echo packet processing.
  2073  // Only Timestamp and RecordRoute are processed and sent back.
  2074  type optionUsageEcho struct{}
  2075  
  2076  // actions implements optionsUsage.
  2077  func (*optionUsageEcho) actions() optionActions {
  2078  	return optionActions{
  2079  		timestamp:   optionProcess,
  2080  		recordRoute: optionProcess,
  2081  		routerAlert: optionVerify,
  2082  		unknown:     optionRemove,
  2083  	}
  2084  }
  2085  
// handleTimestamp does any required processing on a Timestamp option
// in place.
//
// The option's flags, pointer and length are validated first. If the option is
// well formed and has room for another entry, and usage asks for Timestamp
// options to be processed, the option is updated using localAddress and clock.
// If the data area is already full, the overflow counter is incremented
// instead (except for the predefined-IP flavour, where nothing is done).
//
// Returns nil if the option is acceptable. Otherwise the returned
// header.IPv4OptParameterProblem identifies, through Pointer, the offset of
// the offending field within the option.
func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) *header.IPv4OptParameterProblem {
	flags := tsOpt.Flags()
	// entrySize is the size of a single entry in the option's data area, which
	// depends on whether entries carry an address next to the timestamp.
	var entrySize uint8
	switch flags {
	case header.IPv4OptionTimestampOnlyFlag:
		entrySize = header.IPv4OptionTimestampSize
	case
		header.IPv4OptionTimestampWithIPFlag,
		header.IPv4OptionTimestampWithPredefinedIPFlag:
		entrySize = header.IPv4OptionTimestampWithAddrSize
	default:
		// Any other flag value is not understood.
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptTSOFLWAndFLGOffset,
			NeedICMP: true,
		}
	}

	pointer := tsOpt.Pointer()
	// RFC 791 page 22 states: "The smallest legal value is 5."
	// Since the pointer is 1 based, and the header is 4 bytes long the
	// pointer must point beyond the header therefore 4 or less is bad.
	if pointer <= header.IPv4OptionTimestampHdrLength {
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptTSPointerOffset,
			NeedICMP: true,
		}
	}
	// To simplify processing below, base further work on the array of timestamps
	// beyond the header, rather than on the whole option. Also to aid
	// calculations set 'nextSlot' to be 0 based as in the packet it is 1 based.
	nextSlot := pointer - (header.IPv4OptionTimestampHdrLength + 1)
	optLen := tsOpt.Size()
	dataLength := optLen - header.IPv4OptionTimestampHdrLength

	// In the section below, we verify the pointer, length and overflow counter
	// fields of the option. The distinction is in which byte you return as being
	// in error in the ICMP packet. Offsets 1 (length), 2 (pointer)
	// or 3 (overflowed counter).
	//
	// The following RFC sections cover this section:
	//
	// RFC 791 (page 22):
	//    If there is some room but not enough room for a full timestamp
	//    to be inserted, or the overflow count itself overflows, the
	//    original datagram is considered to be in error and is discarded.
	//    In either case an ICMP parameter problem message may be sent to
	//    the source host [3].
	//
	// You can get this situation in two ways. Firstly if the data area is not
	// a multiple of the entry size or secondly, if the pointer is not at a
	// multiple of the entry size. The wording of the RFC suggests that
	// this is not an error until you actually run out of space.
	if pointer > optLen {
		// RFC 791 (page 22) says we should switch to using the overflow count.
		//    If the timestamp data area is already full (the pointer exceeds
		//    the length) the datagram is forwarded without inserting the
		//    timestamp, but the overflow count is incremented by one.
		if flags == header.IPv4OptionTimestampWithPredefinedIPFlag {
			// By definition we have nothing to do.
			return nil
		}

		// A zero result from IncOverflow is treated below as the counter
		// having no room left.
		if tsOpt.IncOverflow() != 0 {
			return nil
		}
		// The overflow count is also full.
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptTSOFLWAndFLGOffset,
			NeedICMP: true,
		}
	}
	if nextSlot+entrySize > dataLength {
		// The data area isn't full but there isn't room for a new entry.
		// Either Length or Pointer could be bad.
		//
		// The branch below is deliberately disabled; it is kept to document the
		// check that would be performed if Linux compatibility were not needed.
		if false {
			// We must select Pointer for Linux compatibility, even if
			// only the length is bad.
			// The Linux code is at (in October 2020)
			// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L367-L370
			//		if (optptr[2]+3 > optlen) {
			//			pp_ptr = optptr + 2;
			//			goto error;
			//		}
			// which doesn't distinguish between which of optptr[2] or optlen
			// is wrong, but just arbitrarily decides on optptr+2.
			if dataLength%entrySize != 0 {
				// The Data section size should be a multiple of the expected
				// timestamp entry size.
				return &header.IPv4OptParameterProblem{
					Pointer:  header.IPv4OptionLengthOffset,
					NeedICMP: false,
				}
			}
			// If the size is OK, the pointer must be corrupted.
		}
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptTSPointerOffset,
			NeedICMP: true,
		}
	}

	// The option is valid and has room; only write to it when the scenario
	// asks for Timestamp options to be processed.
	if usage.actions().timestamp == optionProcess {
		tsOpt.UpdateTimestamp(localAddress, clock)
	}
	return nil
}
  2194  
// handleRecordRoute checks and processes a Record route option. It is much
// like the timestamp type 1 option, but without timestamps. The passed in
// address is stored in the option in the correct spot if possible.
//
// The option's length and pointer are validated first. If the route data area
// is already full the option is left untouched. Otherwise, unless usage asks
// for Record Route options to only be verified, localAddress is stored in the
// option.
//
// Returns nil if the option is acceptable. Otherwise the returned
// header.IPv4OptParameterProblem identifies, through Pointer, the offset of
// the offending field within the option.
func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) *header.IPv4OptParameterProblem {
	optlen := rrOpt.Size()

	// The option must be big enough to hold its header and at least one
	// address.
	if optlen < header.IPv4AddressSize+header.IPv4OptionRecordRouteHdrLength {
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptionLengthOffset,
			NeedICMP: true,
		}
	}

	pointer := rrOpt.Pointer()
	// RFC 791 page 20 states:
	//      The pointer is relative to this option, and the
	//      smallest legal value for the pointer is 4.
	// Since the pointer is 1 based, and the header is 3 bytes long the
	// pointer must point beyond the header therefore 3 or less is bad.
	if pointer <= header.IPv4OptionRecordRouteHdrLength {
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptRRPointerOffset,
			NeedICMP: true,
		}
	}

	// RFC 791 page 21 says
	//       If the route data area is already full (the pointer exceeds the
	//       length) the datagram is forwarded without inserting the address
	//       into the recorded route. If there is some room but not enough
	//       room for a full address to be inserted, the original datagram is
	//       considered to be in error and is discarded.  In either case an
	//       ICMP parameter problem message may be sent to the source
	//       host.
	// The use of the words "In either case" suggests that a 'full' RR option
	// could generate an ICMP at every hop after it fills up. We chose to not
	// do this (as do most implementations). It is probable that the inclusion
	// of these words is a copy/paste error from the timestamp option where
	// there are two failure reasons given.
	if pointer > optlen {
		return nil
	}

	// The data area isn't full but there isn't room for a new entry.
	// Either Length or Pointer could be bad. We must select Pointer for Linux
	// compatibility, even if only the length is bad. NB. pointer is 1 based.
	if pointer+header.IPv4AddressSize > optlen+1 {
		// The branch below is deliberately disabled; it is kept to document the
		// check that would be performed if Linux compatibility were not needed.
		if false {
			// This is what we would do if we were not being Linux compatible.
			// Check for bad pointer or length value. Must be a multiple of 4 after
			// accounting for the 3 byte header and not within that header.
			// RFC 791, page 20 says:
			//       The pointer is relative to this option, and the
			//       smallest legal value for the pointer is 4.
			//
			//       A recorded route is composed of a series of internet addresses.
			//       Each internet address is 32 bits or 4 octets.
			// Linux skips this test so we must too.  See Linux code at:
			// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L338-L341
			//    if (optptr[2]+3 > optlen) {
			//      pp_ptr = optptr + 2;
			//      goto error;
			//    }
			if (optlen-header.IPv4OptionRecordRouteHdrLength)%header.IPv4AddressSize != 0 {
				// Length is bad, not on integral number of slots.
				return &header.IPv4OptParameterProblem{
					Pointer:  header.IPv4OptionLengthOffset,
					NeedICMP: true,
				}
			}
			// If not length, the fault must be with the pointer.
		}
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptRRPointerOffset,
			NeedICMP: true,
		}
	}
	// The option is valid and has room for an address; when only verification
	// was requested it must not be modified.
	if usage.actions().recordRoute == optionVerify {
		return nil
	}
	rrOpt.StoreAddress(localAddress)
	return nil
}
  2278  
  2279  // handleRouterAlert performs sanity checks on a Router Alert option.
  2280  func handleRouterAlert(raOpt header.IPv4OptionRouterAlert) *header.IPv4OptParameterProblem {
  2281  	// Only the zero value is acceptable, as per RFC 2113, section 2.1:
  2282  	//   Value:  A two octet code with the following values:
  2283  	//     0 - Router shall examine packet
  2284  	//     1-65535 - Reserved
  2285  	if raOpt.Value() != header.IPv4OptionRouterAlertValue {
  2286  		return &header.IPv4OptParameterProblem{
  2287  			Pointer:  header.IPv4OptionRouterAlertValueOffset,
  2288  			NeedICMP: true,
  2289  		}
  2290  	}
  2291  	return nil
  2292  }
  2293  
  2294  type optionTracker struct {
  2295  	timestamp   bool
  2296  	recordRoute bool
  2297  	routerAlert bool
  2298  }
  2299  
  2300  // processIPOptions parses the IPv4 options and produces a new set of options
  2301  // suitable for use in the next step of packet processing as informed by usage.
  2302  // The original will not be touched.
  2303  //
  2304  // If there were no errors during parsing, the new set of options is returned as
  2305  // a new buffer.
  2306  func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, opts header.IPv4Options, usage optionsUsage) (header.IPv4Options, optionTracker, *header.IPv4OptParameterProblem) {
  2307  	stats := e.stats.ip
  2308  	optIter := opts.MakeIterator()
  2309  
  2310  	// Except NOP, each option must only appear at most once (RFC 791 section 3.1,
  2311  	// at the definition of every type).
  2312  	// Keep track of each option we find to enable duplicate option detection.
  2313  	var seenOptions [math.MaxUint8 + 1]bool
  2314  
  2315  	// TODO(https://gvisor.dev/issue/4586): This will need tweaking when we start
  2316  	// really forwarding packets as we may need to get two addresses, for rx and
  2317  	// tx interfaces. We will also have to take usage into account.
  2318  	localAddress := e.MainAddress().Address
  2319  	if localAddress.BitLen() == 0 {
  2320  		h := header.IPv4(pkt.NetworkHeader().Slice())
  2321  		dstAddr := h.DestinationAddress()
  2322  		if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(dstAddr) {
  2323  			return nil, optionTracker{}, &header.IPv4OptParameterProblem{
  2324  				NeedICMP: false,
  2325  			}
  2326  		}
  2327  		localAddress = dstAddr
  2328  	}
  2329  
  2330  	var optionsProcessed optionTracker
  2331  	for {
  2332  		option, done, optProblem := optIter.Next()
  2333  		if done || optProblem != nil {
  2334  			return optIter.Finalize(), optionsProcessed, optProblem
  2335  		}
  2336  		optType := option.Type()
  2337  		if optType == header.IPv4OptionNOPType {
  2338  			optIter.PushNOPOrEnd(optType)
  2339  			continue
  2340  		}
  2341  		if optType == header.IPv4OptionListEndType {
  2342  			optIter.PushNOPOrEnd(optType)
  2343  			return optIter.Finalize(), optionsProcessed, nil
  2344  		}
  2345  
  2346  		// check for repeating options (multiple NOPs are OK)
  2347  		if seenOptions[optType] {
  2348  			return nil, optionTracker{}, &header.IPv4OptParameterProblem{
  2349  				Pointer:  optIter.ErrCursor,
  2350  				NeedICMP: true,
  2351  			}
  2352  		}
  2353  		seenOptions[optType] = true
  2354  
  2355  		optLen, optProblem := func() (int, *header.IPv4OptParameterProblem) {
  2356  			switch option := option.(type) {
  2357  			case *header.IPv4OptionTimestamp:
  2358  				stats.OptionTimestampReceived.Increment()
  2359  				optionsProcessed.timestamp = true
  2360  				if usage.actions().timestamp != optionRemove {
  2361  					clock := e.protocol.stack.Clock()
  2362  					newBuffer := optIter.InitReplacement(option)
  2363  					optProblem := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage)
  2364  					return len(newBuffer), optProblem
  2365  				}
  2366  
  2367  			case *header.IPv4OptionRecordRoute:
  2368  				stats.OptionRecordRouteReceived.Increment()
  2369  				optionsProcessed.recordRoute = true
  2370  				if usage.actions().recordRoute != optionRemove {
  2371  					newBuffer := optIter.InitReplacement(option)
  2372  					optProblem := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), localAddress, usage)
  2373  					return len(newBuffer), optProblem
  2374  				}
  2375  
  2376  			case *header.IPv4OptionRouterAlert:
  2377  				stats.OptionRouterAlertReceived.Increment()
  2378  				optionsProcessed.routerAlert = true
  2379  				if usage.actions().routerAlert != optionRemove {
  2380  					newBuffer := optIter.InitReplacement(option)
  2381  					optProblem := handleRouterAlert(header.IPv4OptionRouterAlert(newBuffer))
  2382  					return len(newBuffer), optProblem
  2383  				}
  2384  
  2385  			default:
  2386  				stats.OptionUnknownReceived.Increment()
  2387  				if usage.actions().unknown == optionPass {
  2388  					return len(optIter.InitReplacement(option)), nil
  2389  				}
  2390  			}
  2391  			return 0, nil
  2392  		}()
  2393  
  2394  		if optProblem != nil {
  2395  			optProblem.Pointer += optIter.ErrCursor
  2396  			return nil, optionTracker{}, optProblem
  2397  		}
  2398  		optIter.ConsumeBuffer(optLen)
  2399  	}
  2400  }