inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/network/ipv4/ipv4.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package ipv4 contains the implementation of the ipv4 network protocol.
    16  package ipv4
    17  
    18  import (
    19  	"fmt"
    20  	"math"
    21  	"reflect"
    22  	"sync/atomic"
    23  	"time"
    24  
    25  	"inet.af/netstack/sync"
    26  	"inet.af/netstack/tcpip"
    27  	"inet.af/netstack/tcpip/buffer"
    28  	"inet.af/netstack/tcpip/header"
    29  	"inet.af/netstack/tcpip/header/parse"
    30  	"inet.af/netstack/tcpip/network/hash"
    31  	"inet.af/netstack/tcpip/network/internal/fragmentation"
    32  	"inet.af/netstack/tcpip/network/internal/ip"
    33  	"inet.af/netstack/tcpip/stack"
    34  )
    35  
const (
	// ReassembleTimeout is the time a packet stays in the reassembly
	// system before being evicted.
	// As per RFC 791 section 3.2:
	//   The current recommendation for the initial timer setting is 15 seconds.
	//   This may be changed as experience with this protocol accumulates.
	//
	// Considering that it is an old recommendation, we use the same reassembly
	// timeout that Linux defines, which is 30 seconds:
	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138
	ReassembleTimeout = 30 * time.Second

	// ProtocolNumber is the ipv4 protocol number.
	ProtocolNumber = header.IPv4ProtocolNumber

	// MaxTotalSize is the maximum size that can be encoded in the 16-bit
	// TotalLength field of the ipv4 header.
	MaxTotalSize = 0xffff

	// DefaultTTL is the default time-to-live value for this endpoint.
	DefaultTTL = 64

	// buckets is the number of identifier buckets. A hash of the route is
	// reduced modulo this value to pick the IP ID counter to increment.
	buckets = 2048

	// fragmentblockSize is the size of a fragment block, in bytes, as per
	// RFC 791 section 3.1, page 14.
	fragmentblockSize = 8
)
    65  
// Values stored in endpoint.forwarding. They are kept as integers so that the
// field can be read and written with atomic operations.
const (
	forwardingDisabled = 0
	forwardingEnabled  = 1
)
    70  
// ipv4BroadcastAddr is the IPv4 broadcast address together with its prefix.
// Enable adds it to the endpoint as a permanent address and disableLocked
// removes it again.
var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix()

// Compile-time assertions that *endpoint implements the stack interfaces
// listed below.
var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil)
var _ stack.ForwardingNetworkEndpoint = (*endpoint)(nil)
var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
var _ stack.AddressableEndpoint = (*endpoint)(nil)
var _ stack.NetworkEndpoint = (*endpoint)(nil)
    78  
// endpoint is the IPv4 network endpoint that protocol.NewEndpoint creates for
// a network interface.
//
// nic, dispatcher and protocol are set once by NewEndpoint: nic is the
// interface the endpoint was created for, dispatcher is the transport
// dispatcher handed to NewEndpoint, and protocol is the ipv4 protocol instance
// that created the endpoint. stats holds the endpoint's IP, ICMP and IGMP
// counters, which NewEndpoint initializes.
type endpoint struct {
	nic        stack.NetworkInterface
	dispatcher stack.TransportDispatcher
	protocol   *protocol
	stats      sharedStats

	// enabled is set to 1 when the endpoint is enabled and 0 when it is
	// disabled.
	//
	// Must be accessed using atomic operations.
	enabled uint32

	// forwarding is set to forwardingEnabled when the endpoint has forwarding
	// enabled and forwardingDisabled when it is disabled.
	//
	// Must be accessed using atomic operations.
	forwarding uint32

	// mu bundles the lock with the state that the methods in this file touch
	// while holding it: the address state and the IGMP state.
	mu struct {
		sync.RWMutex

		addressableEndpointState stack.AddressableEndpointState
		igmp                     igmpState
	}
}
   104  
   105  // HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint.
   106  func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) {
   107  	// If we are operating as a router, return an ICMP error to the original
   108  	// packet's sender.
   109  	if pkt.NetworkPacketInfo.IsForwardedPacket {
   110  		// TODO(gvisor.dev/issue/6005): Propagate asynchronously generated ICMP
   111  		// errors to local endpoints.
   112  		e.protocol.returnError(&icmpReasonHostUnreachable{}, pkt)
   113  		e.stats.ip.Forwarding.Errors.Increment()
   114  		e.stats.ip.Forwarding.HostUnreachable.Increment()
   115  		return
   116  	}
   117  	// handleControl expects the entire offending packet to be in the packet
   118  	// buffer's data field.
   119  	pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{
   120  		Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
   121  	})
   122  	defer pkt.DecRef()
   123  	pkt.NICID = e.nic.ID()
   124  	pkt.NetworkProtocolNumber = ProtocolNumber
   125  	// Use the same control type as an ICMPv4 destination host unreachable error
   126  	// since the host is considered unreachable if we cannot resolve the link
   127  	// address to the next hop.
   128  	e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt)
   129  }
   130  
   131  // NewEndpoint creates a new ipv4 endpoint.
   132  func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
   133  	e := &endpoint{
   134  		nic:        nic,
   135  		dispatcher: dispatcher,
   136  		protocol:   p,
   137  	}
   138  	e.mu.Lock()
   139  	e.mu.addressableEndpointState.Init(e)
   140  	e.mu.igmp.init(e)
   141  	e.mu.Unlock()
   142  
   143  	tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem())
   144  
   145  	stackStats := p.stack.Stats()
   146  	e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP)
   147  	e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V4)
   148  	e.stats.igmp.init(&e.stats.localStats.IGMP, &stackStats.IGMP)
   149  
   150  	p.mu.Lock()
   151  	p.mu.eps[nic.ID()] = e
   152  	p.mu.Unlock()
   153  
   154  	return e
   155  }
   156  
   157  func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint {
   158  	p.mu.RLock()
   159  	defer p.mu.RUnlock()
   160  
   161  	for _, e := range p.mu.eps {
   162  		if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
   163  			addressEndpoint.DecRef()
   164  			return e
   165  		}
   166  	}
   167  
   168  	return nil
   169  }
   170  
   171  func (p *protocol) getEndpointForNIC(id tcpip.NICID) (*endpoint, bool) {
   172  	p.mu.RLock()
   173  	defer p.mu.RUnlock()
   174  	ep, ok := p.mu.eps[id]
   175  	return ep, ok
   176  }
   177  
   178  func (p *protocol) forgetEndpoint(nicID tcpip.NICID) {
   179  	p.mu.Lock()
   180  	defer p.mu.Unlock()
   181  	delete(p.mu.eps, nicID)
   182  }
   183  
   184  // Forwarding implements stack.ForwardingNetworkEndpoint.
   185  func (e *endpoint) Forwarding() bool {
   186  	return atomic.LoadUint32(&e.forwarding) == forwardingEnabled
   187  }
   188  
   189  // setForwarding sets the forwarding status for the endpoint.
   190  //
   191  // Returns true if the forwarding status was updated.
   192  func (e *endpoint) setForwarding(v bool) bool {
   193  	forwarding := uint32(forwardingDisabled)
   194  	if v {
   195  		forwarding = forwardingEnabled
   196  	}
   197  
   198  	return atomic.SwapUint32(&e.forwarding, forwarding) != forwarding
   199  }
   200  
   201  // SetForwarding implements stack.ForwardingNetworkEndpoint.
   202  func (e *endpoint) SetForwarding(forwarding bool) {
   203  	e.mu.Lock()
   204  	defer e.mu.Unlock()
   205  
   206  	if !e.setForwarding(forwarding) {
   207  		return
   208  	}
   209  
   210  	if forwarding {
   211  		// There does not seem to be an RFC requirement for a node to join the all
   212  		// routers multicast address but
   213  		// https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml
   214  		// specifies the address as a group for all routers on a subnet so we join
   215  		// the group here.
   216  		if err := e.joinGroupLocked(header.IPv4AllRoutersGroup); err != nil {
   217  			// joinGroupLocked only returns an error if the group address is not a
   218  			// valid IPv4 multicast address.
   219  			panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err))
   220  		}
   221  
   222  		return
   223  	}
   224  
   225  	switch err := e.leaveGroupLocked(header.IPv4AllRoutersGroup).(type) {
   226  	case nil:
   227  	case *tcpip.ErrBadLocalAddress:
   228  		// The endpoint may have already left the multicast group.
   229  	default:
   230  		panic(fmt.Sprintf("e.leaveGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err))
   231  	}
   232  }
   233  
   234  // Enable implements stack.NetworkEndpoint.
   235  func (e *endpoint) Enable() tcpip.Error {
   236  	e.mu.Lock()
   237  	defer e.mu.Unlock()
   238  
   239  	// If the NIC is not enabled, the endpoint can't do anything meaningful so
   240  	// don't enable the endpoint.
   241  	if !e.nic.Enabled() {
   242  		return &tcpip.ErrNotPermitted{}
   243  	}
   244  
   245  	// If the endpoint is already enabled, there is nothing for it to do.
   246  	if !e.setEnabled(true) {
   247  		return nil
   248  	}
   249  
   250  	// Create an endpoint to receive broadcast packets on this interface.
   251  	ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.AddressProperties{PEB: stack.NeverPrimaryEndpoint})
   252  	if err != nil {
   253  		return err
   254  	}
   255  	// We have no need for the address endpoint.
   256  	ep.DecRef()
   257  
   258  	// Groups may have been joined while the endpoint was disabled, or the
   259  	// endpoint may have left groups from the perspective of IGMP when the
   260  	// endpoint was disabled. Either way, we need to let routers know to
   261  	// send us multicast traffic.
   262  	e.mu.igmp.initializeAll()
   263  
   264  	// As per RFC 1122 section 3.3.7, all hosts should join the all-hosts
   265  	// multicast group. Note, the IANA calls the all-hosts multicast group the
   266  	// all-systems multicast group.
   267  	if err := e.joinGroupLocked(header.IPv4AllSystems); err != nil {
   268  		// joinGroupLocked only returns an error if the group address is not a valid
   269  		// IPv4 multicast address.
   270  		panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, err))
   271  	}
   272  
   273  	return nil
   274  }
   275  
   276  // Enabled implements stack.NetworkEndpoint.
   277  func (e *endpoint) Enabled() bool {
   278  	return e.nic.Enabled() && e.isEnabled()
   279  }
   280  
   281  // isEnabled returns true if the endpoint is enabled, regardless of the
   282  // enabled status of the NIC.
   283  func (e *endpoint) isEnabled() bool {
   284  	return atomic.LoadUint32(&e.enabled) == 1
   285  }
   286  
   287  // setEnabled sets the enabled status for the endpoint.
   288  //
   289  // Returns true if the enabled status was updated.
   290  func (e *endpoint) setEnabled(v bool) bool {
   291  	if v {
   292  		return atomic.SwapUint32(&e.enabled, 1) == 0
   293  	}
   294  	return atomic.SwapUint32(&e.enabled, 0) == 1
   295  }
   296  
   297  // Disable implements stack.NetworkEndpoint.
   298  func (e *endpoint) Disable() {
   299  	e.mu.Lock()
   300  	defer e.mu.Unlock()
   301  	e.disableLocked()
   302  }
   303  
   304  func (e *endpoint) disableLocked() {
   305  	if !e.isEnabled() {
   306  		return
   307  	}
   308  
   309  	// The endpoint may have already left the multicast group.
   310  	switch err := e.leaveGroupLocked(header.IPv4AllSystems).(type) {
   311  	case nil, *tcpip.ErrBadLocalAddress:
   312  	default:
   313  		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err))
   314  	}
   315  
   316  	// Leave groups from the perspective of IGMP so that routers know that
   317  	// we are no longer interested in the group.
   318  	e.mu.igmp.softLeaveAll()
   319  
   320  	// The address may have already been removed.
   321  	switch err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err.(type) {
   322  	case nil, *tcpip.ErrBadLocalAddress:
   323  	default:
   324  		panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err))
   325  	}
   326  
   327  	// Reset the IGMP V1 present flag.
   328  	//
   329  	// If the node comes back up on the same network, it will re-learn that it
   330  	// needs to perform IGMPv1.
   331  	e.mu.igmp.resetV1Present()
   332  
   333  	if !e.setEnabled(false) {
   334  		panic("should have only done work to disable the endpoint if it was enabled")
   335  	}
   336  }
   337  
   338  // DefaultTTL is the default time-to-live value for this endpoint.
   339  func (e *endpoint) DefaultTTL() uint8 {
   340  	return e.protocol.DefaultTTL()
   341  }
   342  
   343  // MTU implements stack.NetworkEndpoint. It returns the link-layer MTU minus the
   344  // network layer max header length.
   345  func (e *endpoint) MTU() uint32 {
   346  	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv4MinimumSize)
   347  	if err != nil {
   348  		return 0
   349  	}
   350  	return networkMTU
   351  }
   352  
   353  // MaxHeaderLength returns the maximum length needed by ipv4 headers (and
   354  // underlying protocols).
   355  func (e *endpoint) MaxHeaderLength() uint16 {
   356  	return e.nic.MaxHeaderLength() + header.IPv4MaximumHeaderSize
   357  }
   358  
   359  // NetworkProtocolNumber implements stack.NetworkEndpoint.
   360  func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
   361  	return e.protocol.Number()
   362  }
   363  
   364  func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) tcpip.Error {
   365  	hdrLen := header.IPv4MinimumSize
   366  	var optLen int
   367  	if options != nil {
   368  		optLen = int(options.Length())
   369  	}
   370  	hdrLen += optLen
   371  	if hdrLen > header.IPv4MaximumHeaderSize {
   372  		return &tcpip.ErrMessageTooLong{}
   373  	}
   374  	ipH := header.IPv4(pkt.NetworkHeader().Push(hdrLen))
   375  	length := pkt.Size()
   376  	if length > math.MaxUint16 {
   377  		return &tcpip.ErrMessageTooLong{}
   378  	}
   379  	// RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic
   380  	// datagrams. Since the DF bit is never being set here, all datagrams
   381  	// are non-atomic and need an ID.
   382  	id := atomic.AddUint32(&e.protocol.ids[hashRoute(srcAddr, dstAddr, params.Protocol, e.protocol.hashIV)%buckets], 1)
   383  	ipH.Encode(&header.IPv4Fields{
   384  		TotalLength: uint16(length),
   385  		ID:          uint16(id),
   386  		TTL:         params.TTL,
   387  		TOS:         params.TOS,
   388  		Protocol:    uint8(params.Protocol),
   389  		SrcAddr:     srcAddr,
   390  		DstAddr:     dstAddr,
   391  		Options:     options,
   392  	})
   393  	ipH.SetChecksum(^ipH.CalculateChecksum())
   394  	pkt.NetworkProtocolNumber = ProtocolNumber
   395  	return nil
   396  }
   397  
   398  // handleFragments fragments pkt and calls the handler function on each
   399  // fragment. It returns the number of fragments handled and the number of
   400  // fragments left to be processed. The IP header must already be present in the
   401  // original packet.
   402  func (e *endpoint) handleFragments(_ *stack.Route, networkMTU uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) {
   403  	// Round the MTU down to align to 8 bytes.
   404  	fragmentPayloadSize := networkMTU &^ 7
   405  	networkHeader := header.IPv4(pkt.NetworkHeader().View())
   406  	pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadSize, pkt.AvailableHeaderBytes()+len(networkHeader))
   407  
   408  	var n int
   409  	for {
   410  		fragPkt, more := buildNextFragment(&pf, networkHeader)
   411  		if err := handler(fragPkt); err != nil {
   412  			return n, pf.RemainingFragmentCount() + 1, err
   413  		}
   414  		n++
   415  		if !more {
   416  			return n, pf.RemainingFragmentCount(), nil
   417  		}
   418  	}
   419  }
   420  
   421  // WritePacket writes a packet to the given destination address and protocol.
   422  func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error {
   423  	if err := e.addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* options */); err != nil {
   424  		return err
   425  	}
   426  
   427  	// iptables filtering. All packets that reach here are locally
   428  	// generated.
   429  	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
   430  	if ok := e.protocol.stack.IPTables().CheckOutput(pkt, r, outNicName); !ok {
   431  		// iptables is telling us to drop the packet.
   432  		e.stats.ip.IPTablesOutputDropped.Increment()
   433  		return nil
   434  	}
   435  
   436  	// If the packet is manipulated as per NAT Output rules, handle packet
   437  	// based on destination address and do not send the packet to link
   438  	// layer.
   439  	//
   440  	// We should do this for every packet, rather than only NATted packets, but
   441  	// removing this check short circuits broadcasts before they are sent out to
   442  	// other hosts.
   443  	if pkt.DNATDone {
   444  		netHeader := header.IPv4(pkt.NetworkHeader().View())
   445  		if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil {
   446  			// Since we rewrote the packet but it is being routed back to us, we
   447  			// can safely assume the checksum is valid.
   448  			ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
   449  			return nil
   450  		}
   451  	}
   452  
   453  	return e.writePacket(r, pkt, false /* headerIncluded */)
   454  }
   455  
   456  func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error {
   457  	if r.Loop()&stack.PacketLoop != 0 {
   458  		// If the packet was generated by the stack (not a raw/packet endpoint
   459  		// where a packet may be written with the header included), then we can
   460  		// safely assume the checksum is valid.
   461  		e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */)
   462  	}
   463  	if r.Loop()&stack.PacketOut == 0 {
   464  		return nil
   465  	}
   466  
   467  	// Postrouting NAT can only change the source address, and does not alter the
   468  	// route or outgoing interface of the packet.
   469  	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
   470  	if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, e, outNicName); !ok {
   471  		// iptables is telling us to drop the packet.
   472  		e.stats.ip.IPTablesPostroutingDropped.Increment()
   473  		return nil
   474  	}
   475  
   476  	stats := e.stats.ip
   477  
   478  	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
   479  	if err != nil {
   480  		stats.OutgoingPacketErrors.Increment()
   481  		return err
   482  	}
   483  
   484  	if packetMustBeFragmented(pkt, networkMTU) {
   485  		h := header.IPv4(pkt.NetworkHeader().View())
   486  		if h.Flags()&header.IPv4FlagDontFragment != 0 && pkt.NetworkPacketInfo.IsForwardedPacket {
   487  			// TODO(gvisor.dev/issue/5919): Handle error condition in which DontFragment
   488  			// is set but the packet must be fragmented for the non-forwarding case.
   489  			return &tcpip.ErrMessageTooLong{}
   490  		}
   491  		sent, remain, err := e.handleFragments(r, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error {
   492  			// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
   493  			// fragment one by one using WritePacket() (current strategy) or if we
   494  			// want to create a PacketBufferList from the fragments and feed it to
   495  			// WritePackets(). It'll be faster but cost more memory.
   496  			return e.nic.WritePacket(r, ProtocolNumber, fragPkt)
   497  		})
   498  		stats.PacketsSent.IncrementBy(uint64(sent))
   499  		stats.OutgoingPacketErrors.IncrementBy(uint64(remain))
   500  		return err
   501  	}
   502  
   503  	if err := e.nic.WritePacket(r, ProtocolNumber, pkt); err != nil {
   504  		stats.OutgoingPacketErrors.Increment()
   505  		return err
   506  	}
   507  	stats.PacketsSent.Increment()
   508  	return nil
   509  }
   510  
   511  // WritePackets implements stack.NetworkEndpoint.
   512  func (e *endpoint) WritePackets(r *stack.Route, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, tcpip.Error) {
   513  	if r.Loop()&stack.PacketLoop != 0 {
   514  		panic("multiple packets in local loop")
   515  	}
   516  	if r.Loop()&stack.PacketOut == 0 {
   517  		return pkts.Len(), nil
   518  	}
   519  
   520  	stats := e.stats.ip
   521  
   522  	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
   523  		if err := e.addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* options */); err != nil {
   524  			return 0, err
   525  		}
   526  
   527  		networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
   528  		if err != nil {
   529  			stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
   530  			return 0, err
   531  		}
   532  
   533  		if packetMustBeFragmented(pkt, networkMTU) {
   534  			// Keep track of the packet that is about to be fragmented so it can be
   535  			// removed once the fragmentation is done.
   536  			originalPkt := pkt
   537  			if _, _, err := e.handleFragments(r, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error {
   538  				fragPkt.IncRef()
   539  				// Modify the packet list in place with the new fragments.
   540  				pkts.InsertAfter(pkt, fragPkt)
   541  				pkt = fragPkt
   542  				return nil
   543  			}); err != nil {
   544  				panic(fmt.Sprintf("e.handleFragments(_, _, %d, _, _) = %s", networkMTU, err))
   545  			}
   546  			// Remove the packet that was just fragmented and process the rest.
   547  			pkts.Remove(originalPkt)
   548  		}
   549  	}
   550  
   551  	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
   552  	// iptables filtering. All packets that reach here are locally
   553  	// generated.
   554  	outputDropped, natPkts := e.protocol.stack.IPTables().CheckOutputPackets(pkts, r, outNicName)
   555  	stats.IPTablesOutputDropped.IncrementBy(uint64(len(outputDropped)))
   556  	for pkt := range outputDropped {
   557  		pkts.Remove(pkt)
   558  	}
   559  
   560  	// The NAT-ed packets may now be destined for us.
   561  	locallyDelivered := 0
   562  	for pkt := range natPkts {
   563  		ep := e.protocol.findEndpointWithAddress(header.IPv4(pkt.NetworkHeader().View()).DestinationAddress())
   564  		if ep == nil {
   565  			// The NAT-ed packet is still destined for some remote node.
   566  			continue
   567  		}
   568  
   569  		// Do not send the locally destined packet out the NIC.
   570  		pkts.Remove(pkt)
   571  
   572  		// Deliver the packet locally.
   573  		ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
   574  		locallyDelivered++
   575  
   576  	}
   577  
   578  	// We ignore the list of NAT-ed packets here because Postrouting NAT can only
   579  	// change the source address, and does not alter the route or outgoing
   580  	// interface of the packet.
   581  	postroutingDropped, _ := e.protocol.stack.IPTables().CheckPostroutingPackets(pkts, r, e, outNicName)
   582  	stats.IPTablesPostroutingDropped.IncrementBy(uint64(len(postroutingDropped)))
   583  	for pkt := range postroutingDropped {
   584  		pkts.Remove(pkt)
   585  	}
   586  
   587  	// The rest of the packets can be delivered to the NIC as a batch.
   588  	pktsLen := pkts.Len()
   589  	written, err := e.nic.WritePackets(r, pkts, ProtocolNumber)
   590  	stats.PacketsSent.IncrementBy(uint64(written))
   591  	stats.OutgoingPacketErrors.IncrementBy(uint64(pktsLen - written))
   592  
   593  	// Dropped packets aren't errors, so include them in the return value.
   594  	return locallyDelivered + written + len(outputDropped) + len(postroutingDropped), err
   595  }
   596  
   597  // WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
   598  func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error {
   599  	// The packet already has an IP header, but there are a few required
   600  	// checks.
   601  	h, ok := pkt.Data().PullUp(header.IPv4MinimumSize)
   602  	if !ok {
   603  		return &tcpip.ErrMalformedHeader{}
   604  	}
   605  
   606  	hdrLen := header.IPv4(h).HeaderLength()
   607  	if hdrLen < header.IPv4MinimumSize {
   608  		return &tcpip.ErrMalformedHeader{}
   609  	}
   610  
   611  	h, ok = pkt.Data().PullUp(int(hdrLen))
   612  	if !ok {
   613  		return &tcpip.ErrMalformedHeader{}
   614  	}
   615  	ipH := header.IPv4(h)
   616  
   617  	// Always set the total length.
   618  	pktSize := pkt.Data().Size()
   619  	ipH.SetTotalLength(uint16(pktSize))
   620  
   621  	// Set the source address when zero.
   622  	if ipH.SourceAddress() == header.IPv4Any {
   623  		ipH.SetSourceAddress(r.LocalAddress())
   624  	}
   625  
   626  	// Set the packet ID when zero.
   627  	if ipH.ID() == 0 {
   628  		// RFC 6864 section 4.3 mandates uniqueness of ID values for
   629  		// non-atomic datagrams, so assign an ID to all such datagrams
   630  		// according to the definition given in RFC 6864 section 4.
   631  		if ipH.Flags()&header.IPv4FlagDontFragment == 0 || ipH.Flags()&header.IPv4FlagMoreFragments != 0 || ipH.FragmentOffset() > 0 {
   632  			ipH.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r.LocalAddress(), r.RemoteAddress(), 0 /* protocol */, e.protocol.hashIV)%buckets], 1)))
   633  		}
   634  	}
   635  
   636  	// Always set the checksum.
   637  	ipH.SetChecksum(0)
   638  	ipH.SetChecksum(^ipH.CalculateChecksum())
   639  
   640  	// Populate the packet buffer's network header and don't allow an invalid
   641  	// packet to be sent.
   642  	//
   643  	// Note that parsing only makes sure that the packet is well formed as per the
   644  	// wire format. We also want to check if the header's fields are valid before
   645  	// sending the packet.
   646  	if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) {
   647  		return &tcpip.ErrMalformedHeader{}
   648  	}
   649  
   650  	return e.writePacket(r, pkt, true /* headerIncluded */)
   651  }
   652  
   653  // forwardPacket attempts to forward a packet to its final destination.
   654  func (e *endpoint) forwardPacket(pkt *stack.PacketBuffer) ip.ForwardingError {
   655  	h := header.IPv4(pkt.NetworkHeader().View())
   656  
   657  	dstAddr := h.DestinationAddress()
   658  	// As per RFC 3927 section 7,
   659  	//
   660  	//   A router MUST NOT forward a packet with an IPv4 Link-Local source or
   661  	//   destination address, irrespective of the router's default route
   662  	//   configuration or routes obtained from dynamic routing protocols.
   663  	//
   664  	//   A router which receives a packet with an IPv4 Link-Local source or
   665  	//   destination address MUST NOT forward the packet.  This prevents
   666  	//   forwarding of packets back onto the network segment from which they
   667  	//   originated, or to any other segment.
   668  	if header.IsV4LinkLocalUnicastAddress(h.SourceAddress()) {
   669  		return &ip.ErrLinkLocalSourceAddress{}
   670  	}
   671  	if header.IsV4LinkLocalUnicastAddress(dstAddr) || header.IsV4LinkLocalMulticastAddress(dstAddr) {
   672  		return &ip.ErrLinkLocalDestinationAddress{}
   673  	}
   674  
   675  	ttl := h.TTL()
   676  	if ttl == 0 {
   677  		// As per RFC 792 page 6, Time Exceeded Message,
   678  		//
   679  		//  If the gateway processing a datagram finds the time to live field
   680  		//  is zero it must discard the datagram.  The gateway may also notify
   681  		//  the source host via the time exceeded message.
   682  		//
   683  		// We return the original error rather than the result of returning
   684  		// the ICMP packet because the original error is more relevant to
   685  		// the caller.
   686  		_ = e.protocol.returnError(&icmpReasonTTLExceeded{}, pkt)
   687  		return &ip.ErrTTLExceeded{}
   688  	}
   689  
   690  	if opts := h.Options(); len(opts) != 0 {
   691  		newOpts, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageForward{})
   692  		if optProblem != nil {
   693  			if optProblem.NeedICMP {
   694  				_ = e.protocol.returnError(&icmpReasonParamProblem{
   695  					pointer:    optProblem.Pointer,
   696  					forwarding: true,
   697  				}, pkt)
   698  			}
   699  			return &ip.ErrParameterProblem{}
   700  		}
   701  		copied := copy(opts, newOpts)
   702  		if copied != len(newOpts) {
   703  			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
   704  		}
   705  		// Since in forwarding we handle all options, including copying those we
   706  		// do not recognise, the options region should remain the same size which
   707  		// simplifies processing. As we MAY receive a packet with a lot of padded
   708  		// bytes after the "end of options list" byte, make sure we copy
   709  		// them as the legal padding value (0).
   710  		for i := copied; i < len(opts); i++ {
   711  			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
   712  			opts[i] = byte(header.IPv4OptionListEndType)
   713  		}
   714  	}
   715  
   716  	stk := e.protocol.stack
   717  
   718  	// Check if the destination is owned by the stack.
   719  	if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil {
   720  		inNicName := stk.FindNICNameFromID(e.nic.ID())
   721  		outNicName := stk.FindNICNameFromID(ep.nic.ID())
   722  		if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok {
   723  			// iptables is telling us to drop the packet.
   724  			e.stats.ip.IPTablesForwardDropped.Increment()
   725  			return nil
   726  		}
   727  
   728  		// The packet originally arrived on e so provide its NIC as the input NIC.
   729  		ep.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */)
   730  		return nil
   731  	}
   732  
   733  	r, err := stk.FindRoute(0, "", dstAddr, ProtocolNumber, false /* multicastLoop */)
   734  	switch err.(type) {
   735  	case nil:
   736  	case *tcpip.ErrNoRoute, *tcpip.ErrNetworkUnreachable:
   737  		// We return the original error rather than the result of returning
   738  		// the ICMP packet because the original error is more relevant to
   739  		// the caller.
   740  		_ = e.protocol.returnError(&icmpReasonNetworkUnreachable{}, pkt)
   741  		return &ip.ErrNoRoute{}
   742  	default:
   743  		return &ip.ErrOther{Err: err}
   744  	}
   745  	defer r.Release()
   746  
   747  	inNicName := stk.FindNICNameFromID(e.nic.ID())
   748  	outNicName := stk.FindNICNameFromID(r.NICID())
   749  	if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok {
   750  		// iptables is telling us to drop the packet.
   751  		e.stats.ip.IPTablesForwardDropped.Increment()
   752  		return nil
   753  	}
   754  
   755  	// We need to do a deep copy of the IP packet because
   756  	// WriteHeaderIncludedPacket may modify the packet buffer, but we do
   757  	// not own it.
   758  	newPkt := pkt.DeepCopyForForwarding(int(r.MaxHeaderLength()))
   759  	newHdr := header.IPv4(newPkt.NetworkHeader().View())
   760  	defer newPkt.DecRef()
   761  
   762  	// As per RFC 791 page 30, Time to Live,
   763  	//
   764  	//   This field must be decreased at each point that the internet header
   765  	//   is processed to reflect the time spent processing the datagram.
   766  	//   Even if no local information is available on the time actually
   767  	//   spent, the field must be decremented by 1.
   768  	newHdr.SetTTL(ttl - 1)
   769  	// We perform a full checksum as we may have updated options above. The IP
   770  	// header is relatively small so this is not expected to be an expensive
   771  	// operation.
   772  	newHdr.SetChecksum(0)
   773  	newHdr.SetChecksum(^newHdr.CalculateChecksum())
   774  
   775  	forwardToEp, ok := e.protocol.getEndpointForNIC(r.NICID())
   776  	if !ok {
   777  		// The interface was removed after we obtained the route.
   778  		return &ip.ErrOther{Err: &tcpip.ErrUnknownDevice{}}
   779  	}
   780  
   781  	switch err := forwardToEp.writePacket(r, newPkt, true /* headerIncluded */); err.(type) {
   782  	case nil:
   783  		return nil
   784  	case *tcpip.ErrMessageTooLong:
   785  		// As per RFC 792, page 4, Destination Unreachable:
   786  		//
   787  		//   Another case is when a datagram must be fragmented to be forwarded by a
   788  		//   gateway yet the Don't Fragment flag is on. In this case the gateway must
   789  		//   discard the datagram and may return a destination unreachable message.
   790  		//
   791  		// WriteHeaderIncludedPacket checks for the presence of the Don't Fragment bit
   792  		// while sending the packet and returns this error iff fragmentation is
   793  		// necessary and the bit is also set.
   794  		_ = e.protocol.returnError(&icmpReasonFragmentationNeeded{}, pkt)
   795  		return &ip.ErrMessageTooLong{}
   796  	default:
   797  		return &ip.ErrOther{Err: err}
   798  	}
   799  }
   800  
   801  // HandlePacket is called by the link layer when new ipv4 packets arrive for
   802  // this endpoint.
   803  func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) {
   804  	stats := e.stats.ip
   805  
   806  	stats.PacketsReceived.Increment()
   807  
   808  	if !e.isEnabled() {
   809  		stats.DisabledPacketsReceived.Increment()
   810  		return
   811  	}
   812  
   813  	h, ok := e.protocol.parseAndValidate(pkt)
   814  	if !ok {
   815  		stats.MalformedPacketsReceived.Increment()
   816  		return
   817  	}
   818  
   819  	if !e.nic.IsLoopback() {
   820  		if !e.protocol.options.AllowExternalLoopbackTraffic {
   821  			if header.IsV4LoopbackAddress(h.SourceAddress()) {
   822  				stats.InvalidSourceAddressesReceived.Increment()
   823  				return
   824  			}
   825  
   826  			if header.IsV4LoopbackAddress(h.DestinationAddress()) {
   827  				stats.InvalidDestinationAddressesReceived.Increment()
   828  				return
   829  			}
   830  		}
   831  
   832  		if e.protocol.stack.HandleLocal() {
   833  			addressEndpoint := e.AcquireAssignedAddress(header.IPv4(pkt.NetworkHeader().View()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint)
   834  			if addressEndpoint != nil {
   835  				addressEndpoint.DecRef()
   836  
   837  				// The source address is one of our own, so we never should have gotten
   838  				// a packet like this unless HandleLocal is false or our NIC is the
   839  				// loopback interface.
   840  				stats.InvalidSourceAddressesReceived.Increment()
   841  				return
   842  			}
   843  		}
   844  
   845  		// Loopback traffic skips the prerouting chain.
   846  		inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
   847  		if ok := e.protocol.stack.IPTables().CheckPrerouting(pkt, e, inNicName); !ok {
   848  			// iptables is telling us to drop the packet.
   849  			stats.IPTablesPreroutingDropped.Increment()
   850  			return
   851  		}
   852  	}
   853  
   854  	e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */)
   855  }
   856  
   857  // handleLocalPacket is like HandlePacket except it does not perform the
   858  // prerouting iptables hook or check for loopback traffic that originated from
   859  // outside of the netstack (i.e. martian loopback packets).
   860  func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) {
   861  	stats := e.stats.ip
   862  	stats.PacketsReceived.Increment()
   863  
   864  	pkt = pkt.CloneToInbound()
   865  	defer pkt.DecRef()
   866  	pkt.RXTransportChecksumValidated = canSkipRXChecksum
   867  
   868  	h, ok := e.protocol.parseAndValidate(pkt)
   869  	if !ok {
   870  		stats.MalformedPacketsReceived.Increment()
   871  		return
   872  	}
   873  
   874  	e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */)
   875  }
   876  
   877  func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer, inNICName string) {
   878  	pkt.NICID = e.nic.ID()
   879  
   880  	// Raw socket packets are delivered based solely on the transport protocol
   881  	// number. We only require that the packet be valid IPv4, and that they not
   882  	// be fragmented.
   883  	if !h.More() && h.FragmentOffset() == 0 {
   884  		e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt)
   885  	}
   886  
   887  	stats := e.stats
   888  	stats.ip.ValidPacketsReceived.Increment()
   889  
   890  	srcAddr := h.SourceAddress()
   891  	dstAddr := h.DestinationAddress()
   892  
   893  	// As per RFC 1122 section 3.2.1.3:
   894  	//   When a host sends any datagram, the IP source address MUST
   895  	//   be one of its own IP addresses (but not a broadcast or
   896  	//   multicast address).
   897  	if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) {
   898  		stats.ip.InvalidSourceAddressesReceived.Increment()
   899  		return
   900  	}
   901  	// Make sure the source address is not a subnet-local broadcast address.
   902  	if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
   903  		subnet := addressEndpoint.Subnet()
   904  		addressEndpoint.DecRef()
   905  		if subnet.IsBroadcast(srcAddr) {
   906  			stats.ip.InvalidSourceAddressesReceived.Increment()
   907  			return
   908  		}
   909  	}
   910  
   911  	// Before we do any processing, note if the packet was received as some
   912  	// sort of broadcast. The destination address should be an address we own
   913  	// or a group we joined.
   914  	if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil {
   915  		subnet := addressEndpoint.AddressWithPrefix().Subnet()
   916  		addressEndpoint.DecRef()
   917  		pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast
   918  	} else if !e.IsInGroup(dstAddr) {
   919  		if !e.Forwarding() {
   920  			stats.ip.InvalidDestinationAddressesReceived.Increment()
   921  			return
   922  		}
   923  		switch err := e.forwardPacket(pkt); err.(type) {
   924  		case nil:
   925  			return
   926  		case *ip.ErrLinkLocalSourceAddress:
   927  			stats.ip.Forwarding.LinkLocalSource.Increment()
   928  		case *ip.ErrLinkLocalDestinationAddress:
   929  			stats.ip.Forwarding.LinkLocalDestination.Increment()
   930  		case *ip.ErrTTLExceeded:
   931  			stats.ip.Forwarding.ExhaustedTTL.Increment()
   932  		case *ip.ErrNoRoute:
   933  			stats.ip.Forwarding.Unrouteable.Increment()
   934  		case *ip.ErrParameterProblem:
   935  			stats.ip.MalformedPacketsReceived.Increment()
   936  		case *ip.ErrMessageTooLong:
   937  			stats.ip.Forwarding.PacketTooBig.Increment()
   938  		default:
   939  			panic(fmt.Sprintf("unexpected error %s while trying to forward packet: %#v", err, pkt))
   940  		}
   941  		stats.ip.Forwarding.Errors.Increment()
   942  		return
   943  	}
   944  
   945  	// iptables filtering. All packets that reach here are intended for
   946  	// this machine and will not be forwarded.
   947  	if ok := e.protocol.stack.IPTables().CheckInput(pkt, inNICName); !ok {
   948  		// iptables is telling us to drop the packet.
   949  		stats.ip.IPTablesInputDropped.Increment()
   950  		return
   951  	}
   952  
   953  	if h.More() || h.FragmentOffset() != 0 {
   954  		if pkt.Data().Size()+pkt.TransportHeader().View().Size() == 0 {
   955  			// Drop the packet as it's marked as a fragment but has
   956  			// no payload.
   957  			stats.ip.MalformedPacketsReceived.Increment()
   958  			stats.ip.MalformedFragmentsReceived.Increment()
   959  			return
   960  		}
   961  		if opts := h.Options(); len(opts) != 0 {
   962  			// If there are options we need to check them before we do assembly
   963  			// or we could be assembling errant packets. However we do not change the
   964  			// options as that could lead to double processing later.
   965  			if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil {
   966  				if optProblem.NeedICMP {
   967  					_ = e.protocol.returnError(&icmpReasonParamProblem{
   968  						pointer: optProblem.Pointer,
   969  					}, pkt)
   970  					e.stats.ip.MalformedPacketsReceived.Increment()
   971  				}
   972  				return
   973  			}
   974  		}
   975  		// The packet is a fragment, let's try to reassemble it.
   976  		start := h.FragmentOffset()
   977  		// Drop the fragment if the size of the reassembled payload would exceed the
   978  		// maximum payload size.
   979  		//
   980  		// Note that this addition doesn't overflow even on 32bit architecture
   981  		// because pkt.Data().Size() should not exceed 65535 (the max IP datagram
   982  		// size). Otherwise the packet would've been rejected as invalid before
   983  		// reaching here.
   984  		if int(start)+pkt.Data().Size() > header.IPv4MaximumPayloadSize {
   985  			stats.ip.MalformedPacketsReceived.Increment()
   986  			stats.ip.MalformedFragmentsReceived.Increment()
   987  			return
   988  		}
   989  
   990  		proto := h.Protocol()
   991  		resPkt, transProtoNum, ready, err := e.protocol.fragmentation.Process(
   992  			// As per RFC 791 section 2.3, the identification value is unique
   993  			// for a source-destination pair and protocol.
   994  			fragmentation.FragmentID{
   995  				Source:      h.SourceAddress(),
   996  				Destination: h.DestinationAddress(),
   997  				ID:          uint32(h.ID()),
   998  				Protocol:    proto,
   999  			},
  1000  			start,
  1001  			start+uint16(pkt.Data().Size())-1,
  1002  			h.More(),
  1003  			proto,
  1004  			pkt,
  1005  		)
  1006  		if err != nil {
  1007  			stats.ip.MalformedPacketsReceived.Increment()
  1008  			stats.ip.MalformedFragmentsReceived.Increment()
  1009  			return
  1010  		}
  1011  		if !ready {
  1012  			return
  1013  		}
  1014  		pkt = resPkt
  1015  		h = header.IPv4(pkt.NetworkHeader().View())
  1016  
  1017  		// The reassembler doesn't take care of fixing up the header, so we need
  1018  		// to do it here.
  1019  		h.SetTotalLength(uint16(pkt.Data().Size() + len(h)))
  1020  		h.SetFlagsFragmentOffset(0, 0)
  1021  
  1022  		e.protocol.parseTransport(pkt, tcpip.TransportProtocolNumber(transProtoNum))
  1023  
  1024  		// Now that the packet is reassembled, it can be sent to raw sockets.
  1025  		e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt)
  1026  	}
  1027  	stats.ip.PacketsDelivered.Increment()
  1028  
  1029  	p := h.TransportProtocol()
  1030  	if p == header.ICMPv4ProtocolNumber {
  1031  		// TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport
  1032  		// headers, the setting of the transport number here should be
  1033  		// unnecessary and removed.
  1034  		pkt.TransportProtocolNumber = p
  1035  		e.handleICMP(pkt)
  1036  		return
  1037  	}
  1038  	// ICMP handles options itself but do it here for all remaining destinations.
  1039  	var hasRouterAlertOption bool
  1040  	if opts := h.Options(); len(opts) != 0 {
  1041  		newOpts, processedOpts, optProblem := e.processIPOptions(pkt, opts, &optionUsageReceive{})
  1042  		if optProblem != nil {
  1043  			if optProblem.NeedICMP {
  1044  				_ = e.protocol.returnError(&icmpReasonParamProblem{
  1045  					pointer: optProblem.Pointer,
  1046  				}, pkt)
  1047  				stats.ip.MalformedPacketsReceived.Increment()
  1048  			}
  1049  			return
  1050  		}
  1051  		hasRouterAlertOption = processedOpts.routerAlert
  1052  		copied := copy(opts, newOpts)
  1053  		if copied != len(newOpts) {
  1054  			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
  1055  		}
  1056  		for i := copied; i < len(opts); i++ {
  1057  			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
  1058  			opts[i] = byte(header.IPv4OptionListEndType)
  1059  		}
  1060  	}
  1061  	if p == header.IGMPProtocolNumber {
  1062  		e.mu.Lock()
  1063  		e.mu.igmp.handleIGMP(pkt, hasRouterAlertOption)
  1064  		e.mu.Unlock()
  1065  		return
  1066  	}
  1067  
  1068  	switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res {
  1069  	case stack.TransportPacketHandled:
  1070  	case stack.TransportPacketDestinationPortUnreachable:
  1071  		// As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination
  1072  		//   Unreachable messages with code:
  1073  		//     3 (Port Unreachable), when the designated transport protocol
  1074  		//     (e.g., UDP) is unable to demultiplex the datagram but has no
  1075  		//     protocol mechanism to inform the sender.
  1076  		_ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt)
  1077  	case stack.TransportPacketProtocolUnreachable:
  1078  		// As per RFC: 1122 Section 3.2.2.1
  1079  		//   A host SHOULD generate Destination Unreachable messages with code:
  1080  		//     2 (Protocol Unreachable), when the designated transport protocol
  1081  		//     is not supported
  1082  		_ = e.protocol.returnError(&icmpReasonProtoUnreachable{}, pkt)
  1083  	default:
  1084  		panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
  1085  	}
  1086  }
  1087  
// Close cleans up resources associated with the endpoint.
//
// With e.mu held for writing, the endpoint is disabled and its addressable
// endpoint state is cleaned up. After the lock is released, the protocol is
// asked to forget the endpoint registered for this NIC's ID.
func (e *endpoint) Close() {
	e.mu.Lock()
	e.disableLocked()
	e.mu.addressableEndpointState.Cleanup()
	e.mu.Unlock()

	e.protocol.forgetEndpoint(e.nic.ID())
}
  1097  
  1098  // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
  1099  func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) {
  1100  	e.mu.RLock()
  1101  	defer e.mu.RUnlock()
  1102  
  1103  	ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, properties)
  1104  	if err == nil {
  1105  		e.mu.igmp.sendQueuedReports()
  1106  	}
  1107  	return ep, err
  1108  }
  1109  
// RemovePermanentAddress implements stack.AddressableEndpoint.
//
// It delegates to the addressable endpoint state while holding e.mu for
// reading.
func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.mu.addressableEndpointState.RemovePermanentAddress(addr)
}
  1116  
// MainAddress implements stack.AddressableEndpoint.
//
// It returns the main address reported by the addressable endpoint state,
// read while holding e.mu for reading.
func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.mu.addressableEndpointState.MainAddress()
}
  1123  
  1124  // AcquireAssignedAddress implements stack.AddressableEndpoint.
  1125  func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
  1126  	e.mu.RLock()
  1127  	defer e.mu.RUnlock()
  1128  
  1129  	loopback := e.nic.IsLoopback()
  1130  	return e.mu.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, func(addressEndpoint stack.AddressEndpoint) bool {
  1131  		subnet := addressEndpoint.Subnet()
  1132  		// IPv4 has a notion of a subnet broadcast address and considers the
  1133  		// loopback interface bound to an address's whole subnet (on linux).
  1134  		return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr))
  1135  	}, allowTemp, tempPEB)
  1136  }
  1137  
// AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint.
//
// It takes e.mu for reading and defers to
// acquireOutgoingPrimaryAddressRLocked.
func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired)
}
  1144  
// acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress
// but with locking requirements: it does not take the lock itself.
//
// Precondition: e.mu must be read locked (AcquireOutgoingPrimaryAddress takes
// it before calling here).
func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
	return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired)
}
  1152  
// PrimaryAddresses implements stack.AddressableEndpoint.
//
// It returns the primary addresses reported by the addressable endpoint
// state, read while holding e.mu for reading.
func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.mu.addressableEndpointState.PrimaryAddresses()
}
  1159  
// PermanentAddresses implements stack.AddressableEndpoint.
//
// It returns the permanent addresses reported by the addressable endpoint
// state, read while holding e.mu for reading.
func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.mu.addressableEndpointState.PermanentAddresses()
}
  1166  
// JoinGroup implements stack.GroupAddressableEndpoint.
//
// It takes e.mu for writing and defers to joinGroupLocked.
func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()
	return e.joinGroupLocked(addr)
}
  1173  
  1174  // joinGroupLocked is like JoinGroup but with locking requirements.
  1175  //
  1176  // Precondition: e.mu must be locked.
  1177  func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error {
  1178  	if !header.IsV4MulticastAddress(addr) {
  1179  		return &tcpip.ErrBadAddress{}
  1180  	}
  1181  
  1182  	e.mu.igmp.joinGroup(addr)
  1183  	return nil
  1184  }
  1185  
// LeaveGroup implements stack.GroupAddressableEndpoint.
//
// It takes e.mu for writing and defers to leaveGroupLocked.
func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()
	return e.leaveGroupLocked(addr)
}
  1192  
// leaveGroupLocked is like LeaveGroup but with locking requirements.
//
// It returns whatever the IGMP state reports for leaving addr. Unlike
// joinGroupLocked, no multicast-address check is made here.
//
// Precondition: e.mu must be locked.
func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error {
	return e.mu.igmp.leaveGroup(addr)
}
  1199  
// IsInGroup implements stack.GroupAddressableEndpoint.
//
// It queries the IGMP state while holding e.mu for reading.
func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.mu.igmp.isInGroup(addr)
}
  1206  
// Stats implements stack.NetworkEndpoint.
//
// It returns a pointer to the endpoint's local stats.
func (e *endpoint) Stats() stack.NetworkEndpointStats {
	return &e.stats.localStats
}
  1211  
// Compile-time checks that *protocol satisfies the interfaces it is used as.
var _ stack.NetworkProtocol = (*protocol)(nil)
var _ fragmentation.TimeoutHandler = (*protocol)(nil)
  1214  
// protocol is the IPv4 implementation of stack.NetworkProtocol. It also
// implements fragmentation.TimeoutHandler.
type protocol struct {
	// stack is the stack this protocol instance was created for.
	stack *stack.Stack

	mu struct {
		sync.RWMutex

		// eps is keyed by NICID to allow protocol methods to retrieve an endpoint
		// when handling a packet, by looking at which NIC handled the packet.
		eps map[tcpip.NICID]*endpoint

		// ICMP types for which the stack's global rate limiting must apply.
		icmpRateLimitedTypes map[header.ICMPv4Type]struct{}
	}

	// defaultTTL is the current default TTL for the protocol. Only the
	// uint8 portion of it is meaningful.
	//
	// Must be accessed using atomic operations.
	defaultTTL uint32

	// ids and hashIV are filled with random values by NewProtocolWithOptions.
	// NOTE(review): presumably the per-bucket IP identification counters and
	// the seed handed to hashRoute; confirm against their users.
	ids    []uint32
	hashIV uint32

	// fragmentation reassembles received fragments (see
	// handleValidatedPacket).
	fragmentation *fragmentation.Fragmentation

	// options holds the Options the protocol was constructed with.
	options Options
}
  1242  
// Number returns the ipv4 protocol number (ProtocolNumber).
func (p *protocol) Number() tcpip.NetworkProtocolNumber {
	return ProtocolNumber
}
  1247  
// MinimumPacketSize returns the minimum valid ipv4 packet size, which is
// header.IPv4MinimumSize.
func (p *protocol) MinimumPacketSize() int {
	return header.IPv4MinimumSize
}
  1252  
  1253  // ParseAddresses implements stack.NetworkProtocol.
  1254  func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
  1255  	h := header.IPv4(v)
  1256  	return h.SourceAddress(), h.DestinationAddress()
  1257  }
  1258  
  1259  // SetOption implements stack.NetworkProtocol.
  1260  func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error {
  1261  	switch v := option.(type) {
  1262  	case *tcpip.DefaultTTLOption:
  1263  		p.SetDefaultTTL(uint8(*v))
  1264  		return nil
  1265  	default:
  1266  		return &tcpip.ErrUnknownProtocolOption{}
  1267  	}
  1268  }
  1269  
  1270  // Option implements stack.NetworkProtocol.
  1271  func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error {
  1272  	switch v := option.(type) {
  1273  	case *tcpip.DefaultTTLOption:
  1274  		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
  1275  		return nil
  1276  	default:
  1277  		return &tcpip.ErrUnknownProtocolOption{}
  1278  	}
  1279  }
  1280  
// SetDefaultTTL sets the default TTL for endpoints created with this protocol.
//
// The value is widened to uint32 and stored atomically.
func (p *protocol) SetDefaultTTL(ttl uint8) {
	atomic.StoreUint32(&p.defaultTTL, uint32(ttl))
}
  1285  
// DefaultTTL returns the default TTL for endpoints created with this protocol.
//
// The value is loaded atomically and truncated to its uint8 portion.
func (p *protocol) DefaultTTL() uint8 {
	return uint8(atomic.LoadUint32(&p.defaultTTL))
}
  1290  
// Close implements stack.NetworkProtocol. It is a no-op.
func (*protocol) Close() {}
  1293  
// Wait implements stack.NetworkProtocol. It is a no-op.
func (*protocol) Wait() {}
  1296  
  1297  // parseAndValidate parses the packet (including its transport layer header) and
  1298  // returns the parsed IP header.
  1299  //
  1300  // Returns true if the IP header was successfully parsed.
  1301  func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (header.IPv4, bool) {
  1302  	transProtoNum, hasTransportHdr, ok := p.Parse(pkt)
  1303  	if !ok {
  1304  		return nil, false
  1305  	}
  1306  
  1307  	h := header.IPv4(pkt.NetworkHeader().View())
  1308  	// Do not include the link header's size when calculating the size of the IP
  1309  	// packet.
  1310  	if !h.IsValid(pkt.Size() - pkt.LinkHeader().View().Size()) {
  1311  		return nil, false
  1312  	}
  1313  
  1314  	if !h.IsChecksumValid() {
  1315  		return nil, false
  1316  	}
  1317  
  1318  	if hasTransportHdr {
  1319  		p.parseTransport(pkt, transProtoNum)
  1320  	}
  1321  
  1322  	return h, true
  1323  }
  1324  
  1325  func (p *protocol) parseTransport(pkt *stack.PacketBuffer, transProtoNum tcpip.TransportProtocolNumber) {
  1326  	if transProtoNum == header.ICMPv4ProtocolNumber {
  1327  		// The transport layer will handle transport layer parsing errors.
  1328  		_ = parse.ICMPv4(pkt)
  1329  		return
  1330  	}
  1331  
  1332  	switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err {
  1333  	case stack.ParsedOK:
  1334  	case stack.UnknownTransportProtocol, stack.TransportLayerParseError:
  1335  		// The transport layer will handle unknown protocols and transport layer
  1336  		// parsing errors.
  1337  	default:
  1338  		panic(fmt.Sprintf("unexpected error parsing transport header = %d", err))
  1339  	}
  1340  }
  1341  
  1342  // Parse implements stack.NetworkProtocol.
  1343  func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
  1344  	if ok := parse.IPv4(pkt); !ok {
  1345  		return 0, false, false
  1346  	}
  1347  
  1348  	ipHdr := header.IPv4(pkt.NetworkHeader().View())
  1349  	return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true
  1350  }
  1351  
  1352  // allowICMPReply reports whether an ICMP reply with provided type and code may
  1353  // be sent following the rate mask options and global ICMP rate limiter.
  1354  func (p *protocol) allowICMPReply(icmpType header.ICMPv4Type, code header.ICMPv4Code) bool {
  1355  	// Mimic linux and never rate limit for PMTU discovery.
  1356  	// https://github.com/torvalds/linux/blob/9e9fb7655ed585da8f468e29221f0ba194a5f613/net/ipv4/icmp.c#L288
  1357  	if icmpType == header.ICMPv4DstUnreachable && code == header.ICMPv4FragmentationNeeded {
  1358  		return true
  1359  	}
  1360  	p.mu.RLock()
  1361  	defer p.mu.RUnlock()
  1362  
  1363  	if _, ok := p.mu.icmpRateLimitedTypes[icmpType]; ok {
  1364  		return p.stack.AllowICMPMessage()
  1365  	}
  1366  	return true
  1367  }
  1368  
  1369  // calculateNetworkMTU calculates the network-layer payload MTU based on the
  1370  // link-layer payload mtu.
  1371  func calculateNetworkMTU(linkMTU, networkHeaderSize uint32) (uint32, tcpip.Error) {
  1372  	if linkMTU < header.IPv4MinimumMTU {
  1373  		return 0, &tcpip.ErrInvalidEndpointState{}
  1374  	}
  1375  
  1376  	// As per RFC 791 section 3.1, an IPv4 header cannot exceed 60 bytes in
  1377  	// length:
  1378  	//   The maximal internet header is 60 octets, and a typical internet header
  1379  	//   is 20 octets, allowing a margin for headers of higher level protocols.
  1380  	if networkHeaderSize > header.IPv4MaximumHeaderSize {
  1381  		return 0, &tcpip.ErrMalformedHeader{}
  1382  	}
  1383  
  1384  	networkMTU := linkMTU
  1385  	if networkMTU > MaxTotalSize {
  1386  		networkMTU = MaxTotalSize
  1387  	}
  1388  
  1389  	return networkMTU - networkHeaderSize, nil
  1390  }
  1391  
  1392  func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32) bool {
  1393  	payload := pkt.TransportHeader().View().Size() + pkt.Data().Size()
  1394  	return pkt.GSOOptions.Type == stack.GSONone && uint32(payload) > networkMTU
  1395  }
  1396  
  1397  // addressToUint32 translates an IPv4 address into its little endian uint32
  1398  // representation.
  1399  //
  1400  // This function does the same thing as binary.LittleEndian.Uint32 but operates
  1401  // on a tcpip.Address (a string) without the need to convert it to a byte slice,
  1402  // which would cause an allocation.
  1403  func addressToUint32(addr tcpip.Address) uint32 {
  1404  	_ = addr[3] // bounds check hint to compiler
  1405  	return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24
  1406  }
  1407  
  1408  // hashRoute calculates a hash value for the given source/destination pair using
  1409  // the addresses, transport protocol number and a 32-bit number to generate the
  1410  // hash.
  1411  func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 {
  1412  	a := addressToUint32(srcAddr)
  1413  	b := addressToUint32(dstAddr)
  1414  	return hash.Hash3Words(a, b, uint32(protocol), hashIV)
  1415  }
  1416  
// Options holds options to configure a new protocol. The zero value is what
// NewProtocol uses.
type Options struct {
	// IGMP holds options for IGMP.
	IGMP IGMPOptions

	// AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e.
	// martian loopback packets) should be accepted. When false, HandlePacket
	// drops packets with a loopback source or destination address that arrive
	// on a non-loopback NIC.
	AllowExternalLoopbackTraffic bool
}
  1426  
  1427  // NewProtocolWithOptions returns an IPv4 network protocol.
  1428  func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory {
  1429  	ids := make([]uint32, buckets)
  1430  
  1431  	// Randomly initialize hashIV and the ids.
  1432  	r := hash.RandN32(1 + buckets)
  1433  	for i := range ids {
  1434  		ids[i] = r[i]
  1435  	}
  1436  	hashIV := r[buckets]
  1437  
  1438  	return func(s *stack.Stack) stack.NetworkProtocol {
  1439  		p := &protocol{
  1440  			stack:      s,
  1441  			ids:        ids,
  1442  			hashIV:     hashIV,
  1443  			defaultTTL: DefaultTTL,
  1444  			options:    opts,
  1445  		}
  1446  		p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p)
  1447  		p.mu.eps = make(map[tcpip.NICID]*endpoint)
  1448  		// Set ICMP rate limiting to Linux defaults.
  1449  		// See https://man7.org/linux/man-pages/man7/icmp.7.html.
  1450  		p.mu.icmpRateLimitedTypes = map[header.ICMPv4Type]struct{}{
  1451  			header.ICMPv4DstUnreachable: struct{}{},
  1452  			header.ICMPv4SrcQuench:      struct{}{},
  1453  			header.ICMPv4TimeExceeded:   struct{}{},
  1454  			header.ICMPv4ParamProblem:   struct{}{},
  1455  		}
  1456  		return p
  1457  	}
  1458  }
  1459  
  1460  // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options.
  1461  func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
  1462  	return NewProtocolWithOptions(Options{})(s)
  1463  }
  1464  
  1465  func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) {
  1466  	fragPkt, offset, copied, more := pf.BuildNextFragment()
  1467  	fragPkt.NetworkProtocolNumber = ProtocolNumber
  1468  
  1469  	originalIPHeaderLength := len(originalIPHeader)
  1470  	nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength))
  1471  	fragPkt.NetworkProtocolNumber = ProtocolNumber
  1472  
  1473  	if copied := copy(nextFragIPHeader, originalIPHeader); copied != len(originalIPHeader) {
  1474  		panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", copied, originalIPHeaderLength))
  1475  	}
  1476  
  1477  	flags := originalIPHeader.Flags()
  1478  	if more {
  1479  		flags |= header.IPv4FlagMoreFragments
  1480  	}
  1481  	nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset))
  1482  	nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied))
  1483  	nextFragIPHeader.SetChecksum(0)
  1484  	nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum())
  1485  
  1486  	return fragPkt, more
  1487  }
  1488  
// optionAction describes possible actions that may be taken on an option
// while processing it.
type optionAction uint8

const (
	// optionRemove says that the option should not be in the output option set.
	// It is the zero value of optionAction.
	optionRemove optionAction = iota

	// optionProcess says that the option should be fully processed.
	optionProcess

	// optionVerify says the option should be checked and passed unchanged.
	optionVerify

	// optionPass says to pass the output set without checking.
	optionPass
)
  1506  
// optionActions lists what to do for each option in a given scenario. A field
// left at its zero value means optionRemove.
type optionActions struct {
	// timestamp controls what to do with a Timestamp option.
	timestamp optionAction

	// recordRoute controls what to do with a Record Route option.
	recordRoute optionAction

	// routerAlert controls what to do with a Router Alert option.
	routerAlert optionAction

	// unknown controls what to do with an unknown option.
	unknown optionAction
}
  1521  
// optionsUsage specifies the ways options may be operated upon for a given
// scenario during packet processing.
type optionsUsage interface {
	// actions returns the per-option actions for the scenario.
	actions() optionActions
}
  1527  
  1528  // optionUsageVerify implements optionsUsage for when we just want to check
  1529  // fragments. Don't change anything, just check and reject if bad. No
  1530  // replacement options are generated.
  1531  type optionUsageVerify struct{}
  1532  
  1533  // actions implements optionsUsage.
  1534  func (*optionUsageVerify) actions() optionActions {
  1535  	return optionActions{
  1536  		timestamp:   optionVerify,
  1537  		recordRoute: optionVerify,
  1538  		routerAlert: optionVerify,
  1539  		unknown:     optionRemove,
  1540  	}
  1541  }
  1542  
  1543  // optionUsageReceive implements optionsUsage for packets we will pass
  1544  // to the transport layer (with the exception of Echo requests).
  1545  type optionUsageReceive struct{}
  1546  
  1547  // actions implements optionsUsage.
  1548  func (*optionUsageReceive) actions() optionActions {
  1549  	return optionActions{
  1550  		timestamp:   optionProcess,
  1551  		recordRoute: optionProcess,
  1552  		routerAlert: optionVerify,
  1553  		unknown:     optionPass,
  1554  	}
  1555  }
  1556  
  1557  // optionUsageForward implements optionsUsage for packets about to be forwarded.
  1558  // All options are passed on regardless of whether we recognise them, however
  1559  // we do process the Timestamp and Record Route options.
  1560  type optionUsageForward struct{}
  1561  
  1562  // actions implements optionsUsage.
  1563  func (*optionUsageForward) actions() optionActions {
  1564  	return optionActions{
  1565  		timestamp:   optionProcess,
  1566  		recordRoute: optionProcess,
  1567  		routerAlert: optionVerify,
  1568  		unknown:     optionPass,
  1569  	}
  1570  }
  1571  
  1572  // optionUsageEcho implements optionsUsage for echo packet processing.
  1573  // Only Timestamp and RecordRoute are processed and sent back.
  1574  type optionUsageEcho struct{}
  1575  
  1576  // actions implements optionsUsage.
  1577  func (*optionUsageEcho) actions() optionActions {
  1578  	return optionActions{
  1579  		timestamp:   optionProcess,
  1580  		recordRoute: optionProcess,
  1581  		routerAlert: optionVerify,
  1582  		unknown:     optionRemove,
  1583  	}
  1584  }
  1585  
  1586  // handleTimestamp does any required processing on a Timestamp option
  1587  // in place.
  1588  func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) *header.IPv4OptParameterProblem {
  1589  	flags := tsOpt.Flags()
  1590  	var entrySize uint8
  1591  	switch flags {
  1592  	case header.IPv4OptionTimestampOnlyFlag:
  1593  		entrySize = header.IPv4OptionTimestampSize
  1594  	case
  1595  		header.IPv4OptionTimestampWithIPFlag,
  1596  		header.IPv4OptionTimestampWithPredefinedIPFlag:
  1597  		entrySize = header.IPv4OptionTimestampWithAddrSize
  1598  	default:
  1599  		return &header.IPv4OptParameterProblem{
  1600  			Pointer:  header.IPv4OptTSOFLWAndFLGOffset,
  1601  			NeedICMP: true,
  1602  		}
  1603  	}
  1604  
  1605  	pointer := tsOpt.Pointer()
  1606  	// RFC 791 page 22 states: "The smallest legal value is 5."
  1607  	// Since the pointer is 1 based, and the header is 4 bytes long the
  1608  	// pointer must point beyond the header therefore 4 or less is bad.
  1609  	if pointer <= header.IPv4OptionTimestampHdrLength {
  1610  		return &header.IPv4OptParameterProblem{
  1611  			Pointer:  header.IPv4OptTSPointerOffset,
  1612  			NeedICMP: true,
  1613  		}
  1614  	}
  1615  	// To simplify processing below, base further work on the array of timestamps
  1616  	// beyond the header, rather than on the whole option. Also to aid
  1617  	// calculations set 'nextSlot' to be 0 based as in the packet it is 1 based.
  1618  	nextSlot := pointer - (header.IPv4OptionTimestampHdrLength + 1)
  1619  	optLen := tsOpt.Size()
  1620  	dataLength := optLen - header.IPv4OptionTimestampHdrLength
  1621  
  1622  	// In the section below, we verify the pointer, length and overflow counter
  1623  	// fields of the option. The distinction is in which byte you return as being
  1624  	// in error in the ICMP packet. Offsets 1 (length), 2 pointer)
  1625  	// or 3 (overflowed counter).
  1626  	//
  1627  	// The following RFC sections cover this section:
  1628  	//
  1629  	// RFC 791 (page 22):
  1630  	//    If there is some room but not enough room for a full timestamp
  1631  	//    to be inserted, or the overflow count itself overflows, the
  1632  	//    original datagram is considered to be in error and is discarded.
  1633  	//    In either case an ICMP parameter problem message may be sent to
  1634  	//    the source host [3].
  1635  	//
  1636  	// You can get this situation in two ways. Firstly if the data area is not
  1637  	// a multiple of the entry size or secondly, if the pointer is not at a
  1638  	// multiple of the entry size. The wording of the RFC suggests that
  1639  	// this is not an error until you actually run out of space.
  1640  	if pointer > optLen {
  1641  		// RFC 791 (page 22) says we should switch to using the overflow count.
  1642  		//    If the timestamp data area is already full (the pointer exceeds
  1643  		//    the length) the datagram is forwarded without inserting the
  1644  		//    timestamp, but the overflow count is incremented by one.
  1645  		if flags == header.IPv4OptionTimestampWithPredefinedIPFlag {
  1646  			// By definition we have nothing to do.
  1647  			return nil
  1648  		}
  1649  
  1650  		if tsOpt.IncOverflow() != 0 {
  1651  			return nil
  1652  		}
  1653  		// The overflow count is also full.
  1654  		return &header.IPv4OptParameterProblem{
  1655  			Pointer:  header.IPv4OptTSOFLWAndFLGOffset,
  1656  			NeedICMP: true,
  1657  		}
  1658  	}
  1659  	if nextSlot+entrySize > dataLength {
  1660  		// The data area isn't full but there isn't room for a new entry.
  1661  		// Either Length or Pointer could be bad.
  1662  		if false {
  1663  			// We must select Pointer for Linux compatibility, even if
  1664  			// only the length is bad.
  1665  			// The Linux code is at (in October 2020)
  1666  			// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L367-L370
  1667  			//		if (optptr[2]+3 > optlen) {
  1668  			//			pp_ptr = optptr + 2;
  1669  			//			goto error;
  1670  			//		}
  1671  			// which doesn't distinguish between which of optptr[2] or optlen
  1672  			// is wrong, but just arbitrarily decides on optptr+2.
  1673  			if dataLength%entrySize != 0 {
  1674  				// The Data section size should be a multiple of the expected
  1675  				// timestamp entry size.
  1676  				return &header.IPv4OptParameterProblem{
  1677  					Pointer:  header.IPv4OptionLengthOffset,
  1678  					NeedICMP: false,
  1679  				}
  1680  			}
  1681  			// If the size is OK, the pointer must be corrupted.
  1682  		}
  1683  		return &header.IPv4OptParameterProblem{
  1684  			Pointer:  header.IPv4OptTSPointerOffset,
  1685  			NeedICMP: true,
  1686  		}
  1687  	}
  1688  
  1689  	if usage.actions().timestamp == optionProcess {
  1690  		tsOpt.UpdateTimestamp(localAddress, clock)
  1691  	}
  1692  	return nil
  1693  }
  1694  
  1695  // handleRecordRoute checks and processes a Record route option. It is much
  1696  // like the timestamp type 1 option, but without timestamps. The passed in
  1697  // address is stored in the option in the correct spot if possible.
  1698  func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) *header.IPv4OptParameterProblem {
  1699  	optlen := rrOpt.Size()
  1700  
  1701  	if optlen < header.IPv4AddressSize+header.IPv4OptionRecordRouteHdrLength {
  1702  		return &header.IPv4OptParameterProblem{
  1703  			Pointer:  header.IPv4OptionLengthOffset,
  1704  			NeedICMP: true,
  1705  		}
  1706  	}
  1707  
  1708  	pointer := rrOpt.Pointer()
  1709  	// RFC 791 page 20 states:
  1710  	//      The pointer is relative to this option, and the
  1711  	//      smallest legal value for the pointer is 4.
  1712  	// Since the pointer is 1 based, and the header is 3 bytes long the
  1713  	// pointer must point beyond the header therefore 3 or less is bad.
  1714  	if pointer <= header.IPv4OptionRecordRouteHdrLength {
  1715  		return &header.IPv4OptParameterProblem{
  1716  			Pointer:  header.IPv4OptRRPointerOffset,
  1717  			NeedICMP: true,
  1718  		}
  1719  	}
  1720  
  1721  	// RFC 791 page 21 says
  1722  	//       If the route data area is already full (the pointer exceeds the
  1723  	//       length) the datagram is forwarded without inserting the address
  1724  	//       into the recorded route. If there is some room but not enough
  1725  	//       room for a full address to be inserted, the original datagram is
  1726  	//       considered to be in error and is discarded.  In either case an
  1727  	//       ICMP parameter problem message may be sent to the source
  1728  	//       host.
  1729  	// The use of the words "In either case" suggests that a 'full' RR option
  1730  	// could generate an ICMP at every hop after it fills up. We chose to not
  1731  	// do this (as do most implementations). It is probable that the inclusion
  1732  	// of these words is a copy/paste error from the timestamp option where
  1733  	// there are two failure reasons given.
  1734  	if pointer > optlen {
  1735  		return nil
  1736  	}
  1737  
  1738  	// The data area isn't full but there isn't room for a new entry.
  1739  	// Either Length or Pointer could be bad. We must select Pointer for Linux
  1740  	// compatibility, even if only the length is bad. NB. pointer is 1 based.
  1741  	if pointer+header.IPv4AddressSize > optlen+1 {
  1742  		if false {
  1743  			// This is what we would do if we were not being Linux compatible.
  1744  			// Check for bad pointer or length value. Must be a multiple of 4 after
  1745  			// accounting for the 3 byte header and not within that header.
  1746  			// RFC 791, page 20 says:
  1747  			//       The pointer is relative to this option, and the
  1748  			//       smallest legal value for the pointer is 4.
  1749  			//
  1750  			//       A recorded route is composed of a series of internet addresses.
  1751  			//       Each internet address is 32 bits or 4 octets.
  1752  			// Linux skips this test so we must too.  See Linux code at:
  1753  			// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L338-L341
  1754  			//    if (optptr[2]+3 > optlen) {
  1755  			//      pp_ptr = optptr + 2;
  1756  			//      goto error;
  1757  			//    }
  1758  			if (optlen-header.IPv4OptionRecordRouteHdrLength)%header.IPv4AddressSize != 0 {
  1759  				// Length is bad, not on integral number of slots.
  1760  				return &header.IPv4OptParameterProblem{
  1761  					Pointer:  header.IPv4OptionLengthOffset,
  1762  					NeedICMP: true,
  1763  				}
  1764  			}
  1765  			// If not length, the fault must be with the pointer.
  1766  		}
  1767  		return &header.IPv4OptParameterProblem{
  1768  			Pointer:  header.IPv4OptRRPointerOffset,
  1769  			NeedICMP: true,
  1770  		}
  1771  	}
  1772  	if usage.actions().recordRoute == optionVerify {
  1773  		return nil
  1774  	}
  1775  	rrOpt.StoreAddress(localAddress)
  1776  	return nil
  1777  }
  1778  
  1779  // handleRouterAlert performs sanity checks on a Router Alert option.
  1780  func handleRouterAlert(raOpt header.IPv4OptionRouterAlert) *header.IPv4OptParameterProblem {
  1781  	// Only the zero value is acceptable, as per RFC 2113, section 2.1:
  1782  	//   Value:  A two octet code with the following values:
  1783  	//     0 - Router shall examine packet
  1784  	//     1-65535 - Reserved
  1785  	if raOpt.Value() != header.IPv4OptionRouterAlertValue {
  1786  		return &header.IPv4OptParameterProblem{
  1787  			Pointer:  header.IPv4OptionRouterAlertValueOffset,
  1788  			NeedICMP: true,
  1789  		}
  1790  	}
  1791  	return nil
  1792  }
  1793  
// optionTracker records which of the known option types were encountered
// while the options of a packet were being processed.
type optionTracker struct {
	// timestamp is true if a Timestamp option was seen.
	timestamp   bool
	// recordRoute is true if a Record Route option was seen.
	recordRoute bool
	// routerAlert is true if a Router Alert option was seen.
	routerAlert bool
}
  1799  
  1800  // processIPOptions parses the IPv4 options and produces a new set of options
  1801  // suitable for use in the next step of packet processing as informed by usage.
  1802  // The original will not be touched.
  1803  //
  1804  // If there were no errors during parsing, the new set of options is returned as
  1805  // a new buffer.
  1806  func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, opts header.IPv4Options, usage optionsUsage) (header.IPv4Options, optionTracker, *header.IPv4OptParameterProblem) {
  1807  	stats := e.stats.ip
  1808  	optIter := opts.MakeIterator()
  1809  
  1810  	// Except NOP, each option must only appear at most once (RFC 791 section 3.1,
  1811  	// at the definition of every type).
  1812  	// Keep track of each option we find to enable duplicate option detection.
  1813  	var seenOptions [math.MaxUint8 + 1]bool
  1814  
  1815  	// TODO(https://gvisor.dev/issue/4586): This will need tweaking when we start
  1816  	// really forwarding packets as we may need to get two addresses, for rx and
  1817  	// tx interfaces. We will also have to take usage into account.
  1818  	localAddress := e.MainAddress().Address
  1819  	if len(localAddress) == 0 {
  1820  		h := header.IPv4(pkt.NetworkHeader().View())
  1821  		dstAddr := h.DestinationAddress()
  1822  		if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(dstAddr) {
  1823  			return nil, optionTracker{}, &header.IPv4OptParameterProblem{
  1824  				NeedICMP: false,
  1825  			}
  1826  		}
  1827  		localAddress = dstAddr
  1828  	}
  1829  
  1830  	var optionsProcessed optionTracker
  1831  	for {
  1832  		option, done, optProblem := optIter.Next()
  1833  		if done || optProblem != nil {
  1834  			return optIter.Finalize(), optionsProcessed, optProblem
  1835  		}
  1836  		optType := option.Type()
  1837  		if optType == header.IPv4OptionNOPType {
  1838  			optIter.PushNOPOrEnd(optType)
  1839  			continue
  1840  		}
  1841  		if optType == header.IPv4OptionListEndType {
  1842  			optIter.PushNOPOrEnd(optType)
  1843  			return optIter.Finalize(), optionsProcessed, nil
  1844  		}
  1845  
  1846  		// check for repeating options (multiple NOPs are OK)
  1847  		if seenOptions[optType] {
  1848  			return nil, optionTracker{}, &header.IPv4OptParameterProblem{
  1849  				Pointer:  optIter.ErrCursor,
  1850  				NeedICMP: true,
  1851  			}
  1852  		}
  1853  		seenOptions[optType] = true
  1854  
  1855  		optLen, optProblem := func() (int, *header.IPv4OptParameterProblem) {
  1856  			switch option := option.(type) {
  1857  			case *header.IPv4OptionTimestamp:
  1858  				stats.OptionTimestampReceived.Increment()
  1859  				optionsProcessed.timestamp = true
  1860  				if usage.actions().timestamp != optionRemove {
  1861  					clock := e.protocol.stack.Clock()
  1862  					newBuffer := optIter.InitReplacement(option)
  1863  					optProblem := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage)
  1864  					return len(newBuffer), optProblem
  1865  				}
  1866  
  1867  			case *header.IPv4OptionRecordRoute:
  1868  				stats.OptionRecordRouteReceived.Increment()
  1869  				optionsProcessed.recordRoute = true
  1870  				if usage.actions().recordRoute != optionRemove {
  1871  					newBuffer := optIter.InitReplacement(option)
  1872  					optProblem := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), localAddress, usage)
  1873  					return len(newBuffer), optProblem
  1874  				}
  1875  
  1876  			case *header.IPv4OptionRouterAlert:
  1877  				stats.OptionRouterAlertReceived.Increment()
  1878  				optionsProcessed.routerAlert = true
  1879  				if usage.actions().routerAlert != optionRemove {
  1880  					newBuffer := optIter.InitReplacement(option)
  1881  					optProblem := handleRouterAlert(header.IPv4OptionRouterAlert(newBuffer))
  1882  					return len(newBuffer), optProblem
  1883  				}
  1884  
  1885  			default:
  1886  				stats.OptionUnknownReceived.Increment()
  1887  				if usage.actions().unknown == optionPass {
  1888  					return len(optIter.InitReplacement(option)), nil
  1889  				}
  1890  			}
  1891  			return 0, nil
  1892  		}()
  1893  
  1894  		if optProblem != nil {
  1895  			optProblem.Pointer += optIter.ErrCursor
  1896  			return nil, optionTracker{}, optProblem
  1897  		}
  1898  		optIter.ConsumeBuffer(optLen)
  1899  	}
  1900  }