inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/network/internal/fragmentation/fragmentation.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package fragmentation contains the implementation of IP fragmentation.
    16  // It is based on RFC 791, RFC 815 and RFC 8200.
    17  package fragmentation
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"log"
    23  	"time"
    24  
    25  	"inet.af/netstack/sync"
    26  	"inet.af/netstack/tcpip"
    27  	"inet.af/netstack/tcpip/buffer"
    28  	"inet.af/netstack/tcpip/stack"
    29  )
    30  
const (
	// HighFragThreshold is the threshold at which we start trimming old
	// fragmented packets. Linux uses a default value of 4 MB. See
	// net.ipv4.ipfrag_high_thresh for more information.
	HighFragThreshold = 4 << 20 // 4MB

	// LowFragThreshold is the threshold we trim down to once we have started
	// dropping older fragmented packets. It's important that we keep enough
	// room for newer packets to be re-assembled, so this needs to be
	// sufficiently lower than HighFragThreshold. Linux uses a default value of
	// 3 MB. See net.ipv4.ipfrag_low_thresh for more information.
	LowFragThreshold = 3 << 20 // 3MB

	// minBlockSize is the minimum block size for fragments. NewFragmentation
	// raises any smaller requested block size to this value.
	minBlockSize = 1
)
    47  
var (
	// ErrInvalidArgs indicates to the caller that an invalid argument was
	// provided. Process wraps it (with %w) in the errors it returns when the
	// fragment range or size fails validation.
	ErrInvalidArgs = errors.New("invalid args")

	// ErrFragmentOverlap indicates that, during reassembly, a fragment overlaps
	// with another one.
	//
	// NOTE(review): not produced by any code visible in this file; presumably
	// returned by the reassembler — confirm.
	ErrFragmentOverlap = errors.New("overlapping fragments")

	// ErrFragmentConflict indicates that, during reassembly, some fragments are
	// in conflict with one another.
	//
	// NOTE(review): not produced by any code visible in this file; presumably
	// returned by the reassembler — confirm.
	ErrFragmentConflict = errors.New("conflicting fragments")
)
    61  
// FragmentID is the identifier for a fragment.
//
// It is used as the key of the map of in-progress reassemblies held by
// Fragmentation, so two fragments are reassembled together only when all of
// their FragmentID fields are equal.
type FragmentID struct {
	// Source is the source address of the fragment.
	Source tcpip.Address

	// Destination is the destination address of the fragment.
	Destination tcpip.Address

	// ID is the identification value of the fragment.
	//
	// This is a uint32 because IPv6 uses a 32-bit identification value.
	ID uint32

	// Protocol is the protocol for the packet.
	//
	// Process takes the protocol as a separate argument because IPv6 should
	// not use the protocol to identify a fragment.
	Protocol uint8
}
    78  
// Fragmentation is the main structure that other modules
// of the stack should use to implement IP Fragmentation.
//
// Fields:
//   - mu: locked by Process around every access to reassemblers, rList and
//     memSize; it is also handed to the release job at construction.
//   - highLimit, lowLimit: memory thresholds. Once memSize exceeds highLimit,
//     Process releases the oldest reassemblers until memSize is no greater
//     than lowLimit (or the list is empty).
//   - reassemblers: in-progress reassemblies keyed by FragmentID.
//   - rList: the same reassemblers ordered by age; new ones are pushed to the
//     front, so the back of the list is the oldest.
//   - memSize: running total of the memory consumption reported by the
//     reassemblers.
//   - timeout: age after which releaseReassemblersLocked releases a
//     reassembler.
//   - blockSize: granularity, in bytes, that fragment offsets and the sizes of
//     non-final fragments must be a multiple of.
//   - clock: time source passed to new reassemblers and used for expiry
//     checks.
//   - releaseJob: job that runs releaseReassemblersLocked to expire old
//     reassemblers.
//   - timeoutHandler: callback notified when a reassembly times out; may be
//     nil, in which case no notification is made.
type Fragmentation struct {
	mu             sync.Mutex
	highLimit      int
	lowLimit       int
	reassemblers   map[FragmentID]*reassembler
	rList          reassemblerList
	memSize        int
	timeout        time.Duration
	blockSize      uint16
	clock          tcpip.Clock
	releaseJob     *tcpip.Job
	timeoutHandler TimeoutHandler
}
    94  
// TimeoutHandler is consulted if a packet reassembly has timed out.
type TimeoutHandler interface {
	// OnReassemblyTimeout will be called with the first fragment (or nil, if the
	// first fragment has not been received) of a packet whose reassembly has
	// timed out.
	//
	// NOTE(review): Fragmentation invokes this from release, whose visible
	// callers hold the Fragmentation mutex, so implementations presumably must
	// not call back into the same Fragmentation — confirm.
	OnReassemblyTimeout(pkt *stack.PacketBuffer)
}
   102  
   103  // NewFragmentation creates a new Fragmentation.
   104  //
   105  // blockSize specifies the fragment block size, in bytes.
   106  //
   107  // highMemoryLimit specifies the limit on the memory consumed
   108  // by the fragments stored by Fragmentation (overhead of internal data-structures
   109  // is not accounted). Fragments are dropped when the limit is reached.
   110  //
   111  // lowMemoryLimit specifies the limit on which we will reach by dropping
   112  // fragments after reaching highMemoryLimit.
   113  //
   114  // reassemblingTimeout specifies the maximum time allowed to reassemble a packet.
   115  // Fragments are lazily evicted only when a new a packet with an
   116  // already existing fragmentation-id arrives after the timeout.
   117  func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration, clock tcpip.Clock, timeoutHandler TimeoutHandler) *Fragmentation {
   118  	if lowMemoryLimit >= highMemoryLimit {
   119  		lowMemoryLimit = highMemoryLimit
   120  	}
   121  
   122  	if lowMemoryLimit < 0 {
   123  		lowMemoryLimit = 0
   124  	}
   125  
   126  	if blockSize < minBlockSize {
   127  		blockSize = minBlockSize
   128  	}
   129  
   130  	f := &Fragmentation{
   131  		reassemblers:   make(map[FragmentID]*reassembler),
   132  		highLimit:      highMemoryLimit,
   133  		lowLimit:       lowMemoryLimit,
   134  		timeout:        reassemblingTimeout,
   135  		blockSize:      blockSize,
   136  		clock:          clock,
   137  		timeoutHandler: timeoutHandler,
   138  	}
   139  	f.releaseJob = tcpip.NewJob(f.clock, &f.mu, f.releaseReassemblersLocked)
   140  
   141  	return f
   142  }
   143  
   144  // Process processes an incoming fragment belonging to an ID and returns a
   145  // complete packet and its protocol number when all the packets belonging to
   146  // that ID have been received.
   147  //
   148  // [first, last] is the range of the fragment bytes.
   149  //
   150  // first must be a multiple of the block size f is configured with. The size
   151  // of the fragment data must be a multiple of the block size, unless there are
   152  // no fragments following this fragment (more set to false).
   153  //
   154  // proto is the protocol number marked in the fragment being processed. It has
   155  // to be given here outside of the FragmentID struct because IPv6 should not use
   156  // the protocol to identify a fragment.
   157  func (f *Fragmentation) Process(
   158  	id FragmentID, first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) (
   159  	*stack.PacketBuffer, uint8, bool, error) {
   160  	if first > last {
   161  		return nil, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs)
   162  	}
   163  
   164  	if first%f.blockSize != 0 {
   165  		return nil, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs)
   166  	}
   167  
   168  	fragmentSize := last - first + 1
   169  	if more && fragmentSize%f.blockSize != 0 {
   170  		return nil, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs)
   171  	}
   172  
   173  	if l := pkt.Data().Size(); l != int(fragmentSize) {
   174  		return nil, 0, false, fmt.Errorf("got fragment size=%d bytes not equal to the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs)
   175  	}
   176  
   177  	f.mu.Lock()
   178  	r, ok := f.reassemblers[id]
   179  	if !ok {
   180  		r = newReassembler(id, f.clock)
   181  		f.reassemblers[id] = r
   182  		wasEmpty := f.rList.Empty()
   183  		f.rList.PushFront(r)
   184  		if wasEmpty {
   185  			// If we have just pushed a first reassembler into an empty list, we
   186  			// should kickstart the release job. The release job will keep
   187  			// rescheduling itself until the list becomes empty.
   188  			f.releaseReassemblersLocked()
   189  		}
   190  	}
   191  	f.mu.Unlock()
   192  
   193  	resPkt, firstFragmentProto, done, memConsumed, err := r.process(first, last, more, proto, pkt)
   194  	if err != nil {
   195  		// We probably got an invalid sequence of fragments. Just
   196  		// discard the reassembler and move on.
   197  		f.mu.Lock()
   198  		f.release(r, false /* timedOut */)
   199  		f.mu.Unlock()
   200  		return nil, 0, false, fmt.Errorf("fragmentation processing error: %w", err)
   201  	}
   202  	f.mu.Lock()
   203  	f.memSize += memConsumed
   204  	if done {
   205  		f.release(r, false /* timedOut */)
   206  	}
   207  	// Evict reassemblers if we are consuming more memory than highLimit until
   208  	// we reach lowLimit.
   209  	if f.memSize > f.highLimit {
   210  		for f.memSize > f.lowLimit {
   211  			tail := f.rList.Back()
   212  			if tail == nil {
   213  				break
   214  			}
   215  			f.release(tail, false /* timedOut */)
   216  		}
   217  	}
   218  	f.mu.Unlock()
   219  	return resPkt, firstFragmentProto, done, nil
   220  }
   221  
   222  func (f *Fragmentation) release(r *reassembler, timedOut bool) {
   223  	// Before releasing a fragment we need to check if r is already marked as done.
   224  	// Otherwise, we would delete it twice.
   225  	if r.checkDoneOrMark() {
   226  		return
   227  	}
   228  
   229  	delete(f.reassemblers, r.id)
   230  	f.rList.Remove(r)
   231  	f.memSize -= r.memSize
   232  	if f.memSize < 0 {
   233  		log.Printf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.memSize)
   234  		f.memSize = 0
   235  	}
   236  
   237  	if h := f.timeoutHandler; timedOut && h != nil {
   238  		h.OnReassemblyTimeout(r.pkt)
   239  	}
   240  }
   241  
   242  // releaseReassemblersLocked releases already-expired reassemblers, then
   243  // schedules the job to call back itself for the remaining reassemblers if
   244  // any. This function must be called with f.mu locked.
   245  func (f *Fragmentation) releaseReassemblersLocked() {
   246  	now := f.clock.NowMonotonic()
   247  	for {
   248  		// The reassembler at the end of the list is the oldest.
   249  		r := f.rList.Back()
   250  		if r == nil {
   251  			// The list is empty.
   252  			break
   253  		}
   254  		elapsed := now.Sub(r.createdAt)
   255  		if f.timeout > elapsed {
   256  			// If the oldest reassembler has not expired, schedule the release
   257  			// job so that this function is called back when it has expired.
   258  			f.releaseJob.Schedule(f.timeout - elapsed)
   259  			break
   260  		}
   261  		// If the oldest reassembler has already expired, release it.
   262  		f.release(r, true /* timedOut*/)
   263  	}
   264  }
   265  
// PacketFragmenter is the book-keeping struct for packet fragmentation.
//
// Fields:
//   - transportHeader: NOTE(review): not referenced by any code visible in
//     this file; the transport header bytes are placed at the front of data
//     instead — confirm whether this field is still needed.
//   - data: the fragmentable bytes not yet handed out (transport header
//     followed by the packet data); BuildNextFragment reads from it.
//   - reserve: number of header bytes reserved in each generated fragment.
//   - fragmentPayloadLen: maximum number of bytes of data read into a single
//     fragment.
//   - fragmentCount: total number of fragments to be built.
//   - currentFragment: number of fragments built so far.
//   - fragmentOffset: offset, in bytes, of the next fragment's payload within
//     the fragmentable data.
type PacketFragmenter struct {
	transportHeader    buffer.View
	data               buffer.VectorisedView
	reserve            int
	fragmentPayloadLen int
	fragmentCount      int
	currentFragment    int
	fragmentOffset     int
}
   276  
   277  // MakePacketFragmenter prepares the struct needed for packet fragmentation.
   278  //
   279  // pkt is the packet to be fragmented.
   280  //
   281  // fragmentPayloadLen is the maximum number of bytes of fragmentable data a fragment can
   282  // have.
   283  //
   284  // reserve is the number of bytes that should be reserved for the headers in
   285  // each generated fragment.
   286  func MakePacketFragmenter(pkt *stack.PacketBuffer, fragmentPayloadLen uint32, reserve int) PacketFragmenter {
   287  	// As per RFC 8200 Section 4.5, some IPv6 extension headers should not be
   288  	// repeated in each fragment. However we do not currently support any header
   289  	// of that kind yet, so the following computation is valid for both IPv4 and
   290  	// IPv6.
   291  	// TODO(gvisor.dev/issue/3912): Once Authentication or ESP Headers are
   292  	// supported for outbound packets, the fragmentable data should not include
   293  	// these headers.
   294  	var fragmentableData buffer.VectorisedView
   295  	fragmentableData.AppendView(pkt.TransportHeader().View())
   296  	fragmentableData.Append(pkt.Data().ExtractVV())
   297  	fragmentCount := (uint32(fragmentableData.Size()) + fragmentPayloadLen - 1) / fragmentPayloadLen
   298  
   299  	return PacketFragmenter{
   300  		data:               fragmentableData,
   301  		reserve:            reserve,
   302  		fragmentPayloadLen: int(fragmentPayloadLen),
   303  		fragmentCount:      int(fragmentCount),
   304  	}
   305  }
   306  
   307  // BuildNextFragment returns a packet with the payload of the next fragment,
   308  // along with the fragment's offset, the number of bytes copied and a boolean
   309  // indicating if there are more fragments left or not. If this function is
   310  // called again after it indicated that no more fragments were left, it will
   311  // panic.
   312  //
   313  // Note that the returned packet will not have its network and link headers
   314  // populated, but space for them will be reserved. The transport header will be
   315  // stored in the packet's data.
   316  func (pf *PacketFragmenter) BuildNextFragment() (*stack.PacketBuffer, int, int, bool) {
   317  	if pf.currentFragment >= pf.fragmentCount {
   318  		panic("BuildNextFragment should not be called again after the last fragment was returned")
   319  	}
   320  
   321  	fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   322  		ReserveHeaderBytes: pf.reserve,
   323  	})
   324  
   325  	// Copy data for the fragment.
   326  	copied := fragPkt.Data().ReadFromVV(&pf.data, pf.fragmentPayloadLen)
   327  
   328  	offset := pf.fragmentOffset
   329  	pf.fragmentOffset += copied
   330  	pf.currentFragment++
   331  	more := pf.currentFragment != pf.fragmentCount
   332  
   333  	return fragPkt, offset, copied, more
   334  }
   335  
   336  // RemainingFragmentCount returns the number of fragments left to be built.
   337  func (pf *PacketFragmenter) RemainingFragmentCount() int {
   338  	return pf.fragmentCount - pf.currentFragment
   339  }