gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/link/fdbased/packet_dispatchers.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  package fdbased
    19  
    20  import (
    21  	"golang.org/x/sys/unix"
    22  	"gvisor.dev/gvisor/pkg/buffer"
    23  	"gvisor.dev/gvisor/pkg/tcpip"
    24  	"gvisor.dev/gvisor/pkg/tcpip/header"
    25  	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
    26  	"gvisor.dev/gvisor/pkg/tcpip/link/stopfd"
    27  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    28  	"gvisor.dev/gvisor/pkg/tcpip/stack/gro"
    29  )
    30  
    31  // BufConfig defines the shape of the buffer used to read packets from the NIC.
    32  var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
    33  
// iovecBuffer pairs the views that packet data is read into with the iovec
// array that describes those views to the kernel.
type iovecBuffer struct {
	// views is the actual storage that holds the packet contents. Views that
	// were not consumed by a call to pullBuffer (because the number of
	// requested bytes was smaller than the number of bytes allocated) are
	// reused by the next call to nextIovecs.
	views []*buffer.View

	// iovecs are initialized with base pointers/len of the corresponding
	// entries in the views defined above, except when GSO is enabled
	// (skipsVnetHdr) then the first iovec points to a buffer for the vnet header
	// which is stripped before the views are passed up the stack for further
	// processing.
	iovecs []unix.Iovec

	// sizes is an array of buffer sizes for the underlying views. sizes is
	// immutable.
	sizes []int

	// skipsVnetHdr is true if virtioNetHdr is to be skipped.
	skipsVnetHdr bool

	// pulledIndex is the index of the last []byte buffer pulled from the
	// underlying buffer storage during a call to pullBuffer. It is -1
	// if no buffer is pulled.
	//
	// NOTE(review): none of the methods visible here read or write
	// pulledIndex, and newIovecBuffer leaves it at its zero value; confirm
	// whether the field is still needed.
	pulledIndex int
}
    59  
    60  func newIovecBuffer(sizes []int, skipsVnetHdr bool) *iovecBuffer {
    61  	b := &iovecBuffer{
    62  		views:        make([]*buffer.View, len(sizes)),
    63  		sizes:        sizes,
    64  		skipsVnetHdr: skipsVnetHdr,
    65  	}
    66  	niov := len(b.views)
    67  	if b.skipsVnetHdr {
    68  		niov++
    69  	}
    70  	b.iovecs = make([]unix.Iovec, niov)
    71  	return b
    72  }
    73  
    74  func (b *iovecBuffer) nextIovecs() []unix.Iovec {
    75  	vnetHdrOff := 0
    76  	if b.skipsVnetHdr {
    77  		var vnetHdr [virtioNetHdrSize]byte
    78  		// The kernel adds virtioNetHdr before each packet, but
    79  		// we don't use it, so we allocate a buffer for it,
    80  		// add it in iovecs but don't add it in a view.
    81  		b.iovecs[0] = unix.Iovec{Base: &vnetHdr[0]}
    82  		b.iovecs[0].SetLen(virtioNetHdrSize)
    83  		vnetHdrOff++
    84  	}
    85  
    86  	for i := range b.views {
    87  		if b.views[i] != nil {
    88  			break
    89  		}
    90  		v := buffer.NewViewSize(b.sizes[i])
    91  		b.views[i] = v
    92  		b.iovecs[i+vnetHdrOff] = unix.Iovec{Base: v.BasePtr()}
    93  		b.iovecs[i+vnetHdrOff].SetLen(v.Size())
    94  	}
    95  	return b.iovecs
    96  }
    97  
    98  // pullBuffer extracts the enough underlying storage from b.buffer to hold n
    99  // bytes. It removes this storage from b.buffer, returns a new buffer
   100  // that holds the storage, and updates pulledIndex to indicate which part
   101  // of b.buffer's storage must be reallocated during the next call to
   102  // nextIovecs.
   103  func (b *iovecBuffer) pullBuffer(n int) buffer.Buffer {
   104  	var views []*buffer.View
   105  	c := 0
   106  	if b.skipsVnetHdr {
   107  		c += virtioNetHdrSize
   108  		if c >= n {
   109  			// Nothing in the packet.
   110  			return buffer.Buffer{}
   111  		}
   112  	}
   113  	// Remove the used views from the buffer.
   114  	for i, v := range b.views {
   115  		c += v.Size()
   116  		if c >= n {
   117  			b.views[i].CapLength(v.Size() - (c - n))
   118  			views = append(views, b.views[:i+1]...)
   119  			break
   120  		}
   121  	}
   122  	for i := range views {
   123  		b.views[i] = nil
   124  	}
   125  	if b.skipsVnetHdr {
   126  		// Exclude the size of the vnet header.
   127  		n -= virtioNetHdrSize
   128  	}
   129  	pulled := buffer.Buffer{}
   130  	for _, v := range views {
   131  		pulled.Append(v)
   132  	}
   133  	pulled.Truncate(int64(n))
   134  	return pulled
   135  }
   136  
   137  func (b *iovecBuffer) release() {
   138  	for _, v := range b.views {
   139  		if v != nil {
   140  			v.Release()
   141  			v = nil
   142  		}
   143  	}
   144  }
   145  
// readVDispatcher uses readv() system call to read inbound packets and
// dispatches them.
type readVDispatcher struct {
	// StopFD supplies the EFD that dispatch passes to
	// rawfile.BlockingReadvUntilStopped.
	stopfd.StopFD
	// fd is the file descriptor used to send and receive packets.
	fd int

	// e is the endpoint this dispatcher is attached to.
	e *endpoint

	// buf is the iovec buffer that contains the packet contents.
	buf *iovecBuffer

	// mgr is the processor goroutine manager.
	mgr *processorManager
}
   162  
   163  func newReadVDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, error) {
   164  	stopFD, err := stopfd.New()
   165  	if err != nil {
   166  		return nil, err
   167  	}
   168  	d := &readVDispatcher{
   169  		StopFD: stopFD,
   170  		fd:     fd,
   171  		e:      e,
   172  	}
   173  	skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported
   174  	d.buf = newIovecBuffer(BufConfig, skipsVnetHdr)
   175  	d.mgr = newProcessorManager(opts, e)
   176  	d.mgr.start()
   177  	return d, nil
   178  }
   179  
// release releases the views still held by d.buf and then closes the
// processor manager.
func (d *readVDispatcher) release() {
	d.buf.release()
	d.mgr.close()
}
   184  
   185  // dispatch reads one packet from the file descriptor and dispatches it.
   186  func (d *readVDispatcher) dispatch() (bool, tcpip.Error) {
   187  	n, err := rawfile.BlockingReadvUntilStopped(d.EFD, d.fd, d.buf.nextIovecs())
   188  	if n <= 0 || err != nil {
   189  		return false, err
   190  	}
   191  
   192  	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   193  		Payload: d.buf.pullBuffer(n),
   194  	})
   195  	defer pkt.DecRef()
   196  
   197  	if d.e.hdrSize > 0 {
   198  		if !d.e.parseHeader(pkt) {
   199  			return false, nil
   200  		}
   201  		pkt.NetworkProtocolNumber = header.Ethernet(pkt.LinkHeader().Slice()).Type()
   202  	}
   203  	d.mgr.queuePacket(pkt, d.e.hdrSize > 0)
   204  	d.mgr.wakeReady()
   205  	return true, nil
   206  }
   207  
// recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and
// dispatches them.
type recvMMsgDispatcher struct {
	// StopFD supplies the EFD that dispatch passes to
	// rawfile.BlockingRecvMMsgUntilStopped.
	stopfd.StopFD
	// fd is the file descriptor used to send and receive packets.
	fd int

	// e is the endpoint this dispatcher is attached to.
	e *endpoint

	// bufs is an array of iovec buffers that contain packet contents.
	bufs []*iovecBuffer

	// msgHdrs is an array of MMsgHdr objects where each MMsgHdr is used to
	// reference the array of iovecs of the corresponding entry in bufs. This
	// array is passed as the parameter to recvmmsg call to retrieve
	// potentially more than 1 packet per syscall.
	msgHdrs []rawfile.MMsgHdr

	// pkts is reused to avoid allocations.
	pkts stack.PacketBufferList

	// gro coalesces incoming packets to increase throughput.
	gro gro.GRO

	// mgr is the processor goroutine manager.
	mgr *processorManager
}
   236  
   237  const (
   238  	// MaxMsgsPerRecv is the maximum number of packets we want to retrieve
   239  	// in a single RecvMMsg call.
   240  	MaxMsgsPerRecv = 8
   241  )
   242  
   243  func newRecvMMsgDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, error) {
   244  	stopFD, err := stopfd.New()
   245  	if err != nil {
   246  		return nil, err
   247  	}
   248  	d := &recvMMsgDispatcher{
   249  		StopFD:  stopFD,
   250  		fd:      fd,
   251  		e:       e,
   252  		bufs:    make([]*iovecBuffer, MaxMsgsPerRecv),
   253  		msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv),
   254  	}
   255  	skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported
   256  	for i := range d.bufs {
   257  		d.bufs[i] = newIovecBuffer(BufConfig, skipsVnetHdr)
   258  	}
   259  	d.gro.Init(opts.GRO)
   260  	d.mgr = newProcessorManager(opts, e)
   261  	d.mgr.start()
   262  
   263  	return d, nil
   264  }
   265  
   266  func (d *recvMMsgDispatcher) release() {
   267  	for _, iov := range d.bufs {
   268  		iov.release()
   269  	}
   270  	d.mgr.close()
   271  }
   272  
   273  // recvMMsgDispatch reads more than one packet at a time from the file
   274  // descriptor and dispatches it.
   275  func (d *recvMMsgDispatcher) dispatch() (bool, tcpip.Error) {
   276  	// Fill message headers.
   277  	for k := range d.msgHdrs {
   278  		if d.msgHdrs[k].Msg.Iovlen > 0 {
   279  			break
   280  		}
   281  		iovecs := d.bufs[k].nextIovecs()
   282  		iovLen := len(iovecs)
   283  		d.msgHdrs[k].Len = 0
   284  		d.msgHdrs[k].Msg.Iov = &iovecs[0]
   285  		d.msgHdrs[k].Msg.SetIovlen(iovLen)
   286  	}
   287  
   288  	nMsgs, err := rawfile.BlockingRecvMMsgUntilStopped(d.EFD, d.fd, d.msgHdrs)
   289  	if nMsgs == -1 || err != nil {
   290  		return false, err
   291  	}
   292  
   293  	// Process each of received packets.
   294  
   295  	d.e.mu.RLock()
   296  	dsp := d.e.dispatcher
   297  	d.e.mu.RUnlock()
   298  
   299  	d.gro.Dispatcher = dsp
   300  	defer d.pkts.Reset()
   301  
   302  	for k := 0; k < nMsgs; k++ {
   303  		n := int(d.msgHdrs[k].Len)
   304  		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   305  			Payload: d.bufs[k].pullBuffer(n),
   306  		})
   307  		d.pkts.PushBack(pkt)
   308  
   309  		// Mark that this iovec has been processed.
   310  		d.msgHdrs[k].Msg.Iovlen = 0
   311  
   312  		if d.e.hdrSize > 0 {
   313  			hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize)
   314  			if !ok {
   315  				return false, nil
   316  			}
   317  			pkt.NetworkProtocolNumber = header.Ethernet(hdr).Type()
   318  		}
   319  		pkt.RXChecksumValidated = d.e.caps&stack.CapabilityRXChecksumOffload != 0
   320  		d.mgr.queuePacket(pkt, d.e.hdrSize > 0)
   321  	}
   322  	d.mgr.wakeReady()
   323  
   324  	return true, nil
   325  }