github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/link/fdbased/packet_dispatchers.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  package fdbased
    19  
    20  import (
    21  	"golang.org/x/sys/unix"
    22  	"github.com/metacubex/gvisor/pkg/buffer"
    23  	"github.com/metacubex/gvisor/pkg/tcpip"
    24  	"github.com/metacubex/gvisor/pkg/tcpip/header"
    25  	"github.com/metacubex/gvisor/pkg/tcpip/link/rawfile"
    26  	"github.com/metacubex/gvisor/pkg/tcpip/link/stopfd"
    27  	"github.com/metacubex/gvisor/pkg/tcpip/stack"
    28  )
    29  
    30  // BufConfig defines the shape of the buffer used to read packets from the NIC.
    31  var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
    32  
// iovecBuffer pairs a set of buffer views with the iovec array that describes
// them, so that the views can be filled directly by a vectored read.
type iovecBuffer struct {
	// views holds the storage that packet contents are read into. Views that
	// pullBuffer hands out are replaced with fresh allocations by the next
	// call to nextIovecs; views that were not handed out are reused.
	views []*buffer.View

	// iovecs are initialized with base pointers/len of the corresponding
	// entries in the views defined above, except when GSO is enabled
	// (skipsVnetHdr) then the first iovec points to a buffer for the vnet header
	// which is stripped before the views are passed up the stack for further
	// processing.
	iovecs []unix.Iovec

	// sizes is an array of buffer sizes for the underlying views. sizes is
	// immutable.
	sizes []int

	// skipsVnetHdr is true if the virtioNetHdr is to be skipped.
	skipsVnetHdr bool

	// pulledIndex is documented as the index of the last view pulled from the
	// underlying storage by pullBuffer, or -1 if no view was pulled.
	// NOTE(review): nothing visible in this file reads or writes this field
	// (it is never initialized to -1 either) — confirm it is still needed.
	pulledIndex int
}
    58  
    59  func newIovecBuffer(sizes []int, skipsVnetHdr bool) *iovecBuffer {
    60  	b := &iovecBuffer{
    61  		views:        make([]*buffer.View, len(sizes)),
    62  		sizes:        sizes,
    63  		skipsVnetHdr: skipsVnetHdr,
    64  	}
    65  	niov := len(b.views)
    66  	if b.skipsVnetHdr {
    67  		niov++
    68  	}
    69  	b.iovecs = make([]unix.Iovec, niov)
    70  	return b
    71  }
    72  
    73  func (b *iovecBuffer) nextIovecs() []unix.Iovec {
    74  	vnetHdrOff := 0
    75  	if b.skipsVnetHdr {
    76  		var vnetHdr [virtioNetHdrSize]byte
    77  		// The kernel adds virtioNetHdr before each packet, but
    78  		// we don't use it, so we allocate a buffer for it,
    79  		// add it in iovecs but don't add it in a view.
    80  		b.iovecs[0] = unix.Iovec{Base: &vnetHdr[0]}
    81  		b.iovecs[0].SetLen(virtioNetHdrSize)
    82  		vnetHdrOff++
    83  	}
    84  
    85  	for i := range b.views {
    86  		if b.views[i] != nil {
    87  			break
    88  		}
    89  		v := buffer.NewViewSize(b.sizes[i])
    90  		b.views[i] = v
    91  		b.iovecs[i+vnetHdrOff] = unix.Iovec{Base: v.BasePtr()}
    92  		b.iovecs[i+vnetHdrOff].SetLen(v.Size())
    93  	}
    94  	return b.iovecs
    95  }
    96  
    97  // pullBuffer extracts the enough underlying storage from b.buffer to hold n
    98  // bytes. It removes this storage from b.buffer, returns a new buffer
    99  // that holds the storage, and updates pulledIndex to indicate which part
   100  // of b.buffer's storage must be reallocated during the next call to
   101  // nextIovecs.
   102  func (b *iovecBuffer) pullBuffer(n int) buffer.Buffer {
   103  	var views []*buffer.View
   104  	c := 0
   105  	if b.skipsVnetHdr {
   106  		c += virtioNetHdrSize
   107  		if c >= n {
   108  			// Nothing in the packet.
   109  			return buffer.Buffer{}
   110  		}
   111  	}
   112  	// Remove the used views from the buffer.
   113  	for i, v := range b.views {
   114  		c += v.Size()
   115  		if c >= n {
   116  			b.views[i].CapLength(v.Size() - (c - n))
   117  			views = append(views, b.views[:i+1]...)
   118  			break
   119  		}
   120  	}
   121  	for i := range views {
   122  		b.views[i] = nil
   123  	}
   124  	if b.skipsVnetHdr {
   125  		// Exclude the size of the vnet header.
   126  		n -= virtioNetHdrSize
   127  	}
   128  	pulled := buffer.Buffer{}
   129  	for _, v := range views {
   130  		pulled.Append(v)
   131  	}
   132  	pulled.Truncate(int64(n))
   133  	return pulled
   134  }
   135  
   136  func (b *iovecBuffer) release() {
   137  	for _, v := range b.views {
   138  		if v != nil {
   139  			v.Release()
   140  			v = nil
   141  		}
   142  	}
   143  }
   144  
// readVDispatcher uses readv() system call to read inbound packets and
// dispatches them.
type readVDispatcher struct {
	// StopFD is embedded; dispatch passes its EFD to the blocking read.
	// NOTE(review): presumably EFD is what lets the read be interrupted when
	// the dispatcher is stopped — confirm against the stopfd package.
	stopfd.StopFD
	// fd is the file descriptor used to send and receive packets.
	fd int

	// e is the endpoint this dispatcher is attached to.
	e *endpoint

	// buf is the iovec buffer that contains the packet contents.
	buf *iovecBuffer
}
   158  
   159  func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
   160  	stopFD, err := stopfd.New()
   161  	if err != nil {
   162  		return nil, err
   163  	}
   164  	d := &readVDispatcher{
   165  		StopFD: stopFD,
   166  		fd:     fd,
   167  		e:      e,
   168  	}
   169  	skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported
   170  	d.buf = newIovecBuffer(BufConfig, skipsVnetHdr)
   171  	return d, nil
   172  }
   173  
// release releases the views still held by the dispatcher's iovec buffer.
func (d *readVDispatcher) release() {
	d.buf.release()
}
   177  
   178  // dispatch reads one packet from the file descriptor and dispatches it.
   179  func (d *readVDispatcher) dispatch() (bool, tcpip.Error) {
   180  	n, err := rawfile.BlockingReadvUntilStopped(d.EFD, d.fd, d.buf.nextIovecs())
   181  	if n <= 0 || err != nil {
   182  		return false, err
   183  	}
   184  
   185  	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   186  		Payload: d.buf.pullBuffer(n),
   187  	})
   188  	defer pkt.DecRef()
   189  
   190  	var p tcpip.NetworkProtocolNumber
   191  	if d.e.hdrSize > 0 {
   192  		if !d.e.parseHeader(pkt) {
   193  			return false, nil
   194  		}
   195  		p = header.Ethernet(pkt.LinkHeader().Slice()).Type()
   196  	} else {
   197  		// We don't get any indication of what the packet is, so try to guess
   198  		// if it's an IPv4 or IPv6 packet.
   199  		// IP version information is at the first octet, so pulling up 1 byte.
   200  		h, ok := pkt.Data().PullUp(1)
   201  		if !ok {
   202  			return true, nil
   203  		}
   204  		switch header.IPVersion(h) {
   205  		case header.IPv4Version:
   206  			p = header.IPv4ProtocolNumber
   207  		case header.IPv6Version:
   208  			p = header.IPv6ProtocolNumber
   209  		default:
   210  			return true, nil
   211  		}
   212  	}
   213  
   214  	d.e.mu.RLock()
   215  	dsp := d.e.dispatcher
   216  	d.e.mu.RUnlock()
   217  	dsp.DeliverNetworkPacket(p, pkt)
   218  
   219  	return true, nil
   220  }
   221  
// recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and
// dispatches them.
type recvMMsgDispatcher struct {
	// StopFD is embedded; dispatch passes its EFD to the blocking receive.
	// NOTE(review): presumably EFD is what lets the receive be interrupted
	// when the dispatcher is stopped — confirm against the stopfd package.
	stopfd.StopFD
	// fd is the file descriptor used to send and receive packets.
	fd int

	// e is the endpoint this dispatcher is attached to.
	e *endpoint

	// bufs is an array of iovec buffers that contain packet contents.
	bufs []*iovecBuffer

	// msgHdrs is an array of MMsgHdr objects where each MMsgHdr references
	// the iovec array of the entry in bufs with the same index. This array is
	// passed as the parameter to the recvmmsg call to retrieve potentially
	// more than 1 packet per call.
	msgHdrs []rawfile.MMsgHdr
}
   241  
   242  const (
   243  	// MaxMsgsPerRecv is the maximum number of packets we want to retrieve
   244  	// in a single RecvMMsg call.
   245  	MaxMsgsPerRecv = 8
   246  )
   247  
   248  func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
   249  	stopFD, err := stopfd.New()
   250  	if err != nil {
   251  		return nil, err
   252  	}
   253  	d := &recvMMsgDispatcher{
   254  		StopFD:  stopFD,
   255  		fd:      fd,
   256  		e:       e,
   257  		bufs:    make([]*iovecBuffer, MaxMsgsPerRecv),
   258  		msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv),
   259  	}
   260  	skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported
   261  	for i := range d.bufs {
   262  		d.bufs[i] = newIovecBuffer(BufConfig, skipsVnetHdr)
   263  	}
   264  	return d, nil
   265  }
   266  
   267  func (d *recvMMsgDispatcher) release() {
   268  	for _, iov := range d.bufs {
   269  		iov.release()
   270  	}
   271  }
   272  
   273  // recvMMsgDispatch reads more than one packet at a time from the file
   274  // descriptor and dispatches it.
   275  func (d *recvMMsgDispatcher) dispatch() (bool, tcpip.Error) {
   276  	// Fill message headers.
   277  	for k := range d.msgHdrs {
   278  		if d.msgHdrs[k].Msg.Iovlen > 0 {
   279  			break
   280  		}
   281  		iovecs := d.bufs[k].nextIovecs()
   282  		iovLen := len(iovecs)
   283  		d.msgHdrs[k].Len = 0
   284  		d.msgHdrs[k].Msg.Iov = &iovecs[0]
   285  		d.msgHdrs[k].Msg.SetIovlen(iovLen)
   286  	}
   287  
   288  	nMsgs, err := rawfile.BlockingRecvMMsgUntilStopped(d.EFD, d.fd, d.msgHdrs)
   289  	if nMsgs == -1 || err != nil {
   290  		return false, err
   291  	}
   292  	// Process each of received packets.
   293  	// Keep a list of packets so we can DecRef outside of the loop.
   294  	var pkts stack.PacketBufferList
   295  
   296  	d.e.mu.RLock()
   297  	dsp := d.e.dispatcher
   298  	d.e.mu.RUnlock()
   299  
   300  	defer func() { pkts.DecRef() }()
   301  	for k := 0; k < nMsgs; k++ {
   302  		n := int(d.msgHdrs[k].Len)
   303  		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
   304  			Payload: d.bufs[k].pullBuffer(n),
   305  		})
   306  		pkts.PushBack(pkt)
   307  
   308  		// Mark that this iovec has been processed.
   309  		d.msgHdrs[k].Msg.Iovlen = 0
   310  
   311  		var p tcpip.NetworkProtocolNumber
   312  		if d.e.hdrSize > 0 {
   313  			hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize)
   314  			if !ok {
   315  				return false, nil
   316  			}
   317  			p = header.Ethernet(hdr).Type()
   318  		} else {
   319  			// We don't get any indication of what the packet is, so try to guess
   320  			// if it's an IPv4 or IPv6 packet.
   321  			// IP version information is at the first octet, so pulling up 1 byte.
   322  			h, ok := pkt.Data().PullUp(1)
   323  			if !ok {
   324  				// Skip this packet.
   325  				continue
   326  			}
   327  			switch header.IPVersion(h) {
   328  			case header.IPv4Version:
   329  				p = header.IPv4ProtocolNumber
   330  			case header.IPv6Version:
   331  				p = header.IPv6ProtocolNumber
   332  			default:
   333  				// Skip this packet.
   334  				continue
   335  			}
   336  		}
   337  
   338  		dsp.DeliverNetworkPacket(p, pkt)
   339  	}
   340  
   341  	return true, nil
   342  }