github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/link/fdbased/mmap.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build (linux && amd64) || (linux && arm64)
    16  // +build linux,amd64 linux,arm64
    17  
    18  package fdbased
    19  
    20  import (
    21  	"encoding/binary"
    22  	"fmt"
    23  
    24  	"golang.org/x/sys/unix"
    25  	"github.com/sagernet/gvisor/pkg/buffer"
    26  	"github.com/sagernet/gvisor/pkg/tcpip"
    27  	"github.com/sagernet/gvisor/pkg/tcpip/header"
    28  	"github.com/sagernet/gvisor/pkg/tcpip/link/rawfile"
    29  	"github.com/sagernet/gvisor/pkg/tcpip/link/stopfd"
    30  	"github.com/sagernet/gvisor/pkg/tcpip/stack"
    31  )
    32  
    33  const (
    34  	tPacketAlignment = uintptr(16)
    35  	tpStatusKernel   = 0
    36  	tpStatusUser     = 1
    37  	tpStatusCopy     = 2
    38  	tpStatusLosing   = 4
    39  )
    40  
    41  // We overallocate the frame size to accommodate space for the
    42  // TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
    43  //
    44  // Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB
    45  //
    46  // NOTE:
    47  //
    48  //	Frames need to be aligned at 16 byte boundaries.
    49  //	BlockSize needs to be page aligned.
    50  //
    51  //	For details see PACKET_MMAP setting constraints in
    52  //	https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
    53  const (
    54  	tpFrameSize = 65536 + 128
    55  	tpBlockSize = tpFrameSize * 32
    56  	tpBlockNR   = 1
    57  	tpFrameNR   = (tpBlockSize * tpBlockNR) / tpFrameSize
    58  )
    59  
    60  // tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct
    61  // translation of the TPACKET_ALIGN macro in <linux/if_packet.h>.
    62  func tPacketAlign(v uintptr) uintptr {
    63  	return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1))
    64  }
    65  
    66  // tPacketReq is the tpacket_req structure as described in
    67  // https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
    68  type tPacketReq struct {
    69  	tpBlockSize uint32
    70  	tpBlockNR   uint32
    71  	tpFrameSize uint32
    72  	tpFrameNR   uint32
    73  }
    74  
    75  // tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h>
    76  type tPacketHdr []byte
    77  
    78  const (
    79  	tpStatusOffset  = 0
    80  	tpLenOffset     = 8
    81  	tpSnapLenOffset = 12
    82  	tpMacOffset     = 16
    83  	tpNetOffset     = 18
    84  	tpSecOffset     = 20
    85  	tpUSecOffset    = 24
    86  )
    87  
    88  func (t tPacketHdr) tpLen() uint32 {
    89  	return binary.LittleEndian.Uint32(t[tpLenOffset:])
    90  }
    91  
    92  func (t tPacketHdr) tpSnapLen() uint32 {
    93  	return binary.LittleEndian.Uint32(t[tpSnapLenOffset:])
    94  }
    95  
    96  func (t tPacketHdr) tpMac() uint16 {
    97  	return binary.LittleEndian.Uint16(t[tpMacOffset:])
    98  }
    99  
   100  func (t tPacketHdr) tpNet() uint16 {
   101  	return binary.LittleEndian.Uint16(t[tpNetOffset:])
   102  }
   103  
   104  func (t tPacketHdr) tpSec() uint32 {
   105  	return binary.LittleEndian.Uint32(t[tpSecOffset:])
   106  }
   107  
   108  func (t tPacketHdr) tpUSec() uint32 {
   109  	return binary.LittleEndian.Uint32(t[tpUSecOffset:])
   110  }
   111  
   112  func (t tPacketHdr) Payload() []byte {
   113  	return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()]
   114  }
   115  
// packetMMapDispatcher uses PACKET_RX_RING's to read/dispatch inbound packets.
// See: mmap_amd64_unsafe.go for implementation details.
//
// It walks the ring one frame at a time: readMMappedPacket copies the next
// ready frame out of ringBuffer and dispatch hands it to the endpoint's
// network dispatcher.
type packetMMapDispatcher struct {
	// StopFD is embedded for its EFD, which readMMappedPacket passes to the
	// blocking poll so that a stop request can interrupt the wait.
	stopfd.StopFD
	// fd is the file descriptor used to send and receive packets.
	fd int

	// e is the endpoint this dispatcher is attached to.
	e *endpoint

	// ringBuffer is the mmapped PACKET_RX_RING buffer, starting at its
	// first frame.
	ringBuffer []byte

	// ringOffset is the index, counted in frames of tpFrameSize bytes rather
	// than in bytes, of the frame where the next inbound packet will be
	// placed by the kernel. It wraps around at tpFrameNR.
	ringOffset int
}
   134  
   135  func (*packetMMapDispatcher) release() {}
   136  
   137  func (d *packetMMapDispatcher) readMMappedPacket() (*buffer.View, bool, tcpip.Error) {
   138  	hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:])
   139  	for hdr.tpStatus()&tpStatusUser == 0 {
   140  		stopped, errno := rawfile.BlockingPollUntilStopped(d.EFD, d.fd, unix.POLLIN|unix.POLLERR)
   141  		if errno != 0 {
   142  			if errno == unix.EINTR {
   143  				continue
   144  			}
   145  			return nil, stopped, rawfile.TranslateErrno(errno)
   146  		}
   147  		if stopped {
   148  			return nil, true, nil
   149  		}
   150  		if hdr.tpStatus()&tpStatusCopy != 0 {
   151  			// This frame is truncated so skip it after flipping the
   152  			// buffer to the kernel.
   153  			hdr.setTPStatus(tpStatusKernel)
   154  			d.ringOffset = (d.ringOffset + 1) % tpFrameNR
   155  			hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:])
   156  			continue
   157  		}
   158  	}
   159  
   160  	// Copy out the packet from the mmapped frame to a locally owned buffer.
   161  	pkt := buffer.NewView(int(hdr.tpSnapLen()))
   162  	pkt.Write(hdr.Payload())
   163  	// Release packet to kernel.
   164  	hdr.setTPStatus(tpStatusKernel)
   165  	d.ringOffset = (d.ringOffset + 1) % tpFrameNR
   166  	return pkt, false, nil
   167  }
   168  
   169  // dispatch reads packets from an mmaped ring buffer and dispatches them to the
   170  // network stack.
   171  func (d *packetMMapDispatcher) dispatch() (bool, tcpip.Error) {
   172  	pkt, stopped, err := d.readMMappedPacket()
   173  	if err != nil || stopped {
   174  		return false, err
   175  	}
   176  	var p tcpip.NetworkProtocolNumber
   177  	if d.e.hdrSize > 0 {
   178  		p = header.Ethernet(pkt.AsSlice()).Type()
   179  	} else {
   180  		// We don't get any indication of what the packet is, so try to guess
   181  		// if it's an IPv4 or IPv6 packet.
   182  		switch header.IPVersion(pkt.AsSlice()) {
   183  		case header.IPv4Version:
   184  			p = header.IPv4ProtocolNumber
   185  		case header.IPv6Version:
   186  			p = header.IPv6ProtocolNumber
   187  		default:
   188  			return true, nil
   189  		}
   190  	}
   191  
   192  	pbuf := stack.NewPacketBuffer(stack.PacketBufferOptions{
   193  		Payload: buffer.MakeWithView(pkt),
   194  	})
   195  	defer pbuf.DecRef()
   196  	if d.e.hdrSize > 0 {
   197  		if _, ok := pbuf.LinkHeader().Consume(d.e.hdrSize); !ok {
   198  			panic(fmt.Sprintf("LinkHeader().Consume(%d) must succeed", d.e.hdrSize))
   199  		}
   200  	}
   201  	d.e.mu.RLock()
   202  	dsp := d.e.dispatcher
   203  	d.e.mu.RUnlock()
   204  	dsp.DeliverNetworkPacket(p, pbuf)
   205  	return true, nil
   206  }