github.com/cilium/ebpf@v0.15.1-0.20240517100537-8079b37aa138/perf/ring.go

package perf

import (
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"runtime"
	"sync/atomic"
	"unsafe"

	"github.com/cilium/ebpf/internal/sys"
	"github.com/cilium/ebpf/internal/unix"
)

// perfEventRing is a page of metadata followed by
// a variable number of pages which form a ring buffer.
type perfEventRing struct {
	cpu  int
	mmap []byte
	ringReader
}

func newPerfEventRing(cpu, perCPUBuffer int, opts ReaderOptions) (_ *sys.FD, _ *perfEventRing, err error) {
	closeOnError := func(c io.Closer) {
		if err != nil {
			c.Close()
		}
	}

	if opts.Watermark >= perCPUBuffer {
		return nil, nil, errors.New("watermark must be smaller than perCPUBuffer")
	}

	fd, err := createPerfEvent(cpu, opts)
	if err != nil {
		return nil, nil, err
	}
	defer closeOnError(fd)

	if err := unix.SetNonblock(fd.Int(), true); err != nil {
		return nil, nil, err
	}

	protections := unix.PROT_READ
	if !opts.Overwritable {
		protections |= unix.PROT_WRITE
	}

	mmap, err := unix.Mmap(fd.Int(), 0, perfBufferSize(perCPUBuffer), protections, unix.MAP_SHARED)
	if err != nil {
		return nil, nil, fmt.Errorf("can't mmap: %v", err)
	}

	// This relies on the fact that we allocate an extra metadata page,
	// and that the struct is smaller than an OS page.
	// This use of unsafe.Pointer isn't explicitly sanctioned by the
	// documentation, since a byte is smaller than unix.PerfEventMmapPage.
	meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&mmap[0]))

	var reader ringReader
	if opts.Overwritable {
		reader = newReverseReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	} else {
		reader = newForwardReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	}

	ring := &perfEventRing{
		cpu:        cpu,
		mmap:       mmap,
		ringReader: reader,
	}
	runtime.SetFinalizer(ring, (*perfEventRing).Close)

	return fd, ring, nil
}

// perfBufferSize returns a valid mmap buffer size for use with perf_event_open (1+2^n pages)
func perfBufferSize(perCPUBuffer int) int {
	pageSize := os.Getpagesize()

	// Smallest whole number of pages
	nPages := (perCPUBuffer + pageSize - 1) / pageSize

	// Round up to nearest power of two number of pages
	nPages = int(math.Pow(2, math.Ceil(math.Log2(float64(nPages)))))

	// Add one for metadata
	nPages += 1

	return nPages * pageSize
}
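
// A worked illustration of the sizing rule above, assuming a hypothetical
// 4 KiB page size (the real value comes from os.Getpagesize):
//
//	perfBufferSize(1)    == 2 * 4096 // 1 data page + 1 metadata page
//	perfBufferSize(4096) == 2 * 4096 // exactly one data page
//	perfBufferSize(4097) == 3 * 4096 // rounded up to 2 data pages
//	perfBufferSize(9000) == 5 * 4096 // 3 pages rounded up to 4 data pages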

func (ring *perfEventRing) Close() error {
	runtime.SetFinalizer(ring, nil)
	mmap := ring.mmap
	ring.mmap = nil
	return unix.Munmap(mmap)
}

func createPerfEvent(cpu int, opts ReaderOptions) (*sys.FD, error) {
	wakeup := 0
	bits := 0
	if opts.WakeupEvents > 0 {
		wakeup = opts.WakeupEvents
	} else {
		wakeup = opts.Watermark
		if wakeup == 0 {
			wakeup = 1
		}
		bits |= unix.PerfBitWatermark
	}

	if opts.Overwritable {
		bits |= unix.PerfBitWriteBackward
	}

	attr := unix.PerfEventAttr{
		Type:        unix.PERF_TYPE_SOFTWARE,
		Config:      unix.PERF_COUNT_SW_BPF_OUTPUT,
		Bits:        uint64(bits),
		Sample_type: unix.PERF_SAMPLE_RAW,
		Wakeup:      uint32(wakeup),
	}

	attr.Size = uint32(unsafe.Sizeof(attr))
	fd, err := unix.PerfEventOpen(&attr, -1, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC)
	if err != nil {
		return nil, fmt.Errorf("can't create perf event: %w", err)
	}
	return sys.NewFD(fd)
}
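
// A rough illustration of how ReaderOptions map onto the attr above, using
// hypothetical option values:
//
//	ReaderOptions{Watermark: 4096}  // Wakeup = 4096, PerfBitWatermark set:
//	                                // wake readers once 4096 bytes are buffered.
//	ReaderOptions{WakeupEvents: 10} // Wakeup = 10, watermark bit unset:
//	                                // wake readers every 10 samples.
//
// Overwritable additionally sets PerfBitWriteBackward, and with neither
// Watermark nor WakeupEvents set the wakeup defaults to a 1-byte watermark.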

type ringReader interface {
	// loadHead refreshes the reader's snapshot of the kernel's write position.
	loadHead()
	// size returns the size of the ring's data area in bytes.
	size() int
	// remaining returns the number of unread bytes, or -1 if that can't be determined.
	remaining() int
	// writeTail commits the current read position back to the kernel, if the ring requires it.
	writeTail()
	Read(p []byte) (int, error)
}
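
// drainRingSketch is an illustrative sketch, not part of the package API: it
// shows the calling sequence a consumer of ringReader is expected to follow.
// The real Reader in this package additionally multiplexes per-CPU rings and
// blocks on epoll before reading.
func drainRingSketch(rr ringReader, scratch []byte) {
	// Take a snapshot of the kernel's current write position.
	rr.loadHead()
	for {
		n, err := rr.Read(scratch)
		_ = scratch[:n] // a real consumer would parse a perf event header and payload here
		if err == io.EOF {
			// Everything up to the snapshotted head has been consumed.
			break
		}
	}
	// Publish the new read position so the kernel can reuse the space.
	rr.writeTail()
}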

// forwardReader reads a non-overwritable ring buffer, consuming samples in
// the order the kernel wrote them.
type forwardReader struct {
	meta       *unix.PerfEventMmapPage
	head, tail uint64
	mask       uint64
	ring       []byte
}

func newForwardReader(meta *unix.PerfEventMmapPage, ring []byte) *forwardReader {
	return &forwardReader{
		meta: meta,
		head: atomic.LoadUint64(&meta.Data_head),
		tail: atomic.LoadUint64(&meta.Data_tail),
		// cap is always a power of two
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
}

func (rr *forwardReader) loadHead() {
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
}

func (rr *forwardReader) size() int {
	return len(rr.ring)
}

func (rr *forwardReader) remaining() int {
	return int((rr.head - rr.tail) & rr.mask)
}

func (rr *forwardReader) writeTail() {
	// Commit the new tail. This lets the kernel know that
	// the ring buffer has been consumed.
	atomic.StoreUint64(&rr.meta.Data_tail, rr.tail)
}

func (rr *forwardReader) Read(p []byte) (int, error) {
	start := int(rr.tail & rr.mask)

	n := len(p)
	// Truncate if the read wraps in the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.head - rr.tail); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.tail += uint64(n)

	if rr.tail == rr.head {
		return n, io.EOF
	}

	return n, nil
}
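
// A worked example of the truncation above, with hypothetical values: for a
// 16-byte ring with tail = 14 and head = 20, a Read with len(p) == 8 first
// returns the 2 bytes at offsets 14-15 (cut short at the end of the ring,
// err == nil), and a second Read returns the remaining 4 bytes at offsets
// 0-3 together with io.EOF.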

// reverseReader reads an overwritable (write-backward) ring buffer, starting
// from the most recently written sample and moving backwards in time.
type reverseReader struct {
	meta *unix.PerfEventMmapPage
	// head is the position where the kernel last wrote data.
	head uint64
	// read is the position we read the next data from. Updated as reads are made.
	read uint64
	// tail is the end of the readable data. Reads must not go past it.
	tail uint64
	mask uint64
	ring []byte
}

func newReverseReader(meta *unix.PerfEventMmapPage, ring []byte) *reverseReader {
	rr := &reverseReader{
		meta: meta,
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
	rr.loadHead()
	return rr
}

func (rr *reverseReader) loadHead() {
	// The diagram below represents an overwritable perf ring buffer:
	//
	//    head     read                            tail
	//     |        |                               |
	//     V        V                               V
	// +---+--------+------------+---------+--------+
	// |   |H-D....D|H-C........C|H-B.....B|H-A....A|
	// +---+--------+------------+---------+--------+
	// <--Write from right to left
	//                     Read from left to right-->
	// (H means header)
	//
	// The buffer is read left to right, beginning at head and ending at tail.
	// [head, read) is the read portion of the buffer, [read, tail) the unread one.
	// read is adjusted as we progress through the buffer.

	// Avoid reading sample D multiple times by discarding unread samples C, B, A.
	rr.tail = rr.head

	// Get the new head and start reading from it.
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
	rr.read = rr.head

	if rr.tail-rr.head > uint64(cap(rr.ring)) {
		// The ring has been fully written; only permit at most cap(rr.ring)
		// bytes to be read.
		rr.tail = rr.head + uint64(cap(rr.ring))
	}
}
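
// A worked example of the clamping above, with hypothetical values: for a
// 16-byte ring whose previous head was 0, suppose the kernel has since
// written 40 bytes backwards, so the new Data_head is 0 - 40 in uint64
// arithmetic. tail - head is then 40 > 16, so tail is clamped to head + 16:
// only the most recent 16 bytes (one full ring) are readable and the older,
// overwritten samples are skipped.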

func (rr *reverseReader) size() int {
	return len(rr.ring)
}

func (rr *reverseReader) remaining() int {
	// The amount of remaining data is inaccurate for overwritable buffers
	// once an overwrite happens, so return -1 here.
	return -1
}

func (rr *reverseReader) writeTail() {
	// The kernel ignores the tail pointer for an overwritable perf buffer,
	// so this function is a no-op.
}

func (rr *reverseReader) Read(p []byte) (int, error) {
	start := int(rr.read & rr.mask)

	n := len(p)
	// Truncate if the read wraps in the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.tail - rr.read); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.read += uint64(n)

	if rr.read == rr.tail {
		return n, io.EOF
	}

	return n, nil
}
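
// A worked example of a reverse read, with hypothetical values: for a
// 16-byte ring with head = 100 and tail = 108, a Read with len(p) == 16
// starts at offset 100 & mask == 4, is truncated to the 8 available bytes,
// and returns n == 8 together with io.EOF since read has caught up with tail.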