github.com/cilium/ebpf@v0.15.0/perf/ring.go

package perf

import (
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"runtime"
	"sync/atomic"
	"unsafe"

	"github.com/cilium/ebpf/internal/unix"
)

// perfEventRing is a page of metadata followed by
// a variable number of pages which form a ring buffer.
type perfEventRing struct {
	fd   int
	cpu  int
	mmap []byte
	ringReader
}

func newPerfEventRing(cpu, perCPUBuffer int, opts ReaderOptions) (*perfEventRing, error) {
	if opts.Watermark >= perCPUBuffer {
		return nil, errors.New("watermark must be smaller than perCPUBuffer")
	}

	fd, err := createPerfEvent(cpu, opts)
	if err != nil {
		return nil, err
	}

	if err := unix.SetNonblock(fd, true); err != nil {
		unix.Close(fd)
		return nil, err
	}

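	// An overwritable ring is mapped read-only: the kernel writes such a ring
	// backwards and ignores Data_tail, so user space never needs to update the
	// metadata page. A regular forward ring needs PROT_WRITE so the reader can
	// publish Data_tail after consuming samples.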
	protections := unix.PROT_READ
	if !opts.Overwritable {
		protections |= unix.PROT_WRITE
	}

	mmap, err := unix.Mmap(fd, 0, perfBufferSize(perCPUBuffer), protections, unix.MAP_SHARED)
	if err != nil {
		unix.Close(fd)
		return nil, fmt.Errorf("can't mmap: %w", err)
	}

	// This relies on the fact that we allocate an extra metadata page,
	// and that the struct is smaller than an OS page.
	// This use of unsafe.Pointer isn't explicitly sanctioned by the
	// documentation, since a byte is smaller than PerfEventMmapPage.
	meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&mmap[0]))

	var reader ringReader
	if opts.Overwritable {
		reader = newReverseReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	} else {
		reader = newForwardReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	}

	ring := &perfEventRing{
		fd:         fd,
		cpu:        cpu,
		mmap:       mmap,
		ringReader: reader,
	}
	runtime.SetFinalizer(ring, (*perfEventRing).Close)

	return ring, nil
}
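
// newPerfEventRings is an illustrative sketch, not part of the upstream file:
// it shows how a ring per CPU could be assembled from newPerfEventRing, which
// is roughly what this package's Reader does internally. nCPU would come from
// the number of possible CPUs on the system.
func newPerfEventRings(nCPU, perCPUBuffer int, opts ReaderOptions) ([]*perfEventRing, error) {
	rings := make([]*perfEventRing, 0, nCPU)
	for cpu := 0; cpu < nCPU; cpu++ {
		ring, err := newPerfEventRing(cpu, perCPUBuffer, opts)
		if err != nil {
			// Release everything created so far before bailing out.
			for _, r := range rings {
				r.Close()
			}
			return nil, fmt.Errorf("create ring for CPU %d: %w", cpu, err)
		}
		rings = append(rings, ring)
	}
	return rings, nil
}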

// perfBufferSize returns a valid mmap buffer size for use with perf_event_open (1+2^n pages)
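// For example, with 4096-byte pages a perCPUBuffer of 8000 bytes needs two data
// pages; two is already a power of two, so the result is (2+1)*4096 = 12288 bytes.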
func perfBufferSize(perCPUBuffer int) int {
	pageSize := os.Getpagesize()

	// Smallest whole number of pages
	nPages := (perCPUBuffer + pageSize - 1) / pageSize

	// Round up to nearest power of two number of pages
	nPages = int(math.Pow(2, math.Ceil(math.Log2(float64(nPages)))))

	// Add one for metadata
	nPages += 1

	return nPages * pageSize
}

func (ring *perfEventRing) Close() {
	runtime.SetFinalizer(ring, nil)

	_ = unix.Close(ring.fd)
	_ = unix.Munmap(ring.mmap)

	ring.fd = -1
	ring.mmap = nil
}

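// createPerfEvent opens a PERF_COUNT_SW_BPF_OUTPUT software event on the given
// CPU. The returned fd is what BPF programs target with bpf_perf_event_output.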
func createPerfEvent(cpu int, opts ReaderOptions) (int, error) {
	wakeup := 0
	bits := 0
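	// Wake up the reader either after WakeupEvents samples have been written or,
	// by default, once Watermark bytes are available; PerfBitWatermark selects
	// the byte-based mode. A zero watermark is rounded up to one byte so the
	// reader is woken for every sample.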
	if opts.WakeupEvents > 0 {
		wakeup = opts.WakeupEvents
	} else {
		wakeup = opts.Watermark
		if wakeup == 0 {
			wakeup = 1
		}
		bits |= unix.PerfBitWatermark
	}

	if opts.Overwritable {
		bits |= unix.PerfBitWriteBackward
	}

	attr := unix.PerfEventAttr{
		Type:        unix.PERF_TYPE_SOFTWARE,
		Config:      unix.PERF_COUNT_SW_BPF_OUTPUT,
		Bits:        uint64(bits),
		Sample_type: unix.PERF_SAMPLE_RAW,
		Wakeup:      uint32(wakeup),
	}

	attr.Size = uint32(unsafe.Sizeof(attr))
	fd, err := unix.PerfEventOpen(&attr, -1, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC)
	if err != nil {
		return -1, fmt.Errorf("can't create perf event: %w", err)
	}
	return fd, nil
}

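// ringReader abstracts over forward (regular) and reverse (overwritable) ring
// buffers so that Reader can consume either through the same interface.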
type ringReader interface {
	loadHead()
	size() int
	remaining() int
	writeTail()
	Read(p []byte) (int, error)
}

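// forwardReader reads a regular, forward-writing ring buffer: the kernel
// advances Data_head as it writes samples and the reader publishes Data_tail
// once they have been consumed.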
type forwardReader struct {
	meta       *unix.PerfEventMmapPage
	head, tail uint64
	mask       uint64
	ring       []byte
}

func newForwardReader(meta *unix.PerfEventMmapPage, ring []byte) *forwardReader {
	return &forwardReader{
		meta: meta,
		head: atomic.LoadUint64(&meta.Data_head),
		tail: atomic.LoadUint64(&meta.Data_tail),
		// cap is always a power of two
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
}

func (rr *forwardReader) loadHead() {
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
}

func (rr *forwardReader) size() int {
	return len(rr.ring)
}

func (rr *forwardReader) remaining() int {
	return int((rr.head - rr.tail) & rr.mask)
}

func (rr *forwardReader) writeTail() {
	// Commit the new tail. This lets the kernel know that
	// the ring buffer has been consumed.
	atomic.StoreUint64(&rr.meta.Data_tail, rr.tail)
}

func (rr *forwardReader) Read(p []byte) (int, error) {
	start := int(rr.tail & rr.mask)

	n := len(p)
	// Truncate if the read wraps around the end of the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.head - rr.tail); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.tail += uint64(n)

	if rr.tail == rr.head {
		return n, io.EOF
	}

	return n, nil
}
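
// drainRing is an illustrative sketch, not part of the upstream file: it shows
// how either ring flavour can be emptied through the ringReader interface. Read
// hands back raw ring bytes and signals io.EOF once the current snapshot of the
// ring is exhausted; this package's Reader decodes perf_event_header records
// from that byte stream instead of collecting it wholesale. scratch must not be
// empty.
func drainRing(rr ringReader, scratch []byte) ([]byte, error) {
	var out []byte
	for {
		n, err := rr.Read(scratch)
		out = append(out, scratch[:n]...)
		if err == io.EOF {
			return out, nil
		}
		if err != nil {
			return out, err
		}
	}
}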

type reverseReader struct {
	meta *unix.PerfEventMmapPage
	// head is the position where the kernel last wrote data.
	head uint64
	// read is the position we read the next data from. Updated as reads are made.
	read uint64
	// tail is the end of the ring buffer. No reads must be made past it.
	tail uint64
	mask uint64
	ring []byte
}

func newReverseReader(meta *unix.PerfEventMmapPage, ring []byte) *reverseReader {
	rr := &reverseReader{
		meta: meta,
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
	rr.loadHead()
	return rr
}

func (rr *reverseReader) loadHead() {
	// The diagram below represents an overwritable perf ring buffer:
	//
	//    head     read                            tail
	//     |        |                               |
	//     V        V                               V
	// +---+--------+------------+---------+--------+
	// |   |H-D....D|H-C........C|H-B.....B|H-A....A|
	// +---+--------+------------+---------+--------+
	// <--Write from right to left
	//                     Read from left to right-->
	// (H means header)
	//
	// The buffer is read left to right, beginning at head and ending at tail.
	// [head, read) is the portion already consumed, [read, tail) the unread one.
	// read is adjusted as we progress through the buffer.

	// Avoid reading sample D multiple times by discarding unread samples C, B, A.
	rr.tail = rr.head

	// Get the new head and start reading from it.
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
	rr.read = rr.head

	if rr.tail-rr.head > uint64(cap(rr.ring)) {
		// The ring has been fully written since the last snapshot; permit at
		// most cap(rr.ring) bytes to be read.
		rr.tail = rr.head + uint64(cap(rr.ring))
	}
}

func (rr *reverseReader) size() int {
	return len(rr.ring)
}

func (rr *reverseReader) remaining() int {
	// The amount of remaining data is inaccurate for overwritable buffers
	// once an overwrite happens, so return -1 here.
	return -1
}

func (rr *reverseReader) writeTail() {
	// The kernel ignores Data_tail for overwritable (backward-writing) buffers,
	// so this is a no-op.
}

func (rr *reverseReader) Read(p []byte) (int, error) {
	start := int(rr.read & rr.mask)

	n := len(p)
	// Truncate if the read wraps around the end of the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.tail - rr.read); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.read += uint64(n)

	if rr.read == rr.tail {
		return n, io.EOF
	}

	return n, nil
}
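
// readNewest is an illustrative sketch, not part of the upstream file: it shows
// one way an overwritable ring can be consumed. loadHead snapshots the current
// Data_head and discards whatever was left over from the previous snapshot, so
// each call returns at most one buffer's worth of the most recent raw bytes,
// newest sample first. scratch must not be empty.
func readNewest(rr *reverseReader, scratch []byte) []byte {
	rr.loadHead()

	var out []byte
	for {
		n, err := rr.Read(scratch)
		out = append(out, scratch[:n]...)
		if err == io.EOF {
			return out
		}
	}
}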