github.com/cilium/ebpf@v0.16.0/perf/reader.go (about)

     1  package perf
     2  
     3  import (
     4  	"encoding/binary"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"runtime"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/cilium/ebpf"
    14  	"github.com/cilium/ebpf/internal"
    15  	"github.com/cilium/ebpf/internal/epoll"
    16  	"github.com/cilium/ebpf/internal/sys"
    17  	"github.com/cilium/ebpf/internal/unix"
    18  )
    19  
var (
	// ErrClosed is returned by Read/ReadInto after the Reader has been closed.
	ErrClosed  = os.ErrClosed
	// ErrFlushed is returned by Read/ReadInto once all records pending at the
	// time of a Flush call have been returned.
	ErrFlushed = epoll.ErrFlushed
	// errEOR signals that a single per-CPU ring has been fully consumed.
	errEOR     = errors.New("end of ring")
)
    25  
// perfEventHeaderSize is the wire size of perfEventHeader (8 bytes).
var perfEventHeaderSize = binary.Size(perfEventHeader{})

// perfEventHeader must match 'struct perf_event_header' in <linux/perf_event.h>.
type perfEventHeader struct {
	Type uint32
	Misc uint16
	Size uint16
}
    34  
// Record contains either a sample or a counter of the
// number of lost samples.
type Record struct {
	// The CPU this record was generated on.
	CPU int

	// The data submitted via bpf_perf_event_output.
	// Due to a kernel bug, this can contain between 0 and 7 bytes of trailing
	// garbage from the ring depending on the input sample's length.
	RawSample []byte

	// The number of samples which could not be output, since
	// the ring buffer was full. Zero for sample records.
	LostSamples uint64

	// The minimum number of bytes remaining in the per-CPU buffer after this Record has been read.
	// Negative for overwritable buffers.
	Remaining int
}
    54  
    55  // Read a record from a reader and tag it as being from the given CPU.
    56  //
    57  // buf must be at least perfEventHeaderSize bytes long.
    58  func readRecord(rd io.Reader, rec *Record, buf []byte, overwritable bool) error {
    59  	// Assert that the buffer is large enough.
    60  	buf = buf[:perfEventHeaderSize]
    61  	_, err := io.ReadFull(rd, buf)
    62  	if errors.Is(err, io.EOF) {
    63  		return errEOR
    64  	} else if err != nil {
    65  		return fmt.Errorf("read perf event header: %v", err)
    66  	}
    67  
    68  	header := perfEventHeader{
    69  		internal.NativeEndian.Uint32(buf[0:4]),
    70  		internal.NativeEndian.Uint16(buf[4:6]),
    71  		internal.NativeEndian.Uint16(buf[6:8]),
    72  	}
    73  
    74  	switch header.Type {
    75  	case unix.PERF_RECORD_LOST:
    76  		rec.RawSample = rec.RawSample[:0]
    77  		rec.LostSamples, err = readLostRecords(rd)
    78  		return err
    79  
    80  	case unix.PERF_RECORD_SAMPLE:
    81  		rec.LostSamples = 0
    82  		// We can reuse buf here because perfEventHeaderSize > perfEventSampleSize.
    83  		rec.RawSample, err = readRawSample(rd, buf, rec.RawSample)
    84  		return err
    85  
    86  	default:
    87  		return &unknownEventError{header.Type}
    88  	}
    89  }
    90  
    91  func readLostRecords(rd io.Reader) (uint64, error) {
    92  	// lostHeader must match 'struct perf_event_lost in kernel sources.
    93  	var lostHeader struct {
    94  		ID   uint64
    95  		Lost uint64
    96  	}
    97  
    98  	err := binary.Read(rd, internal.NativeEndian, &lostHeader)
    99  	if err != nil {
   100  		return 0, fmt.Errorf("can't read lost records header: %v", err)
   101  	}
   102  
   103  	return lostHeader.Lost, nil
   104  }
   105  
// perfEventSampleSize is the wire size of perfEventSample (4 bytes).
var perfEventSampleSize = binary.Size(uint32(0))

// This must match 'struct perf_event_sample' in kernel sources.
type perfEventSample struct {
	Size uint32
}
   112  
   113  func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) {
   114  	buf = buf[:perfEventSampleSize]
   115  	if _, err := io.ReadFull(rd, buf); err != nil {
   116  		return nil, fmt.Errorf("read sample size: %w", err)
   117  	}
   118  
   119  	sample := perfEventSample{
   120  		internal.NativeEndian.Uint32(buf),
   121  	}
   122  
   123  	var data []byte
   124  	if size := int(sample.Size); cap(sampleBuf) < size {
   125  		data = make([]byte, size)
   126  	} else {
   127  		data = sampleBuf[:size]
   128  	}
   129  
   130  	if _, err := io.ReadFull(rd, data); err != nil {
   131  		return nil, fmt.Errorf("read sample: %w", err)
   132  	}
   133  	return data, nil
   134  }
   135  
// Reader allows reading bpf_perf_event_output
// from user space.
type Reader struct {
	poller *epoll.Poller

	// mu protects read/write access to the Reader structure with the
	// exception fields protected by 'pauseMu'.
	// If locking both 'mu' and 'pauseMu', 'mu' must be locked first.
	mu           sync.Mutex
	array        *ebpf.Map         // cloned PERF_EVENT_ARRAY; keeps event fds stored in it alive
	rings        []*perfEventRing  // one ring per online CPU; nil after Close
	epollEvents  []unix.EpollEvent // scratch buffer for poller.Wait
	epollRings   []*perfEventRing  // rings still holding unread data, drained back-to-front
	eventHeader  []byte            // perfEventHeaderSize scratch bytes for readRecord
	deadline     time.Time         // zero value blocks Read indefinitely
	overwritable bool
	bufferSize   int
	pendingErr   error // deadline/flush error deferred until all rings are drained

	// pauseMu protects eventFds so that Pause / Resume can be invoked while
	// Read is blocked.
	pauseMu  sync.Mutex
	eventFds []*sys.FD
	paused   bool
}
   161  
// ReaderOptions control the behaviour of the user
// space reader.
type ReaderOptions struct {
	// The number of events required in any per CPU buffer before
	// Read will process data. This is mutually exclusive with Watermark.
	// The default is zero, which means Watermark will take precedence.
	WakeupEvents int
	// The number of written bytes required in any per CPU buffer before
	// Read will process data. Must be smaller than PerCPUBuffer.
	// The default is to start processing as soon as data is available.
	Watermark int
	// This perf ring buffer is overwritable, once full the oldest event will be
	// overwritten by newest.
	//
	// Note: reading from an overwritable buffer requires calling Pause first;
	// Read/ReadInto return an error otherwise.
	Overwritable bool
}
   177  
   178  // NewReader creates a new reader with default options.
   179  //
   180  // array must be a PerfEventArray. perCPUBuffer gives the size of the
   181  // per CPU buffer in bytes. It is rounded up to the nearest multiple
   182  // of the current page size.
   183  func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) {
   184  	return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{})
   185  }
   186  
// NewReaderWithOptions creates a new reader with the given options.
func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) {
	// closeOnError reads the named return 'err': deferred calls only close
	// their resource when this function is returning a failure.
	closeOnError := func(c io.Closer) {
		if err != nil {
			c.Close()
		}
	}

	if perCPUBuffer < 1 {
		return nil, errors.New("perCPUBuffer must be larger than 0")
	}
	if opts.WakeupEvents > 0 && opts.Watermark > 0 {
		return nil, errors.New("WakeupEvents and Watermark cannot both be non-zero")
	}

	var (
		nCPU     = int(array.MaxEntries())
		rings    = make([]*perfEventRing, 0, nCPU)
		eventFds = make([]*sys.FD, 0, nCPU)
	)

	poller, err := epoll.New()
	if err != nil {
		return nil, err
	}
	defer closeOnError(poller)

	// bpf_perf_event_output checks which CPU an event is enabled on,
	// but doesn't allow using a wildcard like -1 to specify "all CPUs".
	// Hence we have to create a ring for each CPU.
	bufferSize := 0
	for i := 0; i < nCPU; i++ {
		event, ring, err := newPerfEventRing(i, perCPUBuffer, opts)
		if errors.Is(err, unix.ENODEV) {
			// The requested CPU is currently offline, skip it.
			continue
		}

		if err != nil {
			return nil, fmt.Errorf("failed to create perf ring for CPU %d: %v", i, err)
		}
		// Deferring inside the loop is deliberate: each successfully created
		// resource must be released if a later iteration (or anything below)
		// fails; closeOnError is a no-op on success.
		defer closeOnError(event)
		defer closeOnError(ring)

		bufferSize = ring.size()
		rings = append(rings, ring)
		eventFds = append(eventFds, event)

		if err := poller.Add(event.Int(), 0); err != nil {
			return nil, err
		}
	}

	// Closing a PERF_EVENT_ARRAY removes all event fds
	// stored in it, so we keep a reference alive.
	array, err = array.Clone()
	if err != nil {
		return nil, err
	}

	pr = &Reader{
		array:        array,
		rings:        rings,
		poller:       poller,
		deadline:     time.Time{},
		epollEvents:  make([]unix.EpollEvent, len(rings)),
		epollRings:   make([]*perfEventRing, 0, len(rings)),
		eventHeader:  make([]byte, perfEventHeaderSize),
		eventFds:     eventFds,
		overwritable: opts.Overwritable,
		bufferSize:   bufferSize,
	}
	// Resume installs the event fds into the array, which lets BPF programs
	// start emitting samples.
	if err = pr.Resume(); err != nil {
		return nil, err
	}
	runtime.SetFinalizer(pr, (*Reader).Close)
	return pr, nil
}
   265  
   266  // Close frees resources used by the reader.
   267  //
   268  // It interrupts calls to Read.
   269  //
   270  // Calls to perf_event_output from eBPF programs will return
   271  // ENOENT after calling this method.
   272  func (pr *Reader) Close() error {
   273  	if err := pr.poller.Close(); err != nil {
   274  		if errors.Is(err, os.ErrClosed) {
   275  			return nil
   276  		}
   277  		return fmt.Errorf("close poller: %w", err)
   278  	}
   279  
   280  	// Trying to poll will now fail, so Read() can't block anymore. Acquire the
   281  	// locks so that we can clean up.
   282  	pr.mu.Lock()
   283  	defer pr.mu.Unlock()
   284  
   285  	pr.pauseMu.Lock()
   286  	defer pr.pauseMu.Unlock()
   287  
   288  	for _, ring := range pr.rings {
   289  		ring.Close()
   290  	}
   291  	for _, event := range pr.eventFds {
   292  		event.Close()
   293  	}
   294  	pr.rings = nil
   295  	pr.eventFds = nil
   296  	pr.array.Close()
   297  
   298  	return nil
   299  }
   300  
   301  // SetDeadline controls how long Read and ReadInto will block waiting for samples.
   302  //
   303  // Passing a zero time.Time will remove the deadline. Passing a deadline in the
   304  // past will prevent the reader from blocking if there are no records to be read.
   305  func (pr *Reader) SetDeadline(t time.Time) {
   306  	pr.mu.Lock()
   307  	defer pr.mu.Unlock()
   308  
   309  	pr.deadline = t
   310  }
   311  
   312  // Read the next record from the perf ring buffer.
   313  //
   314  // The method blocks until there are at least Watermark bytes in one
   315  // of the per CPU buffers. Records from buffers below the Watermark
   316  // are not returned.
   317  //
   318  // Records can contain between 0 and 7 bytes of trailing garbage from the ring
   319  // depending on the input sample's length.
   320  //
   321  // Calling [Close] interrupts the method with [os.ErrClosed]. Calling [Flush]
   322  // makes it return all records currently in the ring buffer, followed by [ErrFlushed].
   323  //
   324  // Returns [os.ErrDeadlineExceeded] if a deadline was set and after all records
   325  // have been read from the ring.
   326  //
   327  // See [Reader.ReadInto] for a more efficient version of this method.
   328  func (pr *Reader) Read() (Record, error) {
   329  	var r Record
   330  
   331  	return r, pr.ReadInto(&r)
   332  }
   333  
// errMustBePaused is returned when reading from an overwritable ring buffer
// that has not been paused via Pause first.
var errMustBePaused = fmt.Errorf("perf ringbuffer: must have been paused before reading overwritable buffer")

// ReadInto is like [Reader.Read] except that it allows reusing Record and associated buffers.
func (pr *Reader) ReadInto(rec *Record) error {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	if pr.overwritable && !pr.paused {
		return errMustBePaused
	}

	// rings is set to nil by Close.
	if pr.rings == nil {
		return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
	}

	for {
		if len(pr.epollRings) == 0 {
			if pe := pr.pendingErr; pe != nil {
				// All rings have been emptied since the error occurred, return
				// appropriate error.
				pr.pendingErr = nil
				return pe
			}

			// NB: The deferred pauseMu.Unlock will panic if Wait panics, which
			// might obscure the original panic.
			pr.pauseMu.Unlock()
			_, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
			pr.pauseMu.Lock()

			if errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, ErrFlushed) {
				// We've hit the deadline, check whether there is any data in
				// the rings that we've not been woken up for.
				// The error is stashed and only surfaced once every ring has
				// been drained below.
				pr.pendingErr = err
			} else if err != nil {
				return err
			}

			// Re-validate pr.paused since we dropped pauseMu.
			if pr.overwritable && !pr.paused {
				return errMustBePaused
			}

			// Waking up userspace is expensive, make the most of it by checking
			// all rings.
			for _, ring := range pr.rings {
				ring.loadHead()
				pr.epollRings = append(pr.epollRings, ring)
			}
		}

		// Start at the last available event. The order in which we
		// process them doesn't matter, and starting at the back allows
		// resizing epollRings to keep track of processed rings.
		err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1])
		if err == errEOR {
			// We've emptied the current ring buffer, process
			// the next one.
			pr.epollRings = pr.epollRings[:len(pr.epollRings)-1]
			continue
		}

		return err
	}
}
   402  
   403  // Pause stops all notifications from this Reader.
   404  //
   405  // While the Reader is paused, any attempts to write to the event buffer from
   406  // BPF programs will return -ENOENT.
   407  //
   408  // Subsequent calls to Read will block until a call to Resume.
   409  func (pr *Reader) Pause() error {
   410  	pr.pauseMu.Lock()
   411  	defer pr.pauseMu.Unlock()
   412  
   413  	if pr.eventFds == nil {
   414  		return fmt.Errorf("%w", ErrClosed)
   415  	}
   416  
   417  	for i := range pr.eventFds {
   418  		if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) {
   419  			return fmt.Errorf("could't delete event fd for CPU %d: %w", i, err)
   420  		}
   421  	}
   422  
   423  	pr.paused = true
   424  
   425  	return nil
   426  }
   427  
   428  // Resume allows this perf reader to emit notifications.
   429  //
   430  // Subsequent calls to Read will block until the next event notification.
   431  func (pr *Reader) Resume() error {
   432  	pr.pauseMu.Lock()
   433  	defer pr.pauseMu.Unlock()
   434  
   435  	if pr.eventFds == nil {
   436  		return fmt.Errorf("%w", ErrClosed)
   437  	}
   438  
   439  	for i, fd := range pr.eventFds {
   440  		if fd == nil {
   441  			continue
   442  		}
   443  
   444  		if err := pr.array.Put(uint32(i), fd.Uint()); err != nil {
   445  			return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, i, err)
   446  		}
   447  	}
   448  
   449  	pr.paused = false
   450  
   451  	return nil
   452  }
   453  
// BufferSize is the size in bytes of each per-CPU buffer.
func (pr *Reader) BufferSize() int {
	return pr.bufferSize
}
   458  
// Flush unblocks Read/ReadInto and successive Read/ReadInto calls will return pending samples at this point,
// until you receive a [ErrFlushed] error.
func (pr *Reader) Flush() error {
	return pr.poller.Flush()
}
   464  
// readRecordFromRing reads one record from ring into rec and tags it with the
// ring's CPU.
//
// NB: Has to be preceded by a call to ring.loadHead.
func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error {
	// Always update the tail, even when readRecord fails.
	defer ring.writeTail()

	rec.CPU = ring.cpu
	err := readRecord(ring, rec, pr.eventHeader, pr.overwritable)
	// Overwritable rings can yield truncated events; treat any form of EOF
	// as the end of this ring rather than an error.
	if pr.overwritable && (errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)) {
		return errEOR
	}
	rec.Remaining = ring.remaining()
	return err
}
   477  
   478  type unknownEventError struct {
   479  	eventType uint32
   480  }
   481  
   482  func (uev *unknownEventError) Error() string {
   483  	return fmt.Sprintf("unknown event type: %d", uev.eventType)
   484  }
   485  
   486  // IsUnknownEvent returns true if the error occurred
   487  // because an unknown event was submitted to the perf event ring.
   488  func IsUnknownEvent(err error) bool {
   489  	var uee *unknownEventError
   490  	return errors.As(err, &uee)
   491  }