github.com/cilium/ebpf@v0.15.1-0.20240517100537-8079b37aa138/perf/reader.go

     1  package perf
     2  
     3  import (
     4  	"encoding/binary"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"runtime"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/cilium/ebpf"
    14  	"github.com/cilium/ebpf/internal"
    15  	"github.com/cilium/ebpf/internal/epoll"
    16  	"github.com/cilium/ebpf/internal/sys"
    17  	"github.com/cilium/ebpf/internal/unix"
    18  )
    19  
    20  var (
        	// ErrClosed is returned by methods of a Reader that has been closed.
    21  	ErrClosed = os.ErrClosed
    22  	errEOR    = errors.New("end of ring")
    23  )
    24  
    25  var perfEventHeaderSize = binary.Size(perfEventHeader{})
    26  
    27  // perfEventHeader must match 'struct perf_event_header' in <linux/perf_event.h>.
    28  type perfEventHeader struct {
    29  	Type uint32
    30  	Misc uint16
    31  	Size uint16
    32  }
    33  
    34  // Record contains either a sample or a counter of the
    35  // number of lost samples.
    36  type Record struct {
    37  	// The CPU this record was generated on.
    38  	CPU int
    39  
    40  	// The data submitted via bpf_perf_event_output.
    41  	// Due to a kernel bug, this can contain between 0 and 7 bytes of trailing
    42  	// garbage from the ring depending on the input sample's length.
    43  	RawSample []byte
    44  
    45  	// The number of samples which could not be output, since
    46  	// the ring buffer was full.
    47  	LostSamples uint64
    48  
    49  	// The minimum number of bytes remaining in the per-CPU buffer after this Record has been read.
    50  	// Negative for overwritable buffers.
    51  	Remaining int
    52  }
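
        // A minimal decoding sketch, assuming a Record rec obtained from Read and a
        // hypothetical 16-byte event layout; slicing by the expected size rather
        // than len(rec.RawSample) tolerates the trailing garbage described above.
        // binary.NativeEndian requires Go 1.21 or newer.
        //
        //	var ev struct{ PID, LatencyNs uint64 } // hypothetical event layout
        //	if len(rec.RawSample) < binary.Size(ev) {
        //		return errors.New("sample too short")
        //	}
        //	err := binary.Read(bytes.NewReader(rec.RawSample), binary.NativeEndian, &ev)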
    53  
    54  // readRecord reads the next record from the ring buffer reader into rec.
    55  //
    56  // buf must be at least perfEventHeaderSize bytes long.
    57  func readRecord(rd io.Reader, rec *Record, buf []byte, overwritable bool) error {
    58  	// Assert that the buffer is large enough.
    59  	buf = buf[:perfEventHeaderSize]
    60  	_, err := io.ReadFull(rd, buf)
    61  	if errors.Is(err, io.EOF) {
    62  		return errEOR
    63  	} else if err != nil {
    64  		return fmt.Errorf("read perf event header: %w", err)
    65  	}
    66  
    67  	header := perfEventHeader{
    68  		internal.NativeEndian.Uint32(buf[0:4]),
    69  		internal.NativeEndian.Uint16(buf[4:6]),
    70  		internal.NativeEndian.Uint16(buf[6:8]),
    71  	}
    72  
    73  	switch header.Type {
    74  	case unix.PERF_RECORD_LOST:
    75  		rec.RawSample = rec.RawSample[:0]
    76  		rec.LostSamples, err = readLostRecords(rd)
    77  		return err
    78  
    79  	case unix.PERF_RECORD_SAMPLE:
    80  		rec.LostSamples = 0
    81  		// We can reuse buf here because perfEventHeaderSize > perfEventSampleSize.
    82  		rec.RawSample, err = readRawSample(rd, buf, rec.RawSample)
    83  		return err
    84  
    85  	default:
    86  		return &unknownEventError{header.Type}
    87  	}
    88  }
    89  
    90  func readLostRecords(rd io.Reader) (uint64, error) {
    91  	// lostHeader must match 'struct perf_event_lost' in kernel sources.
    92  	var lostHeader struct {
    93  		ID   uint64
    94  		Lost uint64
    95  	}
    96  
    97  	err := binary.Read(rd, internal.NativeEndian, &lostHeader)
    98  	if err != nil {
    99  		return 0, fmt.Errorf("can't read lost records header: %w", err)
   100  	}
   101  
   102  	return lostHeader.Lost, nil
   103  }
   104  
   105  var perfEventSampleSize = binary.Size(uint32(0))
   106  
   107  // This must match 'struct perf_event_sample' in kernel sources.
   108  type perfEventSample struct {
   109  	Size uint32
   110  }
   111  
   112  func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) {
   113  	buf = buf[:perfEventSampleSize]
   114  	if _, err := io.ReadFull(rd, buf); err != nil {
   115  		return nil, fmt.Errorf("read sample size: %w", err)
   116  	}
   117  
   118  	sample := perfEventSample{
   119  		internal.NativeEndian.Uint32(buf),
   120  	}
   121  
   122  	var data []byte
   123  	if size := int(sample.Size); cap(sampleBuf) < size {
   124  		data = make([]byte, size)
   125  	} else {
   126  		data = sampleBuf[:size]
   127  	}
   128  
   129  	if _, err := io.ReadFull(rd, data); err != nil {
   130  		return nil, fmt.Errorf("read sample: %w", err)
   131  	}
   132  	return data, nil
   133  }
   134  
   135  // Reader allows reading bpf_perf_event_output
   136  // from user space.
   137  type Reader struct {
   138  	poller   *epoll.Poller
   139  	deadline time.Time
   140  
   141  	// mu protects read/write access to the Reader structure with the
   142  	// exception of 'pauseFds', which is protected by 'pauseMu'.
   143  	// If locking both 'mu' and 'pauseMu', 'mu' must be locked first.
   144  	mu sync.Mutex
   145  
   146  	// Closing a PERF_EVENT_ARRAY removes all event fds
   147  	// stored in it, so we keep a reference alive.
   148  	array       *ebpf.Map
   149  	rings       []*perfEventRing
   150  	epollEvents []unix.EpollEvent
   151  	epollRings  []*perfEventRing
   152  	eventHeader []byte
   153  
   154  	// pauseMu protects eventFds so that Pause / Resume can be invoked while
   155  	// Read is blocked.
   156  	pauseMu  sync.Mutex
   157  	eventFds []*sys.FD
   158  
   159  	paused       bool
   160  	overwritable bool
   161  
   162  	bufferSize int
   163  }
   164  
   165  // ReaderOptions control the behaviour of the user
   166  // space reader.
   167  type ReaderOptions struct {
   168  	// The number of events required in any per CPU buffer before
   169  	// Read will process data. This is mutually exclusive with Watermark.
   170  	// The default is zero, which means Watermark will take precedence.
   171  	WakeupEvents int
   172  	// The number of written bytes required in any per CPU buffer before
   173  	// Read will process data. Must be smaller than PerCPUBuffer.
   174  	// The default is to start processing as soon as data is available.
   175  	Watermark int
   176  	// This perf ring buffer is overwritable: once full, the oldest events will
   177  	// be overwritten by the newest ones.
   178  	Overwritable bool
   179  }
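
        // A configuration sketch, assuming an *ebpf.Map named events of type
        // PerfEventArray: only wake the reader once at least 4 KiB have been
        // written to a per-CPU buffer. WakeupEvents must stay zero when
        // Watermark is set.
        //
        //	rd, err := perf.NewReaderWithOptions(events, 8*os.Getpagesize(), perf.ReaderOptions{
        //		Watermark: 4096,
        //	})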
   180  
   181  // NewReader creates a new reader with default options.
   182  //
   183  // array must be a PerfEventArray. perCPUBuffer gives the size of the
   184  // per CPU buffer in bytes. It is rounded up to the nearest multiple
   185  // of the current page size.
   186  func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) {
   187  	return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{})
   188  }
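
        // A basic usage sketch, assuming an *ebpf.Map named events of type
        // PerfEventArray obtained elsewhere (for example from a loaded collection):
        //
        //	rd, err := perf.NewReader(events, os.Getpagesize())
        //	if err != nil {
        //		return err
        //	}
        //	defer rd.Close()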
   189  
   190  // NewReaderWithOptions creates a new reader with the given options.
   191  func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) {
   192  	closeOnError := func(c io.Closer) {
   193  		if err != nil {
   194  			c.Close()
   195  		}
   196  	}
   197  
   198  	if perCPUBuffer < 1 {
   199  		return nil, errors.New("perCPUBuffer must be larger than 0")
   200  	}
   201  	if opts.WakeupEvents > 0 && opts.Watermark > 0 {
   202  		return nil, errors.New("WakeupEvents and Watermark cannot both be non-zero")
   203  	}
   204  
   205  	var (
   206  		nCPU     = int(array.MaxEntries())
   207  		rings    = make([]*perfEventRing, 0, nCPU)
   208  		eventFds = make([]*sys.FD, 0, nCPU)
   209  	)
   210  
   211  	poller, err := epoll.New()
   212  	if err != nil {
   213  		return nil, err
   214  	}
   215  	defer closeOnError(poller)
   216  
   217  	// bpf_perf_event_output checks which CPU an event is enabled on,
   218  	// but doesn't allow using a wildcard like -1 to specify "all CPUs".
   219  	// Hence we have to create a ring for each CPU.
   220  	bufferSize := 0
   221  	for i := 0; i < nCPU; i++ {
   222  		event, ring, err := newPerfEventRing(i, perCPUBuffer, opts)
   223  		if errors.Is(err, unix.ENODEV) {
   224  			// The requested CPU is currently offline, skip it.
   225  			rings = append(rings, nil)
   226  			eventFds = append(eventFds, nil)
   227  			continue
   228  		}
   229  
   230  		if err != nil {
   231  			return nil, fmt.Errorf("failed to create perf ring for CPU %d: %w", i, err)
   232  		}
   233  		defer closeOnError(event)
   234  		defer closeOnError(ring)
   235  
   236  		bufferSize = ring.size()
   237  		rings = append(rings, ring)
   238  		eventFds = append(eventFds, event)
   239  
   240  		if err := poller.Add(event.Int(), 0); err != nil {
   241  			return nil, err
   242  		}
   243  	}
   244  
   245  	array, err = array.Clone()
   246  	if err != nil {
   247  		return nil, err
   248  	}
   249  
   250  	pr = &Reader{
   251  		array:        array,
   252  		rings:        rings,
   253  		poller:       poller,
   254  		deadline:     time.Time{},
   255  		epollEvents:  make([]unix.EpollEvent, len(rings)),
   256  		epollRings:   make([]*perfEventRing, 0, len(rings)),
   257  		eventHeader:  make([]byte, perfEventHeaderSize),
   258  		eventFds:     eventFds,
   259  		overwritable: opts.Overwritable,
   260  		bufferSize:   bufferSize,
   261  	}
   262  	if err = pr.Resume(); err != nil {
   263  		return nil, err
   264  	}
   265  	runtime.SetFinalizer(pr, (*Reader).Close)
   266  	return pr, nil
   267  }
   268  
   269  // Close frees resources used by the reader.
   270  //
   271  // It interrupts calls to Read.
   272  //
   273  // Calls to perf_event_output from eBPF programs will return
   274  // ENOENT after calling this method.
   275  func (pr *Reader) Close() error {
   276  	if err := pr.poller.Close(); err != nil {
   277  		if errors.Is(err, os.ErrClosed) {
   278  			return nil
   279  		}
   280  		return fmt.Errorf("close poller: %w", err)
   281  	}
   282  
   283  	// Trying to poll will now fail, so Read() can't block anymore. Acquire the
   284  	// locks so that we can clean up.
   285  	pr.mu.Lock()
   286  	defer pr.mu.Unlock()
   287  
   288  	pr.pauseMu.Lock()
   289  	defer pr.pauseMu.Unlock()
   290  
   291  	for _, ring := range pr.rings {
   292  		if ring != nil {
   293  			ring.Close()
   294  		}
   295  	}
   296  	for _, event := range pr.eventFds {
   297  		if event != nil {
   298  			event.Close()
   299  		}
   300  	}
   301  	pr.rings = nil
   302  	pr.eventFds = nil
   303  	pr.array.Close()
   304  
   305  	return nil
   306  }
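
        // Because Close interrupts a blocked Read, it is commonly called from a
        // separate goroutine to shut a consumer down. A sketch, assuming a
        // context.Context named ctx:
        //
        //	go func() {
        //		<-ctx.Done()
        //		rd.Close() // Unblocks a pending Read.
        //	}()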
   307  
   308  // SetDeadline controls how long Read and ReadInto will block waiting for samples.
   309  //
   310  // Passing a zero time.Time will remove the deadline. Passing a deadline in the
   311  // past will prevent the reader from blocking if there are no records to be read.
   312  func (pr *Reader) SetDeadline(t time.Time) {
   313  	pr.mu.Lock()
   314  	defer pr.mu.Unlock()
   315  
   316  	pr.deadline = t
   317  }
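
        // A polling sketch: bounding the wait turns Read into a timed operation,
        // which is useful for checking shutdown conditions between reads.
        //
        //	rd.SetDeadline(time.Now().Add(100 * time.Millisecond))
        //	rec, err := rd.Read()
        //	if errors.Is(err, os.ErrDeadlineExceeded) {
        //		// No samples arrived within the window; loop around and retry.
        //	}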
   318  
   319  // Read the next record from the perf ring buffer.
   320  //
   321  // The function blocks until there are at least Watermark bytes in one
   322  // of the per CPU buffers. Records from buffers below the Watermark
   323  // are not returned.
   324  //
   325  // Records can contain between 0 and 7 bytes of trailing garbage from the ring
   326  // depending on the input sample's length.
   327  //
   328  // Calling Close interrupts the function.
   329  //
   330  // Returns [os.ErrDeadlineExceeded] if a deadline was set and the perf ring buffer
   331  // was empty. Otherwise returns a record and no error, even if the deadline was
   332  // exceeded.
   333  //
   334  // See [Reader.ReadInto] for a more efficient version of this method.
   335  func (pr *Reader) Read() (Record, error) {
   336  	var r Record
   337  
   338  	return r, pr.ReadInto(&r)
   339  }
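
        // A typical consumption loop (a sketch; process is a hypothetical handler):
        //
        //	for {
        //		rec, err := rd.Read()
        //		if errors.Is(err, perf.ErrClosed) {
        //			return nil // Reader was closed, stop.
        //		}
        //		if err != nil {
        //			return err
        //		}
        //		if rec.LostSamples > 0 {
        //			log.Printf("ring full, dropped %d samples", rec.LostSamples)
        //			continue
        //		}
        //		process(rec.RawSample)
        //	}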
   340  
   341  var errMustBePaused = errors.New("perf ringbuffer: must have been paused before reading overwritable buffer")
   342  
   343  // ReadInto is like [Reader.Read] except that it allows reusing Record and associated buffers.
   344  func (pr *Reader) ReadInto(rec *Record) error {
   345  	pr.mu.Lock()
   346  	defer pr.mu.Unlock()
   347  
   348  	pr.pauseMu.Lock()
   349  	defer pr.pauseMu.Unlock()
   350  
   351  	if pr.overwritable && !pr.paused {
   352  		return errMustBePaused
   353  	}
   354  
   355  	if pr.rings == nil {
   356  		return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
   357  	}
   358  
   359  	deadlineWasExceeded := false
   360  	for {
   361  		if len(pr.epollRings) == 0 {
   362  			if deadlineWasExceeded {
   363  				// All rings were empty when the deadline expired; return
   364  				// the appropriate error.
   365  				return os.ErrDeadlineExceeded
   366  			}
   367  
   368  			// NB: The deferred pauseMu.Unlock will panic if Wait panics, which
   369  			// might obscure the original panic.
   370  			pr.pauseMu.Unlock()
   371  			_, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
   372  			pr.pauseMu.Lock()
   373  
   374  			if errors.Is(err, os.ErrDeadlineExceeded) {
   375  				// We've hit the deadline, check whether there is any data in
   376  				// the rings that we've not been woken up for.
   377  				deadlineWasExceeded = true
   378  			} else if err != nil {
   379  				return err
   380  			}
   381  
   382  			// Re-validate pr.paused since we dropped pauseMu.
   383  			if pr.overwritable && !pr.paused {
   384  				return errMustBePaused
   385  			}
   386  
   387  			// Waking up userspace is expensive, make the most of it by checking
   388  			// all rings.
   389  			for _, ring := range pr.rings {
        				if ring == nil {
        					// Skip rings of CPUs that were offline when the Reader was created.
        					continue
        				}
   390  				ring.loadHead()
   391  				pr.epollRings = append(pr.epollRings, ring)
   392  			}
   393  		}
   394  
   395  		// Start at the last available event. The order in which we
   396  		// process them doesn't matter, and starting at the back allows
   397  		// resizing epollRings to keep track of processed rings.
   398  		err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1])
   399  		if err == errEOR {
   400  			// We've emptied the current ring buffer, process
   401  			// the next one.
   402  			pr.epollRings = pr.epollRings[:len(pr.epollRings)-1]
   403  			continue
   404  		}
   405  
   406  		return err
   407  	}
   408  }
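
        // A sketch of reusing one Record across reads to avoid per-call allocations;
        // handle is a hypothetical callback:
        //
        //	var rec perf.Record
        //	for {
        //		if err := rd.ReadInto(&rec); err != nil {
        //			return err
        //		}
        //		// rec.RawSample is reused by the next call; copy it if the data
        //		// must be retained.
        //		handle(rec.RawSample)
        //	}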
   409  
   410  // Pause stops all notifications from this Reader.
   411  //
   412  // While the Reader is paused, any attempts to write to the event buffer from
   413  // BPF programs will return -ENOENT.
   414  //
   415  // Subsequent calls to Read will block until a call to Resume.
   416  func (pr *Reader) Pause() error {
   417  	pr.pauseMu.Lock()
   418  	defer pr.pauseMu.Unlock()
   419  
   420  	if pr.eventFds == nil {
   421  		return fmt.Errorf("%w", ErrClosed)
   422  	}
   423  
   424  	for i := range pr.eventFds {
   425  		if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) {
   426  			return fmt.Errorf("couldn't delete event fd for CPU %d: %w", i, err)
   427  		}
   428  	}
   429  
   430  	pr.paused = true
   431  
   432  	return nil
   433  }
   434  
   435  // Resume allows this perf reader to emit notifications.
   436  //
   437  // Subsequent calls to Read will block until the next event notification.
   438  func (pr *Reader) Resume() error {
   439  	pr.pauseMu.Lock()
   440  	defer pr.pauseMu.Unlock()
   441  
   442  	if pr.eventFds == nil {
   443  		return fmt.Errorf("%w", ErrClosed)
   444  	}
   445  
   446  	for i, fd := range pr.eventFds {
   447  		if fd == nil {
   448  			continue
   449  		}
   450  
   451  		if err := pr.array.Put(uint32(i), fd.Uint()); err != nil {
   452  			return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd.Uint(), i, err)
   453  		}
   454  	}
   455  
   456  	pr.paused = false
   457  
   458  	return nil
   459  }
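
        // Overwritable rings must be paused before reading (see ReadInto). A
        // snapshot sketch; handle is a hypothetical callback:
        //
        //	if err := rd.Pause(); err != nil {
        //		return err
        //	}
        //	rd.SetDeadline(time.Now()) // Drain without blocking.
        //	var rec perf.Record
        //	for {
        //		err := rd.ReadInto(&rec)
        //		if errors.Is(err, os.ErrDeadlineExceeded) {
        //			break // Snapshot drained.
        //		}
        //		if err != nil {
        //			return err
        //		}
        //		handle(rec.RawSample)
        //	}
        //	if err := rd.Resume(); err != nil {
        //		return err
        //	}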
   460  
   461  // BufferSize is the size in bytes of each per-CPU buffer.
   462  func (pr *Reader) BufferSize() int {
   463  	return pr.bufferSize
   464  }
   465  
   466  // NB: Has to be preceded by a call to ring.loadHead.
   467  func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error {
   468  	defer ring.writeTail()
   469  
   470  	rec.CPU = ring.cpu
   471  	err := readRecord(ring, rec, pr.eventHeader, pr.overwritable)
   472  	if pr.overwritable && (errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)) {
   473  		return errEOR
   474  	}
   475  	rec.Remaining = ring.remaining()
   476  	return err
   477  }
   478  
   479  type unknownEventError struct {
   480  	eventType uint32
   481  }
   482  
   483  func (uev *unknownEventError) Error() string {
   484  	return fmt.Sprintf("unknown event type: %d", uev.eventType)
   485  }
   486  
   487  // IsUnknownEvent returns true if the error occurred
   488  // because an unknown event was submitted to the perf event ring.
   489  func IsUnknownEvent(err error) bool {
   490  	var uee *unknownEventError
   491  	return errors.As(err, &uee)
   492  }
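
        // A sketch of tolerating record types this package does not decode;
        // handle is a hypothetical callback:
        //
        //	for {
        //		rec, err := rd.Read()
        //		if perf.IsUnknownEvent(err) {
        //			continue // Skip record types other than SAMPLE and LOST.
        //		}
        //		if err != nil {
        //			return err
        //		}
        //		handle(rec)
        //	}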