github.com/cilium/ebpf@v0.15.0/perf/reader.go (about)

     1  package perf
     2  
     3  import (
     4  	"encoding/binary"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"runtime"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/cilium/ebpf"
    14  	"github.com/cilium/ebpf/internal"
    15  	"github.com/cilium/ebpf/internal/epoll"
    16  	"github.com/cilium/ebpf/internal/unix"
    17  )
    18  
var (
	// ErrClosed is returned by methods of a Reader after Close has been called.
	ErrClosed = os.ErrClosed
	// errEOR (end of ring) signals that the current per-CPU ring buffer
	// has been fully drained; it never escapes to callers of Read/ReadInto.
	errEOR    = errors.New("end of ring")
)
    23  
// perfEventHeaderSize is the encoded size of perfEventHeader: 8 bytes.
var perfEventHeaderSize = binary.Size(perfEventHeader{})

// perfEventHeader must match 'struct perf_event_header` in <linux/perf_event.h>.
type perfEventHeader struct {
	Type uint32 // one of unix.PERF_RECORD_*
	Misc uint16
	Size uint16 // total size of the record, including this header
}
    32  
// cpuForEvent returns the CPU number associated with an epoll event.
// The CPU index is stashed in the event's Pad field when the ring's fd
// is registered with the poller (see poller.Add in NewReaderWithOptions).
func cpuForEvent(event *unix.EpollEvent) int {
	return int(event.Pad)
}
    36  
// Record contains either a sample or a counter of the
// number of lost samples.
//
// Exactly one of RawSample and LostSamples is meaningful for a given
// Record: a lost-records event has an empty RawSample, and a sample
// event has LostSamples set to zero (see readRecord).
type Record struct {
	// The CPU this record was generated on.
	CPU int

	// The data submitted via bpf_perf_event_output.
	// Due to a kernel bug, this can contain between 0 and 7 bytes of trailing
	// garbage from the ring depending on the input sample's length.
	RawSample []byte

	// The number of samples which could not be output, since
	// the ring buffer was full.
	LostSamples uint64

	// The minimum number of bytes remaining in the per-CPU buffer after this Record has been read.
	// Negative for overwritable buffers.
	Remaining int
}
    56  
    57  // Read a record from a reader and tag it as being from the given CPU.
    58  //
    59  // buf must be at least perfEventHeaderSize bytes long.
    60  func readRecord(rd io.Reader, rec *Record, buf []byte, overwritable bool) error {
    61  	// Assert that the buffer is large enough.
    62  	buf = buf[:perfEventHeaderSize]
    63  	_, err := io.ReadFull(rd, buf)
    64  	if errors.Is(err, io.EOF) {
    65  		return errEOR
    66  	} else if err != nil {
    67  		return fmt.Errorf("read perf event header: %v", err)
    68  	}
    69  
    70  	header := perfEventHeader{
    71  		internal.NativeEndian.Uint32(buf[0:4]),
    72  		internal.NativeEndian.Uint16(buf[4:6]),
    73  		internal.NativeEndian.Uint16(buf[6:8]),
    74  	}
    75  
    76  	switch header.Type {
    77  	case unix.PERF_RECORD_LOST:
    78  		rec.RawSample = rec.RawSample[:0]
    79  		rec.LostSamples, err = readLostRecords(rd)
    80  		return err
    81  
    82  	case unix.PERF_RECORD_SAMPLE:
    83  		rec.LostSamples = 0
    84  		// We can reuse buf here because perfEventHeaderSize > perfEventSampleSize.
    85  		rec.RawSample, err = readRawSample(rd, buf, rec.RawSample)
    86  		return err
    87  
    88  	default:
    89  		return &unknownEventError{header.Type}
    90  	}
    91  }
    92  
    93  func readLostRecords(rd io.Reader) (uint64, error) {
    94  	// lostHeader must match 'struct perf_event_lost in kernel sources.
    95  	var lostHeader struct {
    96  		ID   uint64
    97  		Lost uint64
    98  	}
    99  
   100  	err := binary.Read(rd, internal.NativeEndian, &lostHeader)
   101  	if err != nil {
   102  		return 0, fmt.Errorf("can't read lost records header: %v", err)
   103  	}
   104  
   105  	return lostHeader.Lost, nil
   106  }
   107  
// perfEventSampleSize is the encoded size of perfEventSample: 4 bytes.
var perfEventSampleSize = binary.Size(uint32(0))

// This must match 'struct perf_event_sample' in kernel sources.
type perfEventSample struct {
	Size uint32 // length of the sample payload that follows
}
   114  
   115  func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) {
   116  	buf = buf[:perfEventSampleSize]
   117  	if _, err := io.ReadFull(rd, buf); err != nil {
   118  		return nil, fmt.Errorf("read sample size: %w", err)
   119  	}
   120  
   121  	sample := perfEventSample{
   122  		internal.NativeEndian.Uint32(buf),
   123  	}
   124  
   125  	var data []byte
   126  	if size := int(sample.Size); cap(sampleBuf) < size {
   127  		data = make([]byte, size)
   128  	} else {
   129  		data = sampleBuf[:size]
   130  	}
   131  
   132  	if _, err := io.ReadFull(rd, data); err != nil {
   133  		return nil, fmt.Errorf("read sample: %w", err)
   134  	}
   135  	return data, nil
   136  }
   137  
// Reader allows reading bpf_perf_event_output
// from user space.
type Reader struct {
	poller *epoll.Poller
	// deadline bounds how long Read/ReadInto may block; the zero value
	// means no deadline. See SetDeadline.
	deadline time.Time

	// mu protects read/write access to the Reader structure with the
	// exception of 'pauseFds', which is protected by 'pauseMu'.
	// If locking both 'mu' and 'pauseMu', 'mu' must be locked first.
	mu sync.Mutex

	// Closing a PERF_EVENT_ARRAY removes all event fds
	// stored in it, so we keep a reference alive.
	array       *ebpf.Map
	rings       []*perfEventRing // indexed by CPU; nil entries are offline CPUs
	epollEvents []unix.EpollEvent
	epollRings  []*perfEventRing // rings with pending data from the last poll

	// eventHeader is scratch space for readRecord, sized perfEventHeaderSize.
	eventHeader []byte

	// pauseFds are a copy of the fds in 'rings', protected by 'pauseMu'.
	// These allow Pause/Resume to be executed independently of any ongoing
	// Read calls, which would otherwise need to be interrupted.
	pauseMu  sync.Mutex
	pauseFds []int

	paused       bool
	overwritable bool

	bufferSize int
}
   168  
// ReaderOptions control the behaviour of the user
// space reader.
type ReaderOptions struct {
	// The number of events required in any per CPU buffer before
	// Read will process data. This is mutually exclusive with Watermark.
	// The default is zero, which means Watermark will take precedence.
	WakeupEvents int
	// The number of written bytes required in any per CPU buffer before
	// Read will process data. Must be smaller than PerCPUBuffer.
	// The default is to start processing as soon as data is available.
	Watermark int
	// This perf ring buffer is overwritable, once full the oldest event will be
	// overwritten by newest. Such a Reader must be paused before reading,
	// see ReadInto.
	Overwritable bool
}
   184  
   185  // NewReader creates a new reader with default options.
   186  //
   187  // array must be a PerfEventArray. perCPUBuffer gives the size of the
   188  // per CPU buffer in bytes. It is rounded up to the nearest multiple
   189  // of the current page size.
   190  func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) {
   191  	return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{})
   192  }
   193  
   194  // NewReaderWithOptions creates a new reader with the given options.
   195  func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) {
   196  	if perCPUBuffer < 1 {
   197  		return nil, errors.New("perCPUBuffer must be larger than 0")
   198  	}
   199  	if opts.WakeupEvents > 0 && opts.Watermark > 0 {
   200  		return nil, errors.New("WakeupEvents and Watermark cannot both be non-zero")
   201  	}
   202  
   203  	var (
   204  		fds      []int
   205  		nCPU     = int(array.MaxEntries())
   206  		rings    = make([]*perfEventRing, 0, nCPU)
   207  		pauseFds = make([]int, 0, nCPU)
   208  	)
   209  
   210  	poller, err := epoll.New()
   211  	if err != nil {
   212  		return nil, err
   213  	}
   214  
   215  	defer func() {
   216  		if err != nil {
   217  			poller.Close()
   218  			for _, fd := range fds {
   219  				unix.Close(fd)
   220  			}
   221  			for _, ring := range rings {
   222  				if ring != nil {
   223  					ring.Close()
   224  				}
   225  			}
   226  		}
   227  	}()
   228  
   229  	// bpf_perf_event_output checks which CPU an event is enabled on,
   230  	// but doesn't allow using a wildcard like -1 to specify "all CPUs".
   231  	// Hence we have to create a ring for each CPU.
   232  	bufferSize := 0
   233  	for i := 0; i < nCPU; i++ {
   234  		ring, err := newPerfEventRing(i, perCPUBuffer, opts)
   235  		if errors.Is(err, unix.ENODEV) {
   236  			// The requested CPU is currently offline, skip it.
   237  			rings = append(rings, nil)
   238  			pauseFds = append(pauseFds, -1)
   239  			continue
   240  		}
   241  
   242  		if err != nil {
   243  			return nil, fmt.Errorf("failed to create perf ring for CPU %d: %v", i, err)
   244  		}
   245  
   246  		bufferSize = ring.size()
   247  		rings = append(rings, ring)
   248  		pauseFds = append(pauseFds, ring.fd)
   249  
   250  		if err := poller.Add(ring.fd, i); err != nil {
   251  			return nil, err
   252  		}
   253  	}
   254  
   255  	array, err = array.Clone()
   256  	if err != nil {
   257  		return nil, err
   258  	}
   259  
   260  	pr = &Reader{
   261  		array:        array,
   262  		rings:        rings,
   263  		poller:       poller,
   264  		deadline:     time.Time{},
   265  		epollEvents:  make([]unix.EpollEvent, len(rings)),
   266  		epollRings:   make([]*perfEventRing, 0, len(rings)),
   267  		eventHeader:  make([]byte, perfEventHeaderSize),
   268  		pauseFds:     pauseFds,
   269  		overwritable: opts.Overwritable,
   270  		bufferSize:   bufferSize,
   271  	}
   272  	if err = pr.Resume(); err != nil {
   273  		return nil, err
   274  	}
   275  	runtime.SetFinalizer(pr, (*Reader).Close)
   276  	return pr, nil
   277  }
   278  
// Close frees resources used by the reader.
//
// It interrupts calls to Read.
//
// Calls to perf_event_output from eBPF programs will return
// ENOENT after calling this method.
func (pr *Reader) Close() error {
	// Closing the poller interrupts any Wait blocked inside ReadInto.
	if err := pr.poller.Close(); err != nil {
		if errors.Is(err, os.ErrClosed) {
			// Already closed; Close is idempotent.
			return nil
		}
		return fmt.Errorf("close poller: %w", err)
	}

	// Trying to poll will now fail, so Read() can't block anymore. Acquire the
	// lock so that we can clean up.
	pr.mu.Lock()
	defer pr.mu.Unlock()

	for _, ring := range pr.rings {
		if ring != nil {
			ring.Close()
		}
	}
	// nil rings/pauseFds are how ReadInto, Pause and Resume detect a
	// closed Reader.
	pr.rings = nil
	pr.pauseFds = nil
	pr.array.Close()

	return nil
}
   309  
   310  // SetDeadline controls how long Read and ReadInto will block waiting for samples.
   311  //
   312  // Passing a zero time.Time will remove the deadline. Passing a deadline in the
   313  // past will prevent the reader from blocking if there are no records to be read.
   314  func (pr *Reader) SetDeadline(t time.Time) {
   315  	pr.mu.Lock()
   316  	defer pr.mu.Unlock()
   317  
   318  	pr.deadline = t
   319  }
   320  
   321  // Read the next record from the perf ring buffer.
   322  //
   323  // The function blocks until there are at least Watermark bytes in one
   324  // of the per CPU buffers. Records from buffers below the Watermark
   325  // are not returned.
   326  //
   327  // Records can contain between 0 and 7 bytes of trailing garbage from the ring
   328  // depending on the input sample's length.
   329  //
   330  // Calling Close interrupts the function.
   331  //
   332  // Returns os.ErrDeadlineExceeded if a deadline was set.
   333  func (pr *Reader) Read() (Record, error) {
   334  	var r Record
   335  
   336  	return r, pr.ReadInto(&r)
   337  }
   338  
   339  var errMustBePaused = fmt.Errorf("perf ringbuffer: must have been paused before reading overwritable buffer")
   340  
// ReadInto is like Read except that it allows reusing Record and associated buffers.
func (pr *Reader) ReadInto(rec *Record) error {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	// An overwritable ring is only consistent while writers are disabled,
	// which is what Pause does.
	if pr.overwritable && !pr.paused {
		return errMustBePaused
	}

	// Close sets rings to nil.
	if pr.rings == nil {
		return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
	}

	for {
		if len(pr.epollRings) == 0 {
			// No rings with pending data left over from a previous poll;
			// wait for new notifications. Drop pauseMu so Pause/Resume can
			// run while we block.
			// NB: The deferred pauseMu.Unlock will panic if Wait panics, which
			// might obscure the original panic.
			pr.pauseMu.Unlock()
			nEvents, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
			pr.pauseMu.Lock()
			if err != nil {
				return err
			}

			// Re-validate pr.paused since we dropped pauseMu.
			if pr.overwritable && !pr.paused {
				return errMustBePaused
			}

			for _, event := range pr.epollEvents[:nEvents] {
				ring := pr.rings[cpuForEvent(&event)]
				pr.epollRings = append(pr.epollRings, ring)

				// Read the current head pointer now, not every time
				// we read a record. This prevents a single fast producer
				// from keeping the reader busy.
				ring.loadHead()
			}
		}

		// Start at the last available event. The order in which we
		// process them doesn't matter, and starting at the back allows
		// resizing epollRings to keep track of processed rings.
		err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1])
		if err == errEOR {
			// We've emptied the current ring buffer, process
			// the next one.
			pr.epollRings = pr.epollRings[:len(pr.epollRings)-1]
			continue
		}

		return err
	}
}
   398  
   399  // Pause stops all notifications from this Reader.
   400  //
   401  // While the Reader is paused, any attempts to write to the event buffer from
   402  // BPF programs will return -ENOENT.
   403  //
   404  // Subsequent calls to Read will block until a call to Resume.
   405  func (pr *Reader) Pause() error {
   406  	pr.pauseMu.Lock()
   407  	defer pr.pauseMu.Unlock()
   408  
   409  	if pr.pauseFds == nil {
   410  		return fmt.Errorf("%w", ErrClosed)
   411  	}
   412  
   413  	for i := range pr.pauseFds {
   414  		if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) {
   415  			return fmt.Errorf("could't delete event fd for CPU %d: %w", i, err)
   416  		}
   417  	}
   418  
   419  	pr.paused = true
   420  
   421  	return nil
   422  }
   423  
   424  // Resume allows this perf reader to emit notifications.
   425  //
   426  // Subsequent calls to Read will block until the next event notification.
   427  func (pr *Reader) Resume() error {
   428  	pr.pauseMu.Lock()
   429  	defer pr.pauseMu.Unlock()
   430  
   431  	if pr.pauseFds == nil {
   432  		return fmt.Errorf("%w", ErrClosed)
   433  	}
   434  
   435  	for i, fd := range pr.pauseFds {
   436  		if fd == -1 {
   437  			continue
   438  		}
   439  
   440  		if err := pr.array.Put(uint32(i), uint32(fd)); err != nil {
   441  			return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, i, err)
   442  		}
   443  	}
   444  
   445  	pr.paused = false
   446  
   447  	return nil
   448  }
   449  
// BufferSize is the size in bytes of each per-CPU buffer, as reported by
// the rings created in NewReaderWithOptions.
func (pr *Reader) BufferSize() int {
	return pr.bufferSize
}
   454  
// readRecordFromRing reads the next record from ring into rec,
// tagging it with the ring's CPU.
//
// NB: Has to be preceded by a call to ring.loadHead.
func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error {
	// Publish the consumed tail so the kernel can reuse the space.
	defer ring.writeTail()

	rec.CPU = ring.cpu
	err := readRecord(ring, rec, pr.eventHeader, pr.overwritable)
	if pr.overwritable && (errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)) {
		// An overwritable ring can run out of data mid-record; treat
		// that the same as reaching the end of the ring.
		return errEOR
	}
	rec.Remaining = ring.remaining()
	return err
}
   467  
   468  type unknownEventError struct {
   469  	eventType uint32
   470  }
   471  
   472  func (uev *unknownEventError) Error() string {
   473  	return fmt.Sprintf("unknown event type: %d", uev.eventType)
   474  }
   475  
   476  // IsUnknownEvent returns true if the error occurred
   477  // because an unknown event was submitted to the perf event ring.
   478  func IsUnknownEvent(err error) bool {
   479  	var uee *unknownEventError
   480  	return errors.As(err, &uee)
   481  }