github.com/cilium/ebpf@v0.10.0/perf/reader.go

package perf

import (
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"runtime"
	"sync"
	"time"

	"github.com/cilium/ebpf"
	"github.com/cilium/ebpf/internal"
	"github.com/cilium/ebpf/internal/epoll"
	"github.com/cilium/ebpf/internal/unix"
)

var (
	ErrClosed = os.ErrClosed
	errEOR    = errors.New("end of ring")
)

var perfEventHeaderSize = binary.Size(perfEventHeader{})

// perfEventHeader must match 'struct perf_event_header' in <linux/perf_event.h>.
type perfEventHeader struct {
	Type uint32
	Misc uint16
	Size uint16
}

func cpuForEvent(event *unix.EpollEvent) int {
	return int(event.Pad)
}

// Record contains either a sample or a counter of the
// number of lost samples.
type Record struct {
	// The CPU this record was generated on.
	CPU int

	// The data submitted via bpf_perf_event_output.
	// Due to a kernel bug, this can contain between 0 and 7 bytes of trailing
	// garbage from the ring depending on the input sample's length.
	RawSample []byte

	// The number of samples which could not be output, since
	// the ring buffer was full.
	LostSamples uint64
}

// Read a record from a reader into rec.
//
// buf must be at least perfEventHeaderSize bytes long.
func readRecord(rd io.Reader, rec *Record, buf []byte) error {
	// Assert that the buffer is large enough.
	buf = buf[:perfEventHeaderSize]
	_, err := io.ReadFull(rd, buf)
	if errors.Is(err, io.EOF) {
		return errEOR
	} else if err != nil {
		return fmt.Errorf("read perf event header: %v", err)
	}

	header := perfEventHeader{
		internal.NativeEndian.Uint32(buf[0:4]),
		internal.NativeEndian.Uint16(buf[4:6]),
		internal.NativeEndian.Uint16(buf[6:8]),
	}

	switch header.Type {
	case unix.PERF_RECORD_LOST:
		rec.RawSample = rec.RawSample[:0]
		rec.LostSamples, err = readLostRecords(rd)
		return err

	case unix.PERF_RECORD_SAMPLE:
		rec.LostSamples = 0
		// We can reuse buf here because perfEventHeaderSize > perfEventSampleSize.
		rec.RawSample, err = readRawSample(rd, buf, rec.RawSample)
		return err

	default:
		return &unknownEventError{header.Type}
	}
}

func readLostRecords(rd io.Reader) (uint64, error) {
	// lostHeader must match 'struct perf_event_lost' in kernel sources.
	var lostHeader struct {
		ID   uint64
		Lost uint64
	}

	err := binary.Read(rd, internal.NativeEndian, &lostHeader)
	if err != nil {
		return 0, fmt.Errorf("can't read lost records header: %v", err)
	}

	return lostHeader.Lost, nil
}

var perfEventSampleSize = binary.Size(uint32(0))

// This must match 'struct perf_event_sample' in kernel sources.
type perfEventSample struct {
	Size uint32
}

func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) {
	buf = buf[:perfEventSampleSize]
	if _, err := io.ReadFull(rd, buf); err != nil {
		return nil, fmt.Errorf("read sample size: %v", err)
	}

	sample := perfEventSample{
		internal.NativeEndian.Uint32(buf),
	}

	var data []byte
	if size := int(sample.Size); cap(sampleBuf) < size {
		data = make([]byte, size)
	} else {
		data = sampleBuf[:size]
	}

	if _, err := io.ReadFull(rd, data); err != nil {
		return nil, fmt.Errorf("read sample: %v", err)
	}
	return data, nil
}

// Reader allows reading bpf_perf_event_output
// from user space.
type Reader struct {
	poller   *epoll.Poller
	deadline time.Time

	// mu protects read/write access to the Reader structure with the
	// exception of 'pauseFds', which is protected by 'pauseMu'.
	// If locking both 'mu' and 'pauseMu', 'mu' must be locked first.
	mu sync.Mutex

	// Closing a PERF_EVENT_ARRAY removes all event fds
	// stored in it, so we keep a reference alive.
	array       *ebpf.Map
	rings       []*perfEventRing
	epollEvents []unix.EpollEvent
	epollRings  []*perfEventRing
	eventHeader []byte

	// pauseFds are a copy of the fds in 'rings', protected by 'pauseMu'.
	// These allow Pause/Resume to be executed independently of any ongoing
	// Read calls, which would otherwise need to be interrupted.
	pauseMu  sync.Mutex
	pauseFds []int
}

// ReaderOptions control the behaviour of the user
// space reader.
type ReaderOptions struct {
	// The number of written bytes required in any per CPU buffer before
	// Read will process data. Must be smaller than PerCPUBuffer.
	// The default is to start processing as soon as data is available.
	Watermark int
}

// NewReader creates a new reader with default options.
//
// array must be a PerfEventArray. perCPUBuffer gives the size of the
// per CPU buffer in bytes. It is rounded up to the nearest multiple
// of the current page size.
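//
// A minimal usage sketch, assuming events is an existing *ebpf.Map of type
// PerfEventArray (the variable name is hypothetical):
//
//	rd, err := perf.NewReader(events, os.Getpagesize())
//	if err != nil {
//		// handle error
//	}
//	defer rd.Close()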
func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) {
	return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{})
}

// NewReaderWithOptions creates a new reader with the given options.
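//
// For example, to delay Read until at least 4096 bytes are buffered on some
// CPU (a sketch; events is a hypothetical PerfEventArray map):
//
//	rd, err := perf.NewReaderWithOptions(events, 8*os.Getpagesize(), perf.ReaderOptions{
//		Watermark: 4096,
//	})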
func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) {
	if perCPUBuffer < 1 {
		return nil, errors.New("perCPUBuffer must be larger than 0")
	}

	var (
		fds      []int
		nCPU     = int(array.MaxEntries())
		rings    = make([]*perfEventRing, 0, nCPU)
		pauseFds = make([]int, 0, nCPU)
	)

	poller, err := epoll.New()
	if err != nil {
		return nil, err
	}

	defer func() {
		if err != nil {
			poller.Close()
			for _, fd := range fds {
				unix.Close(fd)
			}
			for _, ring := range rings {
				if ring != nil {
					ring.Close()
				}
			}
		}
	}()

	// bpf_perf_event_output checks which CPU an event is enabled on,
	// but doesn't allow using a wildcard like -1 to specify "all CPUs".
	// Hence we have to create a ring for each CPU.
	for i := 0; i < nCPU; i++ {
		ring, err := newPerfEventRing(i, perCPUBuffer, opts.Watermark)
		if errors.Is(err, unix.ENODEV) {
			// The requested CPU is currently offline, skip it.
			rings = append(rings, nil)
			pauseFds = append(pauseFds, -1)
			continue
		}

		if err != nil {
			return nil, fmt.Errorf("failed to create perf ring for CPU %d: %v", i, err)
		}
		rings = append(rings, ring)
		pauseFds = append(pauseFds, ring.fd)

		if err := poller.Add(ring.fd, i); err != nil {
			return nil, err
		}
	}

	array, err = array.Clone()
	if err != nil {
		return nil, err
	}

	pr = &Reader{
		array:       array,
		rings:       rings,
		poller:      poller,
		deadline:    time.Time{},
		epollEvents: make([]unix.EpollEvent, len(rings)),
		epollRings:  make([]*perfEventRing, 0, len(rings)),
		eventHeader: make([]byte, perfEventHeaderSize),
		pauseFds:    pauseFds,
	}
	if err = pr.Resume(); err != nil {
		return nil, err
	}
	runtime.SetFinalizer(pr, (*Reader).Close)
	return pr, nil
}

// Close frees resources used by the reader.
//
// It interrupts calls to Read.
//
// Calls to perf_event_output from eBPF programs will return
// ENOENT after calling this method.
func (pr *Reader) Close() error {
	if err := pr.poller.Close(); err != nil {
		if errors.Is(err, os.ErrClosed) {
			return nil
		}
		return fmt.Errorf("close poller: %w", err)
	}

	// Trying to poll will now fail, so Read() can't block anymore. Acquire the
	// lock so that we can clean up.
	pr.mu.Lock()
	defer pr.mu.Unlock()

	for _, ring := range pr.rings {
		if ring != nil {
			ring.Close()
		}
	}
	pr.rings = nil
	pr.pauseFds = nil
	pr.array.Close()

	return nil
}

// SetDeadline controls how long Read and ReadInto will block waiting for samples.
//
// Passing a zero time.Time will remove the deadline.
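//
// A sketch of polling with a timeout, assuming rd is an existing *perf.Reader:
//
//	rd.SetDeadline(time.Now().Add(time.Second))
//	rec, err := rd.Read()
//	if errors.Is(err, os.ErrDeadlineExceeded) {
//		// no record arrived within one second
//	}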
func (pr *Reader) SetDeadline(t time.Time) {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	pr.deadline = t
}

// Read the next record from the perf ring buffer.
//
// The function blocks until there are at least Watermark bytes in one
// of the per CPU buffers. Records from buffers below the Watermark
// are not returned.
//
// Records can contain between 0 and 7 bytes of trailing garbage from the ring
// depending on the input sample's length.
//
// Calling Close interrupts the function.
//
// Returns os.ErrDeadlineExceeded if a deadline was set and expired before a
// record was available.
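//
// A typical consumer loop, as a sketch (rd is an existing *perf.Reader):
//
//	for {
//		rec, err := rd.Read()
//		if errors.Is(err, perf.ErrClosed) {
//			return
//		}
//		if err != nil {
//			// handle error
//			continue
//		}
//		if rec.LostSamples > 0 {
//			// The ring for rec.CPU overflowed; RawSample is empty.
//			continue
//		}
//		// process rec.RawSample
//	}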
func (pr *Reader) Read() (Record, error) {
	var r Record
	return r, pr.ReadInto(&r)
}

// ReadInto is like Read except that it allows reusing Record and associated buffers.
func (pr *Reader) ReadInto(rec *Record) error {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	if pr.rings == nil {
		return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
	}

	for {
		if len(pr.epollRings) == 0 {
			nEvents, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
			if err != nil {
				return err
			}

			for _, event := range pr.epollEvents[:nEvents] {
				ring := pr.rings[cpuForEvent(&event)]
				pr.epollRings = append(pr.epollRings, ring)

				// Read the current head pointer now, not every time
				// we read a record. This prevents a single fast producer
				// from keeping the reader busy.
				ring.loadHead()
			}
		}

		// Start at the last available event. The order in which we
		// process them doesn't matter, and starting at the back allows
		// resizing epollRings to keep track of processed rings.
		err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1])
		if err == errEOR {
			// We've emptied the current ring buffer, process
			// the next one.
			pr.epollRings = pr.epollRings[:len(pr.epollRings)-1]
			continue
		}

		return err
	}
}

// Pause stops all notifications from this Reader.
//
// While the Reader is paused, any attempts to write to the event buffer from
// BPF programs will return -ENOENT.
//
// Subsequent calls to Read will block until a call to Resume.
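//
// A sketch of muting notifications around some critical section, assuming
// rd is an existing *perf.Reader:
//
//	if err := rd.Pause(); err != nil {
//		// handle error
//	}
//	// ... writes from BPF now fail with -ENOENT ...
//	if err := rd.Resume(); err != nil {
//		// handle error
//	}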
func (pr *Reader) Pause() error {
	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	if pr.pauseFds == nil {
		return fmt.Errorf("%w", ErrClosed)
	}

	for i := range pr.pauseFds {
		if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) {
			return fmt.Errorf("couldn't delete event fd for CPU %d: %w", i, err)
		}
	}

	return nil
}

// Resume allows this perf reader to emit notifications.
//
// Subsequent calls to Read will block until the next event notification.
func (pr *Reader) Resume() error {
	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	if pr.pauseFds == nil {
		return fmt.Errorf("%w", ErrClosed)
	}

	for i, fd := range pr.pauseFds {
		if fd == -1 {
			continue
		}

		if err := pr.array.Put(uint32(i), uint32(fd)); err != nil {
			return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, i, err)
		}
	}

	return nil
}

// NB: Has to be preceded by a call to ring.loadHead.
func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error {
	defer ring.writeTail()

	rec.CPU = ring.cpu
	return readRecord(ring, rec, pr.eventHeader)
}

type unknownEventError struct {
	eventType uint32
}

func (uev *unknownEventError) Error() string {
	return fmt.Sprintf("unknown event type: %d", uev.eventType)
}

// IsUnknownEvent returns true if the error occurred
// because an unknown event was submitted to the perf event ring.
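//
// A sketch of tolerating unknown records in a read loop:
//
//	rec, err := rd.Read()
//	if perf.IsUnknownEvent(err) {
//		// skip records of unexpected types
//	}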
func IsUnknownEvent(err error) bool {
	var uee *unknownEventError
	return errors.As(err, &uee)
}