github.com/cilium/ebpf@v0.16.0/perf/reader.go (about) 1 package perf 2 3 import ( 4 "encoding/binary" 5 "errors" 6 "fmt" 7 "io" 8 "os" 9 "runtime" 10 "sync" 11 "time" 12 13 "github.com/cilium/ebpf" 14 "github.com/cilium/ebpf/internal" 15 "github.com/cilium/ebpf/internal/epoll" 16 "github.com/cilium/ebpf/internal/sys" 17 "github.com/cilium/ebpf/internal/unix" 18 ) 19 20 var ( 21 ErrClosed = os.ErrClosed 22 ErrFlushed = epoll.ErrFlushed 23 errEOR = errors.New("end of ring") 24 ) 25 26 var perfEventHeaderSize = binary.Size(perfEventHeader{}) 27 28 // perfEventHeader must match 'struct perf_event_header` in <linux/perf_event.h>. 29 type perfEventHeader struct { 30 Type uint32 31 Misc uint16 32 Size uint16 33 } 34 35 // Record contains either a sample or a counter of the 36 // number of lost samples. 37 type Record struct { 38 // The CPU this record was generated on. 39 CPU int 40 41 // The data submitted via bpf_perf_event_output. 42 // Due to a kernel bug, this can contain between 0 and 7 bytes of trailing 43 // garbage from the ring depending on the input sample's length. 44 RawSample []byte 45 46 // The number of samples which could not be output, since 47 // the ring buffer was full. 48 LostSamples uint64 49 50 // The minimum number of bytes remaining in the per-CPU buffer after this Record has been read. 51 // Negative for overwritable buffers. 52 Remaining int 53 } 54 55 // Read a record from a reader and tag it as being from the given CPU. 56 // 57 // buf must be at least perfEventHeaderSize bytes long. 58 func readRecord(rd io.Reader, rec *Record, buf []byte, overwritable bool) error { 59 // Assert that the buffer is large enough. 
60 buf = buf[:perfEventHeaderSize] 61 _, err := io.ReadFull(rd, buf) 62 if errors.Is(err, io.EOF) { 63 return errEOR 64 } else if err != nil { 65 return fmt.Errorf("read perf event header: %v", err) 66 } 67 68 header := perfEventHeader{ 69 internal.NativeEndian.Uint32(buf[0:4]), 70 internal.NativeEndian.Uint16(buf[4:6]), 71 internal.NativeEndian.Uint16(buf[6:8]), 72 } 73 74 switch header.Type { 75 case unix.PERF_RECORD_LOST: 76 rec.RawSample = rec.RawSample[:0] 77 rec.LostSamples, err = readLostRecords(rd) 78 return err 79 80 case unix.PERF_RECORD_SAMPLE: 81 rec.LostSamples = 0 82 // We can reuse buf here because perfEventHeaderSize > perfEventSampleSize. 83 rec.RawSample, err = readRawSample(rd, buf, rec.RawSample) 84 return err 85 86 default: 87 return &unknownEventError{header.Type} 88 } 89 } 90 91 func readLostRecords(rd io.Reader) (uint64, error) { 92 // lostHeader must match 'struct perf_event_lost in kernel sources. 93 var lostHeader struct { 94 ID uint64 95 Lost uint64 96 } 97 98 err := binary.Read(rd, internal.NativeEndian, &lostHeader) 99 if err != nil { 100 return 0, fmt.Errorf("can't read lost records header: %v", err) 101 } 102 103 return lostHeader.Lost, nil 104 } 105 106 var perfEventSampleSize = binary.Size(uint32(0)) 107 108 // This must match 'struct perf_event_sample in kernel sources. 
109 type perfEventSample struct { 110 Size uint32 111 } 112 113 func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) { 114 buf = buf[:perfEventSampleSize] 115 if _, err := io.ReadFull(rd, buf); err != nil { 116 return nil, fmt.Errorf("read sample size: %w", err) 117 } 118 119 sample := perfEventSample{ 120 internal.NativeEndian.Uint32(buf), 121 } 122 123 var data []byte 124 if size := int(sample.Size); cap(sampleBuf) < size { 125 data = make([]byte, size) 126 } else { 127 data = sampleBuf[:size] 128 } 129 130 if _, err := io.ReadFull(rd, data); err != nil { 131 return nil, fmt.Errorf("read sample: %w", err) 132 } 133 return data, nil 134 } 135 136 // Reader allows reading bpf_perf_event_output 137 // from user space. 138 type Reader struct { 139 poller *epoll.Poller 140 141 // mu protects read/write access to the Reader structure with the 142 // exception fields protected by 'pauseMu'. 143 // If locking both 'mu' and 'pauseMu', 'mu' must be locked first. 144 mu sync.Mutex 145 array *ebpf.Map 146 rings []*perfEventRing 147 epollEvents []unix.EpollEvent 148 epollRings []*perfEventRing 149 eventHeader []byte 150 deadline time.Time 151 overwritable bool 152 bufferSize int 153 pendingErr error 154 155 // pauseMu protects eventFds so that Pause / Resume can be invoked while 156 // Read is blocked. 157 pauseMu sync.Mutex 158 eventFds []*sys.FD 159 paused bool 160 } 161 162 // ReaderOptions control the behaviour of the user 163 // space reader. 164 type ReaderOptions struct { 165 // The number of events required in any per CPU buffer before 166 // Read will process data. This is mutually exclusive with Watermark. 167 // The default is zero, which means Watermark will take precedence. 168 WakeupEvents int 169 // The number of written bytes required in any per CPU buffer before 170 // Read will process data. Must be smaller than PerCPUBuffer. 171 // The default is to start processing as soon as data is available. 
172 Watermark int 173 // This perf ring buffer is overwritable, once full the oldest event will be 174 // overwritten by newest. 175 Overwritable bool 176 } 177 178 // NewReader creates a new reader with default options. 179 // 180 // array must be a PerfEventArray. perCPUBuffer gives the size of the 181 // per CPU buffer in bytes. It is rounded up to the nearest multiple 182 // of the current page size. 183 func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) { 184 return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{}) 185 } 186 187 // NewReaderWithOptions creates a new reader with the given options. 188 func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) { 189 closeOnError := func(c io.Closer) { 190 if err != nil { 191 c.Close() 192 } 193 } 194 195 if perCPUBuffer < 1 { 196 return nil, errors.New("perCPUBuffer must be larger than 0") 197 } 198 if opts.WakeupEvents > 0 && opts.Watermark > 0 { 199 return nil, errors.New("WakeupEvents and Watermark cannot both be non-zero") 200 } 201 202 var ( 203 nCPU = int(array.MaxEntries()) 204 rings = make([]*perfEventRing, 0, nCPU) 205 eventFds = make([]*sys.FD, 0, nCPU) 206 ) 207 208 poller, err := epoll.New() 209 if err != nil { 210 return nil, err 211 } 212 defer closeOnError(poller) 213 214 // bpf_perf_event_output checks which CPU an event is enabled on, 215 // but doesn't allow using a wildcard like -1 to specify "all CPUs". 216 // Hence we have to create a ring for each CPU. 217 bufferSize := 0 218 for i := 0; i < nCPU; i++ { 219 event, ring, err := newPerfEventRing(i, perCPUBuffer, opts) 220 if errors.Is(err, unix.ENODEV) { 221 // The requested CPU is currently offline, skip it. 
222 continue 223 } 224 225 if err != nil { 226 return nil, fmt.Errorf("failed to create perf ring for CPU %d: %v", i, err) 227 } 228 defer closeOnError(event) 229 defer closeOnError(ring) 230 231 bufferSize = ring.size() 232 rings = append(rings, ring) 233 eventFds = append(eventFds, event) 234 235 if err := poller.Add(event.Int(), 0); err != nil { 236 return nil, err 237 } 238 } 239 240 // Closing a PERF_EVENT_ARRAY removes all event fds 241 // stored in it, so we keep a reference alive. 242 array, err = array.Clone() 243 if err != nil { 244 return nil, err 245 } 246 247 pr = &Reader{ 248 array: array, 249 rings: rings, 250 poller: poller, 251 deadline: time.Time{}, 252 epollEvents: make([]unix.EpollEvent, len(rings)), 253 epollRings: make([]*perfEventRing, 0, len(rings)), 254 eventHeader: make([]byte, perfEventHeaderSize), 255 eventFds: eventFds, 256 overwritable: opts.Overwritable, 257 bufferSize: bufferSize, 258 } 259 if err = pr.Resume(); err != nil { 260 return nil, err 261 } 262 runtime.SetFinalizer(pr, (*Reader).Close) 263 return pr, nil 264 } 265 266 // Close frees resources used by the reader. 267 // 268 // It interrupts calls to Read. 269 // 270 // Calls to perf_event_output from eBPF programs will return 271 // ENOENT after calling this method. 272 func (pr *Reader) Close() error { 273 if err := pr.poller.Close(); err != nil { 274 if errors.Is(err, os.ErrClosed) { 275 return nil 276 } 277 return fmt.Errorf("close poller: %w", err) 278 } 279 280 // Trying to poll will now fail, so Read() can't block anymore. Acquire the 281 // locks so that we can clean up. 282 pr.mu.Lock() 283 defer pr.mu.Unlock() 284 285 pr.pauseMu.Lock() 286 defer pr.pauseMu.Unlock() 287 288 for _, ring := range pr.rings { 289 ring.Close() 290 } 291 for _, event := range pr.eventFds { 292 event.Close() 293 } 294 pr.rings = nil 295 pr.eventFds = nil 296 pr.array.Close() 297 298 return nil 299 } 300 301 // SetDeadline controls how long Read and ReadInto will block waiting for samples. 
//
// Passing a zero time.Time will remove the deadline. Passing a deadline in the
// past will prevent the reader from blocking if there are no records to be read.
func (pr *Reader) SetDeadline(t time.Time) {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	pr.deadline = t
}

// Read the next record from the perf ring buffer.
//
// The method blocks until there are at least Watermark bytes in one
// of the per CPU buffers. Records from buffers below the Watermark
// are not returned.
//
// Records can contain between 0 and 7 bytes of trailing garbage from the ring
// depending on the input sample's length.
//
// Calling [Close] interrupts the method with [os.ErrClosed]. Calling [Flush]
// makes it return all records currently in the ring buffer, followed by [ErrFlushed].
//
// Returns [os.ErrDeadlineExceeded] if a deadline was set and after all records
// have been read from the ring.
//
// See [Reader.ReadInto] for a more efficient version of this method.
func (pr *Reader) Read() (Record, error) {
	var r Record

	return r, pr.ReadInto(&r)
}

var errMustBePaused = fmt.Errorf("perf ringbuffer: must have been paused before reading overwritable buffer")

// ReadInto is like [Reader.Read] except that it allows reusing Record and associated buffers.
func (pr *Reader) ReadInto(rec *Record) error {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	if pr.overwritable && !pr.paused {
		// Overwritable rings are only safely readable while the kernel is
		// not writing to them, i.e. after Pause.
		return errMustBePaused
	}

	if pr.rings == nil {
		// Close sets pr.rings to nil.
		return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
	}

	for {
		// epollRings is a stack of rings that still contain unread data;
		// when it is empty we must poll again.
		if len(pr.epollRings) == 0 {
			if pe := pr.pendingErr; pe != nil {
				// All rings have been emptied since the error occurred, return
				// appropriate error.
				pr.pendingErr = nil
				return pe
			}

			// NB: The deferred pauseMu.Unlock will panic if Wait panics, which
			// might obscure the original panic.
			pr.pauseMu.Unlock()
			_, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
			pr.pauseMu.Lock()

			if errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, ErrFlushed) {
				// We've hit the deadline, check whether there is any data in
				// the rings that we've not been woken up for.
				// The error is latched in pendingErr and only returned once
				// all remaining records have been drained.
				pr.pendingErr = err
			} else if err != nil {
				return err
			}

			// Re-validate pr.paused since we dropped pauseMu.
			if pr.overwritable && !pr.paused {
				return errMustBePaused
			}

			// Waking up userspace is expensive, make the most of it by checking
			// all rings.
			for _, ring := range pr.rings {
				ring.loadHead()
				pr.epollRings = append(pr.epollRings, ring)
			}
		}

		// Start at the last available event. The order in which we
		// process them doesn't matter, and starting at the back allows
		// resizing epollRings to keep track of processed rings.
		err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1])
		if err == errEOR {
			// We've emptied the current ring buffer, process
			// the next one.
			pr.epollRings = pr.epollRings[:len(pr.epollRings)-1]
			continue
		}

		return err
	}
}

// Pause stops all notifications from this Reader.
//
// While the Reader is paused, any attempts to write to the event buffer from
// BPF programs will return -ENOENT.
//
// Subsequent calls to Read will block until a call to Resume.
409 func (pr *Reader) Pause() error { 410 pr.pauseMu.Lock() 411 defer pr.pauseMu.Unlock() 412 413 if pr.eventFds == nil { 414 return fmt.Errorf("%w", ErrClosed) 415 } 416 417 for i := range pr.eventFds { 418 if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) { 419 return fmt.Errorf("could't delete event fd for CPU %d: %w", i, err) 420 } 421 } 422 423 pr.paused = true 424 425 return nil 426 } 427 428 // Resume allows this perf reader to emit notifications. 429 // 430 // Subsequent calls to Read will block until the next event notification. 431 func (pr *Reader) Resume() error { 432 pr.pauseMu.Lock() 433 defer pr.pauseMu.Unlock() 434 435 if pr.eventFds == nil { 436 return fmt.Errorf("%w", ErrClosed) 437 } 438 439 for i, fd := range pr.eventFds { 440 if fd == nil { 441 continue 442 } 443 444 if err := pr.array.Put(uint32(i), fd.Uint()); err != nil { 445 return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, i, err) 446 } 447 } 448 449 pr.paused = false 450 451 return nil 452 } 453 454 // BufferSize is the size in bytes of each per-CPU buffer 455 func (pr *Reader) BufferSize() int { 456 return pr.bufferSize 457 } 458 459 // Flush unblocks Read/ReadInto and successive Read/ReadInto calls will return pending samples at this point, 460 // until you receive a [ErrFlushed] error. 461 func (pr *Reader) Flush() error { 462 return pr.poller.Flush() 463 } 464 465 // NB: Has to be preceded by a call to ring.loadHead. 
466 func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error { 467 defer ring.writeTail() 468 469 rec.CPU = ring.cpu 470 err := readRecord(ring, rec, pr.eventHeader, pr.overwritable) 471 if pr.overwritable && (errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)) { 472 return errEOR 473 } 474 rec.Remaining = ring.remaining() 475 return err 476 } 477 478 type unknownEventError struct { 479 eventType uint32 480 } 481 482 func (uev *unknownEventError) Error() string { 483 return fmt.Sprintf("unknown event type: %d", uev.eventType) 484 } 485 486 // IsUnknownEvent returns true if the error occurred 487 // because an unknown event was submitted to the perf event ring. 488 func IsUnknownEvent(err error) bool { 489 var uee *unknownEventError 490 return errors.As(err, &uee) 491 }