github.com/cilium/ebpf@v0.15.1-0.20240517100537-8079b37aa138/perf/reader.go (about) 1 package perf 2 3 import ( 4 "encoding/binary" 5 "errors" 6 "fmt" 7 "io" 8 "os" 9 "runtime" 10 "sync" 11 "time" 12 13 "github.com/cilium/ebpf" 14 "github.com/cilium/ebpf/internal" 15 "github.com/cilium/ebpf/internal/epoll" 16 "github.com/cilium/ebpf/internal/sys" 17 "github.com/cilium/ebpf/internal/unix" 18 ) 19 20 var ( 21 ErrClosed = os.ErrClosed 22 errEOR = errors.New("end of ring") 23 ) 24 25 var perfEventHeaderSize = binary.Size(perfEventHeader{}) 26 27 // perfEventHeader must match 'struct perf_event_header` in <linux/perf_event.h>. 28 type perfEventHeader struct { 29 Type uint32 30 Misc uint16 31 Size uint16 32 } 33 34 // Record contains either a sample or a counter of the 35 // number of lost samples. 36 type Record struct { 37 // The CPU this record was generated on. 38 CPU int 39 40 // The data submitted via bpf_perf_event_output. 41 // Due to a kernel bug, this can contain between 0 and 7 bytes of trailing 42 // garbage from the ring depending on the input sample's length. 43 RawSample []byte 44 45 // The number of samples which could not be output, since 46 // the ring buffer was full. 47 LostSamples uint64 48 49 // The minimum number of bytes remaining in the per-CPU buffer after this Record has been read. 50 // Negative for overwritable buffers. 51 Remaining int 52 } 53 54 // Read a record from a reader and tag it as being from the given CPU. 55 // 56 // buf must be at least perfEventHeaderSize bytes long. 57 func readRecord(rd io.Reader, rec *Record, buf []byte, overwritable bool) error { 58 // Assert that the buffer is large enough. 
59 buf = buf[:perfEventHeaderSize] 60 _, err := io.ReadFull(rd, buf) 61 if errors.Is(err, io.EOF) { 62 return errEOR 63 } else if err != nil { 64 return fmt.Errorf("read perf event header: %v", err) 65 } 66 67 header := perfEventHeader{ 68 internal.NativeEndian.Uint32(buf[0:4]), 69 internal.NativeEndian.Uint16(buf[4:6]), 70 internal.NativeEndian.Uint16(buf[6:8]), 71 } 72 73 switch header.Type { 74 case unix.PERF_RECORD_LOST: 75 rec.RawSample = rec.RawSample[:0] 76 rec.LostSamples, err = readLostRecords(rd) 77 return err 78 79 case unix.PERF_RECORD_SAMPLE: 80 rec.LostSamples = 0 81 // We can reuse buf here because perfEventHeaderSize > perfEventSampleSize. 82 rec.RawSample, err = readRawSample(rd, buf, rec.RawSample) 83 return err 84 85 default: 86 return &unknownEventError{header.Type} 87 } 88 } 89 90 func readLostRecords(rd io.Reader) (uint64, error) { 91 // lostHeader must match 'struct perf_event_lost in kernel sources. 92 var lostHeader struct { 93 ID uint64 94 Lost uint64 95 } 96 97 err := binary.Read(rd, internal.NativeEndian, &lostHeader) 98 if err != nil { 99 return 0, fmt.Errorf("can't read lost records header: %v", err) 100 } 101 102 return lostHeader.Lost, nil 103 } 104 105 var perfEventSampleSize = binary.Size(uint32(0)) 106 107 // This must match 'struct perf_event_sample in kernel sources. 
type perfEventSample struct {
	// Size is the number of payload bytes that follow this field.
	Size uint32
}

// readRawSample reads a PERF_RECORD_SAMPLE payload from rd.
//
// buf is scratch space used to decode the sample size and must be at least
// perfEventSampleSize bytes long. sampleBuf is reused for the payload when
// its capacity suffices, otherwise a new slice is allocated. The slice
// holding the payload is returned.
func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) {
	buf = buf[:perfEventSampleSize]
	if _, err := io.ReadFull(rd, buf); err != nil {
		return nil, fmt.Errorf("read sample size: %w", err)
	}

	sample := perfEventSample{
		internal.NativeEndian.Uint32(buf),
	}

	var data []byte
	if size := int(sample.Size); cap(sampleBuf) < size {
		data = make([]byte, size)
	} else {
		// Reuse the caller's buffer to avoid an allocation per sample.
		data = sampleBuf[:size]
	}

	if _, err := io.ReadFull(rd, data); err != nil {
		return nil, fmt.Errorf("read sample: %w", err)
	}
	return data, nil
}

// Reader allows reading bpf_perf_event_output
// from user space.
type Reader struct {
	poller   *epoll.Poller
	deadline time.Time

	// mu protects read/write access to the Reader structure with the
	// exception of 'pauseFds', which is protected by 'pauseMu'.
	// If locking both 'mu' and 'pauseMu', 'mu' must be locked first.
	mu sync.Mutex

	// Closing a PERF_EVENT_ARRAY removes all event fds
	// stored in it, so we keep a reference alive.
	array *ebpf.Map
	// rings has one entry per CPU; entries for CPUs which were offline
	// during NewReaderWithOptions are nil.
	rings       []*perfEventRing
	epollEvents []unix.EpollEvent
	// epollRings queues the rings which may still hold unread data; it is
	// drained from the back by ReadInto.
	epollRings []*perfEventRing
	// eventHeader is scratch space for decoding perf event headers.
	eventHeader []byte

	// pauseMu protects eventFds so that Pause / Resume can be invoked while
	// Read is blocked.
	pauseMu  sync.Mutex
	eventFds []*sys.FD

	paused       bool
	overwritable bool

	bufferSize int
}

// ReaderOptions control the behaviour of the user
// space reader.
type ReaderOptions struct {
	// The number of events required in any per CPU buffer before
	// Read will process data. This is mutually exclusive with Watermark.
	// The default is zero, which means Watermark will take precedence.
	WakeupEvents int
	// The number of written bytes required in any per CPU buffer before
	// Read will process data. Must be smaller than PerCPUBuffer.
	// The default is to start processing as soon as data is available.
	Watermark int
	// This perf ring buffer is overwritable, once full the oldest event will be
	// overwritten by newest.
	Overwritable bool
}

// NewReader creates a new reader with default options.
//
// array must be a PerfEventArray. perCPUBuffer gives the size of the
// per CPU buffer in bytes. It is rounded up to the nearest multiple
// of the current page size.
func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) {
	return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{})
}

// NewReaderWithOptions creates a new reader with the given options.
func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) {
	// closeOnError closes c only if the constructor ends up failing: the
	// named return err is inspected when the deferred calls run.
	closeOnError := func(c io.Closer) {
		if err != nil {
			c.Close()
		}
	}

	if perCPUBuffer < 1 {
		return nil, errors.New("perCPUBuffer must be larger than 0")
	}
	if opts.WakeupEvents > 0 && opts.Watermark > 0 {
		return nil, errors.New("WakeupEvents and Watermark cannot both be non-zero")
	}

	var (
		nCPU     = int(array.MaxEntries())
		rings    = make([]*perfEventRing, 0, nCPU)
		eventFds = make([]*sys.FD, 0, nCPU)
	)

	poller, err := epoll.New()
	if err != nil {
		return nil, err
	}
	defer closeOnError(poller)

	// bpf_perf_event_output checks which CPU an event is enabled on,
	// but doesn't allow using a wildcard like -1 to specify "all CPUs".
	// Hence we have to create a ring for each CPU.
	bufferSize := 0
	for i := 0; i < nCPU; i++ {
		event, ring, err := newPerfEventRing(i, perCPUBuffer, opts)
		if errors.Is(err, unix.ENODEV) {
			// The requested CPU is currently offline, skip it.
			// Append nil placeholders so the slices stay indexed by CPU.
			rings = append(rings, nil)
			eventFds = append(eventFds, nil)
			continue
		}

		if err != nil {
			return nil, fmt.Errorf("failed to create perf ring for CPU %d: %v", i, err)
		}
		// Deferring inside the loop is deliberate: every fd created so far
		// must be released if a later iteration fails.
		defer closeOnError(event)
		defer closeOnError(ring)

		bufferSize = ring.size()
		rings = append(rings, ring)
		eventFds = append(eventFds, event)

		if err := poller.Add(event.Int(), 0); err != nil {
			return nil, err
		}
	}

	// Keep our own reference so that closing the caller's copy of the map
	// doesn't tear down the event fds stored in it.
	array, err = array.Clone()
	if err != nil {
		return nil, err
	}

	pr = &Reader{
		array:        array,
		rings:        rings,
		poller:       poller,
		deadline:     time.Time{},
		epollEvents:  make([]unix.EpollEvent, len(rings)),
		epollRings:   make([]*perfEventRing, 0, len(rings)),
		eventHeader:  make([]byte, perfEventHeaderSize),
		eventFds:     eventFds,
		overwritable: opts.Overwritable,
		bufferSize:   bufferSize,
	}
	// Publish the event fds into the array so BPF programs can start
	// writing to the rings.
	if err = pr.Resume(); err != nil {
		return nil, err
	}
	runtime.SetFinalizer(pr, (*Reader).Close)
	return pr, nil
}

// Close frees resources used by the reader.
//
// It interrupts calls to Read.
//
// Calls to perf_event_output from eBPF programs will return
// ENOENT after calling this method.
func (pr *Reader) Close() error {
	if err := pr.poller.Close(); err != nil {
		if errors.Is(err, os.ErrClosed) {
			// Already closed, Close is idempotent.
			return nil
		}
		return fmt.Errorf("close poller: %w", err)
	}

	// Trying to poll will now fail, so Read() can't block anymore. Acquire the
	// locks so that we can clean up.
	pr.mu.Lock()
	defer pr.mu.Unlock()

	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	// nil entries correspond to offline CPUs, see NewReaderWithOptions.
	for _, ring := range pr.rings {
		if ring != nil {
			ring.Close()
		}
	}
	for _, event := range pr.eventFds {
		if event != nil {
			event.Close()
		}
	}
	// nil slices let ReadInto / Pause / Resume detect the closed state.
	pr.rings = nil
	pr.eventFds = nil
	pr.array.Close()

	return nil
}

// SetDeadline controls how long Read and ReadInto will block waiting for samples.
//
// Passing a zero time.Time will remove the deadline. Passing a deadline in the
// past will prevent the reader from blocking if there are no records to be read.
func (pr *Reader) SetDeadline(t time.Time) {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	pr.deadline = t
}

// Read the next record from the perf ring buffer.
//
// The function blocks until there are at least Watermark bytes in one
// of the per CPU buffers. Records from buffers below the Watermark
// are not returned.
//
// Records can contain between 0 and 7 bytes of trailing garbage from the ring
// depending on the input sample's length.
//
// Calling Close interrupts the function.
//
// Returns [os.ErrDeadlineExceeded] if a deadline was set and the perf ring buffer
// was empty. Otherwise returns a record and no error, even if the deadline was
// exceeded.
//
// See [Reader.ReadInto] for a more efficient version of this method.
func (pr *Reader) Read() (Record, error) {
	var r Record

	return r, pr.ReadInto(&r)
}

var errMustBePaused = fmt.Errorf("perf ringbuffer: must have been paused before reading overwritable buffer")

// ReadInto is like [Reader.Read] except that it allows reusing Record and associated buffers.
344 func (pr *Reader) ReadInto(rec *Record) error { 345 pr.mu.Lock() 346 defer pr.mu.Unlock() 347 348 pr.pauseMu.Lock() 349 defer pr.pauseMu.Unlock() 350 351 if pr.overwritable && !pr.paused { 352 return errMustBePaused 353 } 354 355 if pr.rings == nil { 356 return fmt.Errorf("perf ringbuffer: %w", ErrClosed) 357 } 358 359 deadlineWasExceeded := false 360 for { 361 if len(pr.epollRings) == 0 { 362 if deadlineWasExceeded { 363 // All rings were empty when the deadline expired, return 364 // appropriate error. 365 return os.ErrDeadlineExceeded 366 } 367 368 // NB: The deferred pauseMu.Unlock will panic if Wait panics, which 369 // might obscure the original panic. 370 pr.pauseMu.Unlock() 371 _, err := pr.poller.Wait(pr.epollEvents, pr.deadline) 372 pr.pauseMu.Lock() 373 374 if errors.Is(err, os.ErrDeadlineExceeded) { 375 // We've hit the deadline, check whether there is any data in 376 // the rings that we've not been woken up for. 377 deadlineWasExceeded = true 378 } else if err != nil { 379 return err 380 } 381 382 // Re-validate pr.paused since we dropped pauseMu. 383 if pr.overwritable && !pr.paused { 384 return errMustBePaused 385 } 386 387 // Waking up userspace is expensive, make the most of it by checking 388 // all rings. 389 for _, ring := range pr.rings { 390 ring.loadHead() 391 pr.epollRings = append(pr.epollRings, ring) 392 } 393 } 394 395 // Start at the last available event. The order in which we 396 // process them doesn't matter, and starting at the back allows 397 // resizing epollRings to keep track of processed rings. 398 err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1]) 399 if err == errEOR { 400 // We've emptied the current ring buffer, process 401 // the next one. 402 pr.epollRings = pr.epollRings[:len(pr.epollRings)-1] 403 continue 404 } 405 406 return err 407 } 408 } 409 410 // Pause stops all notifications from this Reader. 
411 // 412 // While the Reader is paused, any attempts to write to the event buffer from 413 // BPF programs will return -ENOENT. 414 // 415 // Subsequent calls to Read will block until a call to Resume. 416 func (pr *Reader) Pause() error { 417 pr.pauseMu.Lock() 418 defer pr.pauseMu.Unlock() 419 420 if pr.eventFds == nil { 421 return fmt.Errorf("%w", ErrClosed) 422 } 423 424 for i := range pr.eventFds { 425 if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) { 426 return fmt.Errorf("could't delete event fd for CPU %d: %w", i, err) 427 } 428 } 429 430 pr.paused = true 431 432 return nil 433 } 434 435 // Resume allows this perf reader to emit notifications. 436 // 437 // Subsequent calls to Read will block until the next event notification. 438 func (pr *Reader) Resume() error { 439 pr.pauseMu.Lock() 440 defer pr.pauseMu.Unlock() 441 442 if pr.eventFds == nil { 443 return fmt.Errorf("%w", ErrClosed) 444 } 445 446 for i, fd := range pr.eventFds { 447 if fd == nil { 448 continue 449 } 450 451 if err := pr.array.Put(uint32(i), fd.Uint()); err != nil { 452 return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, i, err) 453 } 454 } 455 456 pr.paused = false 457 458 return nil 459 } 460 461 // BufferSize is the size in bytes of each per-CPU buffer 462 func (pr *Reader) BufferSize() int { 463 return pr.bufferSize 464 } 465 466 // NB: Has to be preceded by a call to ring.loadHead. 
467 func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error { 468 defer ring.writeTail() 469 470 rec.CPU = ring.cpu 471 err := readRecord(ring, rec, pr.eventHeader, pr.overwritable) 472 if pr.overwritable && (errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)) { 473 return errEOR 474 } 475 rec.Remaining = ring.remaining() 476 return err 477 } 478 479 type unknownEventError struct { 480 eventType uint32 481 } 482 483 func (uev *unknownEventError) Error() string { 484 return fmt.Sprintf("unknown event type: %d", uev.eventType) 485 } 486 487 // IsUnknownEvent returns true if the error occurred 488 // because an unknown event was submitted to the perf event ring. 489 func IsUnknownEvent(err error) bool { 490 var uee *unknownEventError 491 return errors.As(err, &uee) 492 }