github.com/cilium/ebpf@v0.15.0/perf/reader.go (about) 1 package perf 2 3 import ( 4 "encoding/binary" 5 "errors" 6 "fmt" 7 "io" 8 "os" 9 "runtime" 10 "sync" 11 "time" 12 13 "github.com/cilium/ebpf" 14 "github.com/cilium/ebpf/internal" 15 "github.com/cilium/ebpf/internal/epoll" 16 "github.com/cilium/ebpf/internal/unix" 17 ) 18 19 var ( 20 ErrClosed = os.ErrClosed 21 errEOR = errors.New("end of ring") 22 ) 23 24 var perfEventHeaderSize = binary.Size(perfEventHeader{}) 25 26 // perfEventHeader must match 'struct perf_event_header` in <linux/perf_event.h>. 27 type perfEventHeader struct { 28 Type uint32 29 Misc uint16 30 Size uint16 31 } 32 33 func cpuForEvent(event *unix.EpollEvent) int { 34 return int(event.Pad) 35 } 36 37 // Record contains either a sample or a counter of the 38 // number of lost samples. 39 type Record struct { 40 // The CPU this record was generated on. 41 CPU int 42 43 // The data submitted via bpf_perf_event_output. 44 // Due to a kernel bug, this can contain between 0 and 7 bytes of trailing 45 // garbage from the ring depending on the input sample's length. 46 RawSample []byte 47 48 // The number of samples which could not be output, since 49 // the ring buffer was full. 50 LostSamples uint64 51 52 // The minimum number of bytes remaining in the per-CPU buffer after this Record has been read. 53 // Negative for overwritable buffers. 54 Remaining int 55 } 56 57 // Read a record from a reader and tag it as being from the given CPU. 58 // 59 // buf must be at least perfEventHeaderSize bytes long. 60 func readRecord(rd io.Reader, rec *Record, buf []byte, overwritable bool) error { 61 // Assert that the buffer is large enough. 62 buf = buf[:perfEventHeaderSize] 63 _, err := io.ReadFull(rd, buf) 64 if errors.Is(err, io.EOF) { 65 return errEOR 66 } else if err != nil { 67 return fmt.Errorf("read perf event header: %v", err) 68 } 69 70 header := perfEventHeader{ 71 internal.NativeEndian.Uint32(buf[0:4]), 72 internal.NativeEndian.Uint16(buf[4:6]), 73 internal.NativeEndian.Uint16(buf[6:8]), 74 } 75 76 switch header.Type { 77 case unix.PERF_RECORD_LOST: 78 rec.RawSample = rec.RawSample[:0] 79 rec.LostSamples, err = readLostRecords(rd) 80 return err 81 82 case unix.PERF_RECORD_SAMPLE: 83 rec.LostSamples = 0 84 // We can reuse buf here because perfEventHeaderSize > perfEventSampleSize. 85 rec.RawSample, err = readRawSample(rd, buf, rec.RawSample) 86 return err 87 88 default: 89 return &unknownEventError{header.Type} 90 } 91 } 92 93 func readLostRecords(rd io.Reader) (uint64, error) { 94 // lostHeader must match 'struct perf_event_lost in kernel sources. 95 var lostHeader struct { 96 ID uint64 97 Lost uint64 98 } 99 100 err := binary.Read(rd, internal.NativeEndian, &lostHeader) 101 if err != nil { 102 return 0, fmt.Errorf("can't read lost records header: %v", err) 103 } 104 105 return lostHeader.Lost, nil 106 } 107 108 var perfEventSampleSize = binary.Size(uint32(0)) 109 110 // This must match 'struct perf_event_sample in kernel sources. 111 type perfEventSample struct { 112 Size uint32 113 } 114 115 func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) { 116 buf = buf[:perfEventSampleSize] 117 if _, err := io.ReadFull(rd, buf); err != nil { 118 return nil, fmt.Errorf("read sample size: %w", err) 119 } 120 121 sample := perfEventSample{ 122 internal.NativeEndian.Uint32(buf), 123 } 124 125 var data []byte 126 if size := int(sample.Size); cap(sampleBuf) < size { 127 data = make([]byte, size) 128 } else { 129 data = sampleBuf[:size] 130 } 131 132 if _, err := io.ReadFull(rd, data); err != nil { 133 return nil, fmt.Errorf("read sample: %w", err) 134 } 135 return data, nil 136 } 137 138 // Reader allows reading bpf_perf_event_output 139 // from user space. 140 type Reader struct { 141 poller *epoll.Poller 142 deadline time.Time 143 144 // mu protects read/write access to the Reader structure with the 145 // exception of 'pauseFds', which is protected by 'pauseMu'. 146 // If locking both 'mu' and 'pauseMu', 'mu' must be locked first. 147 mu sync.Mutex 148 149 // Closing a PERF_EVENT_ARRAY removes all event fds 150 // stored in it, so we keep a reference alive. 151 array *ebpf.Map 152 rings []*perfEventRing 153 epollEvents []unix.EpollEvent 154 epollRings []*perfEventRing 155 eventHeader []byte 156 157 // pauseFds are a copy of the fds in 'rings', protected by 'pauseMu'. 158 // These allow Pause/Resume to be executed independently of any ongoing 159 // Read calls, which would otherwise need to be interrupted. 160 pauseMu sync.Mutex 161 pauseFds []int 162 163 paused bool 164 overwritable bool 165 166 bufferSize int 167 } 168 169 // ReaderOptions control the behaviour of the user 170 // space reader. 171 type ReaderOptions struct { 172 // The number of events required in any per CPU buffer before 173 // Read will process data. This is mutually exclusive with Watermark. 174 // The default is zero, which means Watermark will take precedence. 175 WakeupEvents int 176 // The number of written bytes required in any per CPU buffer before 177 // Read will process data. Must be smaller than PerCPUBuffer. 178 // The default is to start processing as soon as data is available. 179 Watermark int 180 // This perf ring buffer is overwritable, once full the oldest event will be 181 // overwritten by newest. 182 Overwritable bool 183 } 184 185 // NewReader creates a new reader with default options. 186 // 187 // array must be a PerfEventArray. perCPUBuffer gives the size of the 188 // per CPU buffer in bytes. It is rounded up to the nearest multiple 189 // of the current page size. 190 func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) { 191 return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{}) 192 } 193 194 // NewReaderWithOptions creates a new reader with the given options. 195 func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) { 196 if perCPUBuffer < 1 { 197 return nil, errors.New("perCPUBuffer must be larger than 0") 198 } 199 if opts.WakeupEvents > 0 && opts.Watermark > 0 { 200 return nil, errors.New("WakeupEvents and Watermark cannot both be non-zero") 201 } 202 203 var ( 204 fds []int 205 nCPU = int(array.MaxEntries()) 206 rings = make([]*perfEventRing, 0, nCPU) 207 pauseFds = make([]int, 0, nCPU) 208 ) 209 210 poller, err := epoll.New() 211 if err != nil { 212 return nil, err 213 } 214 215 defer func() { 216 if err != nil { 217 poller.Close() 218 for _, fd := range fds { 219 unix.Close(fd) 220 } 221 for _, ring := range rings { 222 if ring != nil { 223 ring.Close() 224 } 225 } 226 } 227 }() 228 229 // bpf_perf_event_output checks which CPU an event is enabled on, 230 // but doesn't allow using a wildcard like -1 to specify "all CPUs". 231 // Hence we have to create a ring for each CPU. 232 bufferSize := 0 233 for i := 0; i < nCPU; i++ { 234 ring, err := newPerfEventRing(i, perCPUBuffer, opts) 235 if errors.Is(err, unix.ENODEV) { 236 // The requested CPU is currently offline, skip it. 237 rings = append(rings, nil) 238 pauseFds = append(pauseFds, -1) 239 continue 240 } 241 242 if err != nil { 243 return nil, fmt.Errorf("failed to create perf ring for CPU %d: %v", i, err) 244 } 245 246 bufferSize = ring.size() 247 rings = append(rings, ring) 248 pauseFds = append(pauseFds, ring.fd) 249 250 if err := poller.Add(ring.fd, i); err != nil { 251 return nil, err 252 } 253 } 254 255 array, err = array.Clone() 256 if err != nil { 257 return nil, err 258 } 259 260 pr = &Reader{ 261 array: array, 262 rings: rings, 263 poller: poller, 264 deadline: time.Time{}, 265 epollEvents: make([]unix.EpollEvent, len(rings)), 266 epollRings: make([]*perfEventRing, 0, len(rings)), 267 eventHeader: make([]byte, perfEventHeaderSize), 268 pauseFds: pauseFds, 269 overwritable: opts.Overwritable, 270 bufferSize: bufferSize, 271 } 272 if err = pr.Resume(); err != nil { 273 return nil, err 274 } 275 runtime.SetFinalizer(pr, (*Reader).Close) 276 return pr, nil 277 } 278 279 // Close frees resources used by the reader. 280 // 281 // It interrupts calls to Read. 282 // 283 // Calls to perf_event_output from eBPF programs will return 284 // ENOENT after calling this method. 285 func (pr *Reader) Close() error { 286 if err := pr.poller.Close(); err != nil { 287 if errors.Is(err, os.ErrClosed) { 288 return nil 289 } 290 return fmt.Errorf("close poller: %w", err) 291 } 292 293 // Trying to poll will now fail, so Read() can't block anymore. Acquire the 294 // lock so that we can clean up. 295 pr.mu.Lock() 296 defer pr.mu.Unlock() 297 298 for _, ring := range pr.rings { 299 if ring != nil { 300 ring.Close() 301 } 302 } 303 pr.rings = nil 304 pr.pauseFds = nil 305 pr.array.Close() 306 307 return nil 308 } 309 310 // SetDeadline controls how long Read and ReadInto will block waiting for samples. 311 // 312 // Passing a zero time.Time will remove the deadline. Passing a deadline in the 313 // past will prevent the reader from blocking if there are no records to be read. 314 func (pr *Reader) SetDeadline(t time.Time) { 315 pr.mu.Lock() 316 defer pr.mu.Unlock() 317 318 pr.deadline = t 319 } 320 321 // Read the next record from the perf ring buffer. 322 // 323 // The function blocks until there are at least Watermark bytes in one 324 // of the per CPU buffers. Records from buffers below the Watermark 325 // are not returned. 326 // 327 // Records can contain between 0 and 7 bytes of trailing garbage from the ring 328 // depending on the input sample's length. 329 // 330 // Calling Close interrupts the function. 331 // 332 // Returns os.ErrDeadlineExceeded if a deadline was set. 333 func (pr *Reader) Read() (Record, error) { 334 var r Record 335 336 return r, pr.ReadInto(&r) 337 } 338 339 var errMustBePaused = fmt.Errorf("perf ringbuffer: must have been paused before reading overwritable buffer") 340 341 // ReadInto is like Read except that it allows reusing Record and associated buffers. 342 func (pr *Reader) ReadInto(rec *Record) error { 343 pr.mu.Lock() 344 defer pr.mu.Unlock() 345 346 pr.pauseMu.Lock() 347 defer pr.pauseMu.Unlock() 348 349 if pr.overwritable && !pr.paused { 350 return errMustBePaused 351 } 352 353 if pr.rings == nil { 354 return fmt.Errorf("perf ringbuffer: %w", ErrClosed) 355 } 356 357 for { 358 if len(pr.epollRings) == 0 { 359 // NB: The deferred pauseMu.Unlock will panic if Wait panics, which 360 // might obscure the original panic. 361 pr.pauseMu.Unlock() 362 nEvents, err := pr.poller.Wait(pr.epollEvents, pr.deadline) 363 pr.pauseMu.Lock() 364 if err != nil { 365 return err 366 } 367 368 // Re-validate pr.paused since we dropped pauseMu. 369 if pr.overwritable && !pr.paused { 370 return errMustBePaused 371 } 372 373 for _, event := range pr.epollEvents[:nEvents] { 374 ring := pr.rings[cpuForEvent(&event)] 375 pr.epollRings = append(pr.epollRings, ring) 376 377 // Read the current head pointer now, not every time 378 // we read a record. This prevents a single fast producer 379 // from keeping the reader busy. 380 ring.loadHead() 381 } 382 } 383 384 // Start at the last available event. The order in which we 385 // process them doesn't matter, and starting at the back allows 386 // resizing epollRings to keep track of processed rings. 387 err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1]) 388 if err == errEOR { 389 // We've emptied the current ring buffer, process 390 // the next one. 391 pr.epollRings = pr.epollRings[:len(pr.epollRings)-1] 392 continue 393 } 394 395 return err 396 } 397 } 398 399 // Pause stops all notifications from this Reader. 400 // 401 // While the Reader is paused, any attempts to write to the event buffer from 402 // BPF programs will return -ENOENT. 403 // 404 // Subsequent calls to Read will block until a call to Resume. 405 func (pr *Reader) Pause() error { 406 pr.pauseMu.Lock() 407 defer pr.pauseMu.Unlock() 408 409 if pr.pauseFds == nil { 410 return fmt.Errorf("%w", ErrClosed) 411 } 412 413 for i := range pr.pauseFds { 414 if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) { 415 return fmt.Errorf("could't delete event fd for CPU %d: %w", i, err) 416 } 417 } 418 419 pr.paused = true 420 421 return nil 422 } 423 424 // Resume allows this perf reader to emit notifications. 425 // 426 // Subsequent calls to Read will block until the next event notification. 427 func (pr *Reader) Resume() error { 428 pr.pauseMu.Lock() 429 defer pr.pauseMu.Unlock() 430 431 if pr.pauseFds == nil { 432 return fmt.Errorf("%w", ErrClosed) 433 } 434 435 for i, fd := range pr.pauseFds { 436 if fd == -1 { 437 continue 438 } 439 440 if err := pr.array.Put(uint32(i), uint32(fd)); err != nil { 441 return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, i, err) 442 } 443 } 444 445 pr.paused = false 446 447 return nil 448 } 449 450 // BufferSize is the size in bytes of each per-CPU buffer 451 func (pr *Reader) BufferSize() int { 452 return pr.bufferSize 453 } 454 455 // NB: Has to be preceded by a call to ring.loadHead. 456 func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error { 457 defer ring.writeTail() 458 459 rec.CPU = ring.cpu 460 err := readRecord(ring, rec, pr.eventHeader, pr.overwritable) 461 if pr.overwritable && (errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)) { 462 return errEOR 463 } 464 rec.Remaining = ring.remaining() 465 return err 466 } 467 468 type unknownEventError struct { 469 eventType uint32 470 } 471 472 func (uev *unknownEventError) Error() string { 473 return fmt.Sprintf("unknown event type: %d", uev.eventType) 474 } 475 476 // IsUnknownEvent returns true if the error occurred 477 // because an unknown event was submitted to the perf event ring. 478 func IsUnknownEvent(err error) bool { 479 var uee *unknownEventError 480 return errors.As(err, &uee) 481 }