github.com/cilium/ebpf@v0.10.0/perf/reader.go

package perf

import (
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"runtime"
	"sync"
	"time"

	"github.com/cilium/ebpf"
	"github.com/cilium/ebpf/internal"
	"github.com/cilium/ebpf/internal/epoll"
	"github.com/cilium/ebpf/internal/unix"
)

var (
	ErrClosed = os.ErrClosed
	errEOR    = errors.New("end of ring")
)

var perfEventHeaderSize = binary.Size(perfEventHeader{})

// perfEventHeader must match `struct perf_event_header` in <linux/perf_event.h>.
type perfEventHeader struct {
	Type uint32
	Misc uint16
	Size uint16
}

func cpuForEvent(event *unix.EpollEvent) int {
	return int(event.Pad)
}

// Record contains either a sample or a counter of the
// number of lost samples.
type Record struct {
	// The CPU this record was generated on.
	CPU int

	// The data submitted via bpf_perf_event_output.
	// Due to a kernel bug, this can contain between 0 and 7 bytes of trailing
	// garbage from the ring depending on the input sample's length.
	RawSample []byte

	// The number of samples which could not be output, since
	// the ring buffer was full.
	LostSamples uint64
}

// Read a record from a reader and tag it as being from the given CPU.
//
// buf must be at least perfEventHeaderSize bytes long.
func readRecord(rd io.Reader, rec *Record, buf []byte) error {
	// Assert that the buffer is large enough.
	buf = buf[:perfEventHeaderSize]
	_, err := io.ReadFull(rd, buf)
	if errors.Is(err, io.EOF) {
		return errEOR
	} else if err != nil {
		return fmt.Errorf("read perf event header: %v", err)
	}

	header := perfEventHeader{
		internal.NativeEndian.Uint32(buf[0:4]),
		internal.NativeEndian.Uint16(buf[4:6]),
		internal.NativeEndian.Uint16(buf[6:8]),
	}

	switch header.Type {
	case unix.PERF_RECORD_LOST:
		rec.RawSample = rec.RawSample[:0]
		rec.LostSamples, err = readLostRecords(rd)
		return err

	case unix.PERF_RECORD_SAMPLE:
		rec.LostSamples = 0
		// We can reuse buf here because perfEventHeaderSize > perfEventSampleSize.
		rec.RawSample, err = readRawSample(rd, buf, rec.RawSample)
		return err

	default:
		return &unknownEventError{header.Type}
	}
}

func readLostRecords(rd io.Reader) (uint64, error) {
	// lostHeader must match `struct perf_event_lost` in kernel sources.
	var lostHeader struct {
		ID   uint64
		Lost uint64
	}

	err := binary.Read(rd, internal.NativeEndian, &lostHeader)
	if err != nil {
		return 0, fmt.Errorf("can't read lost records header: %v", err)
	}

	return lostHeader.Lost, nil
}
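
// The parsing above can be exercised against a synthetic byte stream. A
// minimal sketch, not part of the package (values are illustrative; real
// records come from the mmap'd ring rather than a bytes.Buffer):
//
//	var stream bytes.Buffer
//	payload := []byte{1, 2, 3, 4}
//
//	// struct perf_event_header: Type, Misc, Size.
//	binary.Write(&stream, internal.NativeEndian, perfEventHeader{
//		Type: unix.PERF_RECORD_SAMPLE,
//		Size: uint16(perfEventHeaderSize + perfEventSampleSize + len(payload)),
//	})
//	// perfEventSample.Size, followed by the sample bytes themselves.
//	binary.Write(&stream, internal.NativeEndian, uint32(len(payload)))
//	stream.Write(payload)
//
//	var rec Record
//	err := readRecord(&stream, &rec, make([]byte, perfEventHeaderSize))
//	// err == nil and rec.RawSample now equals payload.
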
var perfEventSampleSize = binary.Size(uint32(0))

// This must match `struct perf_event_sample` in kernel sources.
type perfEventSample struct {
	Size uint32
}

func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) {
	buf = buf[:perfEventSampleSize]
	if _, err := io.ReadFull(rd, buf); err != nil {
		return nil, fmt.Errorf("read sample size: %v", err)
	}

	sample := perfEventSample{
		internal.NativeEndian.Uint32(buf),
	}

	var data []byte
	if size := int(sample.Size); cap(sampleBuf) < size {
		data = make([]byte, size)
	} else {
		data = sampleBuf[:size]
	}

	if _, err := io.ReadFull(rd, data); err != nil {
		return nil, fmt.Errorf("read sample: %v", err)
	}
	return data, nil
}

// Reader allows reading bpf_perf_event_output
// from user space.
type Reader struct {
	poller   *epoll.Poller
	deadline time.Time

	// mu protects read/write access to the Reader structure with the
	// exception of 'pauseFds', which is protected by 'pauseMu'.
	// If locking both 'mu' and 'pauseMu', 'mu' must be locked first.
	mu sync.Mutex

	// Closing a PERF_EVENT_ARRAY removes all event fds
	// stored in it, so we keep a reference alive.
	array       *ebpf.Map
	rings       []*perfEventRing
	epollEvents []unix.EpollEvent
	epollRings  []*perfEventRing
	eventHeader []byte

	// pauseFds are a copy of the fds in 'rings', protected by 'pauseMu'.
	// These allow Pause/Resume to be executed independently of any ongoing
	// Read calls, which would otherwise need to be interrupted.
	pauseMu  sync.Mutex
	pauseFds []int
}

// ReaderOptions control the behaviour of the user
// space reader.
type ReaderOptions struct {
	// The number of written bytes required in any per CPU buffer before
	// Read will process data. Must be smaller than PerCPUBuffer.
	// The default is to start processing as soon as data is available.
	Watermark int
}

// NewReader creates a new reader with default options.
//
// array must be a PerfEventArray. perCPUBuffer gives the size of the
// per CPU buffer in bytes. It is rounded up to the nearest multiple
// of the current page size.
func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) {
	return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{})
}
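
// A minimal consumer sketch, as seen from outside this package ("events"
// is a placeholder for a PerfEventArray map set up by the caller, and
// "process" for whatever handles a sample):
//
//	rd, err := perf.NewReader(events, os.Getpagesize())
//	if err != nil {
//		return err
//	}
//	defer rd.Close()
//
//	for {
//		record, err := rd.Read()
//		if errors.Is(err, perf.ErrClosed) {
//			return nil // the reader was closed elsewhere
//		}
//		if err != nil {
//			return err
//		}
//		if record.LostSamples > 0 {
//			// The ring for record.CPU overflowed; samples were dropped.
//			continue
//		}
//		process(record.RawSample)
//	}
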
// NewReaderWithOptions creates a new reader with the given options.
func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) {
	if perCPUBuffer < 1 {
		return nil, errors.New("perCPUBuffer must be larger than 0")
	}

	var (
		fds      []int
		nCPU     = int(array.MaxEntries())
		rings    = make([]*perfEventRing, 0, nCPU)
		pauseFds = make([]int, 0, nCPU)
	)

	poller, err := epoll.New()
	if err != nil {
		return nil, err
	}

	defer func() {
		if err != nil {
			poller.Close()
			for _, fd := range fds {
				unix.Close(fd)
			}
			for _, ring := range rings {
				if ring != nil {
					ring.Close()
				}
			}
		}
	}()

	// bpf_perf_event_output checks which CPU an event is enabled on,
	// but doesn't allow using a wildcard like -1 to specify "all CPUs".
	// Hence we have to create a ring for each CPU.
	for i := 0; i < nCPU; i++ {
		ring, err := newPerfEventRing(i, perCPUBuffer, opts.Watermark)
		if errors.Is(err, unix.ENODEV) {
			// The requested CPU is currently offline, skip it.
			rings = append(rings, nil)
			pauseFds = append(pauseFds, -1)
			continue
		}

		if err != nil {
			return nil, fmt.Errorf("failed to create perf ring for CPU %d: %v", i, err)
		}
		rings = append(rings, ring)
		pauseFds = append(pauseFds, ring.fd)

		if err := poller.Add(ring.fd, i); err != nil {
			return nil, err
		}
	}

	array, err = array.Clone()
	if err != nil {
		return nil, err
	}

	pr = &Reader{
		array:       array,
		rings:       rings,
		poller:      poller,
		deadline:    time.Time{},
		epollEvents: make([]unix.EpollEvent, len(rings)),
		epollRings:  make([]*perfEventRing, 0, len(rings)),
		eventHeader: make([]byte, perfEventHeaderSize),
		pauseFds:    pauseFds,
	}
	if err = pr.Resume(); err != nil {
		return nil, err
	}
	runtime.SetFinalizer(pr, (*Reader).Close)
	return pr, nil
}

// Close frees resources used by the reader.
//
// It interrupts calls to Read.
//
// Calls to perf_event_output from eBPF programs will return
// ENOENT after calling this method.
func (pr *Reader) Close() error {
	if err := pr.poller.Close(); err != nil {
		if errors.Is(err, os.ErrClosed) {
			return nil
		}
		return fmt.Errorf("close poller: %w", err)
	}

	// Trying to poll will now fail, so Read() can't block anymore. Acquire the
	// lock so that we can clean up.
	pr.mu.Lock()
	defer pr.mu.Unlock()

	for _, ring := range pr.rings {
		if ring != nil {
			ring.Close()
		}
	}
	pr.rings = nil
	pr.pauseFds = nil
	pr.array.Close()

	return nil
}

// SetDeadline controls how long Read and ReadInto will block waiting for samples.
//
// Passing a zero time.Time will remove the deadline.
func (pr *Reader) SetDeadline(t time.Time) {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	pr.deadline = t
}

// Read the next record from the perf ring buffer.
//
// The function blocks until there are at least Watermark bytes in one
// of the per CPU buffers. Records from buffers below the Watermark
// are not returned.
//
// Records can contain between 0 and 7 bytes of trailing garbage from the ring
// depending on the input sample's length.
//
// Calling Close interrupts the function.
//
// Returns os.ErrDeadlineExceeded if a deadline was set.
func (pr *Reader) Read() (Record, error) {
	var r Record
	return r, pr.ReadInto(&r)
}
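
// A polling sketch that combines SetDeadline with the os.ErrDeadlineExceeded
// return value to re-check a stop condition between wakeups ("stop" is a
// placeholder channel). ReadInto is used so rec and its buffers are reused
// across iterations:
//
//	var rec perf.Record
//	for {
//		select {
//		case <-stop:
//			return nil
//		default:
//		}
//
//		rd.SetDeadline(time.Now().Add(time.Second))
//		err := rd.ReadInto(&rec)
//		if errors.Is(err, os.ErrDeadlineExceeded) {
//			continue // nothing arrived in time, re-check stop
//		}
//		if err != nil {
//			return err
//		}
//		// handle rec ...
//	}
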
// ReadInto is like Read except that it allows reusing Record and associated buffers.
func (pr *Reader) ReadInto(rec *Record) error {
	pr.mu.Lock()
	defer pr.mu.Unlock()

	if pr.rings == nil {
		return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
	}

	for {
		if len(pr.epollRings) == 0 {
			nEvents, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
			if err != nil {
				return err
			}

			for _, event := range pr.epollEvents[:nEvents] {
				ring := pr.rings[cpuForEvent(&event)]
				pr.epollRings = append(pr.epollRings, ring)

				// Read the current head pointer now, not every time
				// we read a record. This prevents a single fast producer
				// from keeping the reader busy.
				ring.loadHead()
			}
		}

		// Start at the last available event. The order in which we
		// process them doesn't matter, and starting at the back allows
		// resizing epollRings to keep track of processed rings.
		err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1])
		if err == errEOR {
			// We've emptied the current ring buffer, process
			// the next one.
			pr.epollRings = pr.epollRings[:len(pr.epollRings)-1]
			continue
		}

		return err
	}
}

// Pause stops all notifications from this Reader.
//
// While the Reader is paused, any attempts to write to the event buffer from
// BPF programs will return -ENOENT.
//
// Subsequent calls to Read will block until a call to Resume.
func (pr *Reader) Pause() error {
	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	if pr.pauseFds == nil {
		return fmt.Errorf("%w", ErrClosed)
	}

	for i := range pr.pauseFds {
		if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) {
			return fmt.Errorf("couldn't delete event fd for CPU %d: %w", i, err)
		}
	}

	return nil
}

// Resume allows this perf reader to emit notifications.
//
// Subsequent calls to Read will block until the next event notification.
func (pr *Reader) Resume() error {
	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	if pr.pauseFds == nil {
		return fmt.Errorf("%w", ErrClosed)
	}

	for i, fd := range pr.pauseFds {
		if fd == -1 {
			continue
		}

		if err := pr.array.Put(uint32(i), uint32(fd)); err != nil {
			return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, i, err)
		}
	}

	return nil
}

// NB: Has to be preceded by a call to ring.loadHead.
func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error {
	defer ring.writeTail()

	rec.CPU = ring.cpu
	return readRecord(ring, rec, pr.eventHeader)
}

type unknownEventError struct {
	eventType uint32
}

func (uev *unknownEventError) Error() string {
	return fmt.Sprintf("unknown event type: %d", uev.eventType)
}

// IsUnknownEvent returns true if the error occurred
// because an unknown event was submitted to the perf event ring.
func IsUnknownEvent(err error) bool {
	var uee *unknownEventError
	return errors.As(err, &uee)
}
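
// A Pause/Resume sketch (rd as returned by NewReader; error handling
// elided for brevity):
//
//	// Remove the per-CPU event fds from the array: bpf_perf_event_output
//	// starts failing with -ENOENT and Read blocks.
//	_ = rd.Pause()
//
//	// ... quiesce or reconfigure the consumer ...
//
//	// Reinstall the fds; Read wakes up on the next notification.
//	_ = rd.Resume()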