github.com/cilium/ebpf@v0.15.1-0.20240517100537-8079b37aa138/perf/ring.go

package perf

import (
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"runtime"
	"sync/atomic"
	"unsafe"

	"github.com/cilium/ebpf/internal/sys"
	"github.com/cilium/ebpf/internal/unix"
)

// perfEventRing is a page of metadata followed by
// a variable number of pages which form a ring buffer.
type perfEventRing struct {
	cpu  int
	mmap []byte
	ringReader
}

func newPerfEventRing(cpu, perCPUBuffer int, opts ReaderOptions) (_ *sys.FD, _ *perfEventRing, err error) {
	closeOnError := func(c io.Closer) {
		if err != nil {
			c.Close()
		}
	}

	if opts.Watermark >= perCPUBuffer {
		return nil, nil, errors.New("watermark must be smaller than perCPUBuffer")
	}

	fd, err := createPerfEvent(cpu, opts)
	if err != nil {
		return nil, nil, err
	}
	defer closeOnError(fd)

	if err := unix.SetNonblock(fd.Int(), true); err != nil {
		return nil, nil, err
	}

	protections := unix.PROT_READ
	if !opts.Overwritable {
		protections |= unix.PROT_WRITE
	}

	mmap, err := unix.Mmap(fd.Int(), 0, perfBufferSize(perCPUBuffer), protections, unix.MAP_SHARED)
	if err != nil {
		return nil, nil, fmt.Errorf("can't mmap: %v", err)
	}

	// This relies on the fact that we allocate an extra metadata page,
	// and that the struct is smaller than an OS page.
	// This use of unsafe.Pointer isn't explicitly sanctioned by the
	// documentation, since a byte is smaller than sampledPerfEvent.
	meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&mmap[0]))

	var reader ringReader
	if opts.Overwritable {
		reader = newReverseReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	} else {
		reader = newForwardReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	}

	ring := &perfEventRing{
		cpu:        cpu,
		mmap:       mmap,
		ringReader: reader,
	}
	runtime.SetFinalizer(ring, (*perfEventRing).Close)

	return fd, ring, nil
}

// perfBufferSize returns a valid mmap buffer size for use with perf_event_open (1+2^n pages)
func perfBufferSize(perCPUBuffer int) int {
	pageSize := os.Getpagesize()

	// Smallest whole number of pages
	nPages := (perCPUBuffer + pageSize - 1) / pageSize

	// Round up to nearest power of two number of pages
	nPages = int(math.Pow(2, math.Ceil(math.Log2(float64(nPages)))))

	// Add one for metadata
	nPages += 1

	return nPages * pageSize
}

func (ring *perfEventRing) Close() error {
	runtime.SetFinalizer(ring, nil)
	mmap := ring.mmap
	ring.mmap = nil
	return unix.Munmap(mmap)
}

func createPerfEvent(cpu int, opts ReaderOptions) (*sys.FD, error) {
	wakeup := 0
	bits := 0
	if opts.WakeupEvents > 0 {
		wakeup = opts.WakeupEvents
	} else {
		wakeup = opts.Watermark
		if wakeup == 0 {
			wakeup = 1
		}
		bits |= unix.PerfBitWatermark
	}

	if opts.Overwritable {
		bits |= unix.PerfBitWriteBackward
	}

	attr := unix.PerfEventAttr{
		Type:        unix.PERF_TYPE_SOFTWARE,
		Config:      unix.PERF_COUNT_SW_BPF_OUTPUT,
		Bits:        uint64(bits),
		Sample_type: unix.PERF_SAMPLE_RAW,
		Wakeup:      uint32(wakeup),
	}

	attr.Size = uint32(unsafe.Sizeof(attr))
	fd, err := unix.PerfEventOpen(&attr, -1, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC)
	if err != nil {
		return nil, fmt.Errorf("can't create perf event: %w", err)
	}
	return sys.NewFD(fd)
}
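
// As a worked example of the perfBufferSize arithmetic above, assuming a
// 4 KiB page size (a sketch, not part of the original source):
//
//	perfBufferSize(12000)
//	// (12000 + 4095) / 4096        -> 3 pages
//	// rounded up to a power of two -> 4 pages
//	// plus one page for metadata   -> 5 pages
//	// 5 * 4096                     -> 20480 bytes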

type ringReader interface {
	loadHead()
	size() int
	remaining() int
	writeTail()
	Read(p []byte) (int, error)
}

type forwardReader struct {
	meta       *unix.PerfEventMmapPage
	head, tail uint64
	mask       uint64
	ring       []byte
}

func newForwardReader(meta *unix.PerfEventMmapPage, ring []byte) *forwardReader {
	return &forwardReader{
		meta: meta,
		head: atomic.LoadUint64(&meta.Data_head),
		tail: atomic.LoadUint64(&meta.Data_tail),
		// cap is always a power of two
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
}

func (rr *forwardReader) loadHead() {
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
}

func (rr *forwardReader) size() int {
	return len(rr.ring)
}

func (rr *forwardReader) remaining() int {
	return int((rr.head - rr.tail) & rr.mask)
}

func (rr *forwardReader) writeTail() {
	// Commit the new tail. This lets the kernel know that
	// the ring buffer has been consumed.
	atomic.StoreUint64(&rr.meta.Data_tail, rr.tail)
}

func (rr *forwardReader) Read(p []byte) (int, error) {
	start := int(rr.tail & rr.mask)

	n := len(p)
	// Truncate if the read wraps in the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.head - rr.tail); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.tail += uint64(n)

	if rr.tail == rr.head {
		return n, io.EOF
	}

	return n, nil
}
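
// A minimal consumption sketch for the forward reader, assuming rr was built
// by newForwardReader and buf was allocated by the caller (illustrative only,
// not part of the original source):
//
//	rr.loadHead() // observe the kernel's producer position
//	for {
//		_, err := rr.Read(buf) // drain [tail, head)
//		if err == io.EOF {
//			break // caught up with the kernel
//		}
//	}
//	rr.writeTail() // publish the consumer position back to the kernel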

type reverseReader struct {
	meta *unix.PerfEventMmapPage
	// head is the position where the kernel last wrote data.
	head uint64
	// read is the position we read the next data from. Updated as reads are made.
	read uint64
	// tail is the end of the ring buffer. No reads must be made past it.
	tail uint64
	mask uint64
	ring []byte
}

func newReverseReader(meta *unix.PerfEventMmapPage, ring []byte) *reverseReader {
	rr := &reverseReader{
		meta: meta,
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
	rr.loadHead()
	return rr
}

func (rr *reverseReader) loadHead() {
	// The diagram below represents an overwritable perf ring buffer:
	//
	//   head     read                            tail
	//     |        |                               |
	//     V        V                               V
	// +---+--------+------------+---------+--------+
	// |   |H-D....D|H-C........C|H-B.....B|H-A....A|
	// +---+--------+------------+---------+--------+
	// <--Write from right to left
	//                     Read from left to right-->
	// (H means header)
	//
	// The buffer is read left to right beginning from head to tail.
	// [head, read) is the read portion of the buffer, [read, tail) the unread one.
	// read is adjusted as we progress through the buffer.

	// Avoid reading sample D multiple times by discarding unread samples C, B, A.
	rr.tail = rr.head

	// Get the new head and start reading from it.
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
	rr.read = rr.head

	if rr.tail-rr.head > uint64(cap(rr.ring)) {
		// The ring has been fully written, so only permit at most
		// cap(rr.ring) bytes to be read.
		rr.tail = rr.head + uint64(cap(rr.ring))
	}
}

func (rr *reverseReader) size() int {
	return len(rr.ring)
}

func (rr *reverseReader) remaining() int {
	// The remaining data is inaccurate for overwritable buffers
	// once an overwrite happens, so return -1 here.
	return -1
}

func (rr *reverseReader) writeTail() {
	// We do not care about the tail for an overwritable perf buffer,
	// so this function is a no-op.
}

func (rr *reverseReader) Read(p []byte) (int, error) {
	start := int(rr.read & rr.mask)

	n := len(p)
	// Truncate if the read wraps in the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.tail - rr.read); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.read += uint64(n)

	if rr.read == rr.tail {
		return n, io.EOF
	}

	return n, nil
}
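
// A sketch of how the reverse reader is driven for an overwritable buffer,
// assuming rr was built by newReverseReader (illustrative only, not part of
// the original source):
//
//	rr.loadHead() // snapshot [head, tail): newest sample first
//	for {
//		_, err := rr.Read(buf)
//		if err == io.EOF {
//			break // snapshot exhausted
//		}
//	}
//	rr.writeTail() // a no-op for overwritable buffers
//
// If the kernel wrote more than cap(rr.ring) bytes since the previous
// snapshot, loadHead clamps tail to head + cap(rr.ring), so at most one full
// buffer of the newest data is readable.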