github.com/cilium/ebpf@v0.15.0/perf/ring.go

package perf

import (
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"runtime"
	"sync/atomic"
	"unsafe"

	"github.com/cilium/ebpf/internal/unix"
)

// perfEventRing is a page of metadata followed by
// a variable number of pages which form a ring buffer.
type perfEventRing struct {
	fd   int
	cpu  int
	mmap []byte
	ringReader
}

func newPerfEventRing(cpu, perCPUBuffer int, opts ReaderOptions) (*perfEventRing, error) {
	if opts.Watermark >= perCPUBuffer {
		return nil, errors.New("watermark must be smaller than perCPUBuffer")
	}

	fd, err := createPerfEvent(cpu, opts)
	if err != nil {
		return nil, err
	}

	if err := unix.SetNonblock(fd, true); err != nil {
		unix.Close(fd)
		return nil, err
	}

	protections := unix.PROT_READ
	if !opts.Overwritable {
		protections |= unix.PROT_WRITE
	}

	mmap, err := unix.Mmap(fd, 0, perfBufferSize(perCPUBuffer), protections, unix.MAP_SHARED)
	if err != nil {
		unix.Close(fd)
		return nil, fmt.Errorf("can't mmap: %v", err)
	}

	// This relies on the fact that we allocate an extra metadata page,
	// and that the struct is smaller than an OS page.
	// This use of unsafe.Pointer isn't explicitly sanctioned by the
	// documentation, since a byte is smaller than sampledPerfEvent.
	meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&mmap[0]))

	var reader ringReader
	if opts.Overwritable {
		reader = newReverseReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	} else {
		reader = newForwardReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	}

	ring := &perfEventRing{
		fd:         fd,
		cpu:        cpu,
		mmap:       mmap,
		ringReader: reader,
	}
	runtime.SetFinalizer(ring, (*perfEventRing).Close)

	return ring, nil
}

// perfBufferSize returns a valid mmap buffer size for use with perf_event_open (1+2^n pages)
func perfBufferSize(perCPUBuffer int) int {
	pageSize := os.Getpagesize()

	// Smallest whole number of pages
	nPages := (perCPUBuffer + pageSize - 1) / pageSize

	// Round up to nearest power of two number of pages
	nPages = int(math.Pow(2, math.Ceil(math.Log2(float64(nPages)))))

	// Add one for metadata
	nPages += 1

	return nPages * pageSize
}
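
// As an illustrative example (assuming a 4 KiB page size): a perCPUBuffer of
// 8193 bytes needs three whole data pages, which rounds up to four, the next
// power of two; with the extra metadata page the mmap size is 5*4096 = 20480.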

func (ring *perfEventRing) Close() {
	runtime.SetFinalizer(ring, nil)

	_ = unix.Close(ring.fd)
	_ = unix.Munmap(ring.mmap)

	ring.fd = -1
	ring.mmap = nil
}

func createPerfEvent(cpu int, opts ReaderOptions) (int, error) {
	wakeup := 0
	bits := 0
	if opts.WakeupEvents > 0 {
		wakeup = opts.WakeupEvents
	} else {
		wakeup = opts.Watermark
		if wakeup == 0 {
			wakeup = 1
		}
		bits |= unix.PerfBitWatermark
	}

	if opts.Overwritable {
		bits |= unix.PerfBitWriteBackward
	}

	attr := unix.PerfEventAttr{
		Type:        unix.PERF_TYPE_SOFTWARE,
		Config:      unix.PERF_COUNT_SW_BPF_OUTPUT,
		Bits:        uint64(bits),
		Sample_type: unix.PERF_SAMPLE_RAW,
		Wakeup:      uint32(wakeup),
	}

	attr.Size = uint32(unsafe.Sizeof(attr))
	fd, err := unix.PerfEventOpen(&attr, -1, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC)
	if err != nil {
		return -1, fmt.Errorf("can't create perf event: %w", err)
	}
	return fd, nil
}

type ringReader interface {
	loadHead()
	size() int
	remaining() int
	writeTail()
	Read(p []byte) (int, error)
}

type forwardReader struct {
	meta       *unix.PerfEventMmapPage
	head, tail uint64
	mask       uint64
	ring       []byte
}

func newForwardReader(meta *unix.PerfEventMmapPage, ring []byte) *forwardReader {
	return &forwardReader{
		meta: meta,
		head: atomic.LoadUint64(&meta.Data_head),
		tail: atomic.LoadUint64(&meta.Data_tail),
		// cap is always a power of two
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
}

func (rr *forwardReader) loadHead() {
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
}

func (rr *forwardReader) size() int {
	return len(rr.ring)
}

func (rr *forwardReader) remaining() int {
	return int((rr.head - rr.tail) & rr.mask)
}

func (rr *forwardReader) writeTail() {
	// Commit the new tail. This lets the kernel know that
	// the ring buffer has been consumed.
	atomic.StoreUint64(&rr.meta.Data_tail, rr.tail)
}

func (rr *forwardReader) Read(p []byte) (int, error) {
	start := int(rr.tail & rr.mask)

	n := len(p)
	// Truncate if the read wraps in the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.head - rr.tail); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.tail += uint64(n)

	if rr.tail == rr.head {
		return n, io.EOF
	}

	return n, nil
}

type reverseReader struct {
	meta *unix.PerfEventMmapPage
	// head is the position where the kernel last wrote data.
	head uint64
	// read is the position we read the next data from. Updated as reads are made.
	read uint64
	// tail is the end of the ring buffer. No reads must be made past it.
	tail uint64
	mask uint64
	ring []byte
}

func newReverseReader(meta *unix.PerfEventMmapPage, ring []byte) *reverseReader {
	rr := &reverseReader{
		meta: meta,
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
	rr.loadHead()
	return rr
}

func (rr *reverseReader) loadHead() {
	// The diagram below represents an overwritable perf ring buffer:
	//
	//    head     read                           tail
	//      |        |                              |
	//      V        V                              V
	//  +---+--------+------------+---------+--------+
	//  |   |H-D....D|H-C........C|H-B.....B|H-A....A|
	//  +---+--------+------------+---------+--------+
	//  <--Write from right to left
	//                     Read from left to right-->
	//  (H means header)
	//
	// The buffer is read left to right, starting at head and ending at tail.
	// [head, read) is the already-read portion of the buffer, [read, tail) the unread one.
	// read is adjusted as we progress through the buffer.

	// Avoid reading sample D multiple times by discarding unread samples C, B, A.
	rr.tail = rr.head

	// Get the new head and start reading from it.
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
	rr.read = rr.head

	if rr.tail-rr.head > uint64(cap(rr.ring)) {
		// The ring has been fully written; only permit at most cap(rr.ring)
		// bytes to be read.
		rr.tail = rr.head + uint64(cap(rr.ring))
	}
}
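
// A worked example of the clamp at the end of loadHead: rr.tail-rr.head is the
// number of bytes the kernel wrote since the previous loadHead. If 6000 bytes
// were written into a 4096-byte ring, the oldest records have been overwritten
// and only the newest 4096 bytes, starting at the new head, are still readable.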

func (rr *reverseReader) size() int {
	return len(rr.ring)
}

func (rr *reverseReader) remaining() int {
	// The amount of remaining data is inaccurate for overwritable buffers
	// once an overwrite happens, so return -1 here.
	return -1
}

func (rr *reverseReader) writeTail() {
	// We do not care about the tail for an overwritable perf buffer,
	// so this function is a no-op.
}

func (rr *reverseReader) Read(p []byte) (int, error) {
	start := int(rr.read & rr.mask)

	n := len(p)
	// Truncate if the read wraps in the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.tail - rr.read); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.read += uint64(n)

	if rr.read == rr.tail {
		return n, io.EOF
	}

	return n, nil
}
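
// drainRing is a minimal illustrative sketch (the helper and its name are not
// part of the upstream API) of how the ringReader contract above fits
// together, assuming the caller handles polling and wakeups elsewhere: refresh
// the view of the kernel's write position, copy out whatever is available,
// then commit the new tail so the kernel may reuse the space.
func drainRing(rr ringReader, buf []byte) (int, error) {
	// Pick up the kernel's latest write position. For the overwritable
	// (reverse) reader this also discards anything left unread from the
	// previous batch.
	rr.loadHead()

	total := 0
	for total < len(buf) {
		n, err := rr.Read(buf[total:])
		total += n
		if err == io.EOF {
			// Everything up to the current head has been consumed.
			break
		}
		if err != nil {
			rr.writeTail()
			return total, err
		}
	}

	// Commit the new tail (a no-op for the overwritable reader).
	rr.writeTail()
	return total, nil
}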