github.com/cilium/ebpf@v0.16.0/internal/epoll/poller.go (about) 1 package epoll 2 3 import ( 4 "errors" 5 "fmt" 6 "math" 7 "os" 8 "runtime" 9 "slices" 10 "sync" 11 "time" 12 13 "github.com/cilium/ebpf/internal" 14 "github.com/cilium/ebpf/internal/unix" 15 ) 16 17 var ErrFlushed = errors.New("data was flushed") 18 19 // Poller waits for readiness notifications from multiple file descriptors. 20 // 21 // The wait can be interrupted by calling Close. 22 type Poller struct { 23 // mutexes protect the fields declared below them. If you need to 24 // acquire both at once you must lock epollMu before eventMu. 25 epollMu sync.Mutex 26 epollFd int 27 28 eventMu sync.Mutex 29 closeEvent *eventFd 30 flushEvent *eventFd 31 } 32 33 func New() (_ *Poller, err error) { 34 closeFDOnError := func(fd int) { 35 if err != nil { 36 unix.Close(fd) 37 } 38 } 39 closeEventFDOnError := func(e *eventFd) { 40 if err != nil { 41 e.close() 42 } 43 } 44 45 epollFd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC) 46 if err != nil { 47 return nil, fmt.Errorf("create epoll fd: %v", err) 48 } 49 defer closeFDOnError(epollFd) 50 51 p := &Poller{epollFd: epollFd} 52 p.closeEvent, err = newEventFd() 53 if err != nil { 54 return nil, err 55 } 56 defer closeEventFDOnError(p.closeEvent) 57 58 p.flushEvent, err = newEventFd() 59 if err != nil { 60 return nil, err 61 } 62 defer closeEventFDOnError(p.flushEvent) 63 64 if err := p.Add(p.closeEvent.raw, 0); err != nil { 65 return nil, fmt.Errorf("add close eventfd: %w", err) 66 } 67 68 if err := p.Add(p.flushEvent.raw, 0); err != nil { 69 return nil, fmt.Errorf("add flush eventfd: %w", err) 70 } 71 72 runtime.SetFinalizer(p, (*Poller).Close) 73 return p, nil 74 } 75 76 // Close the poller. 77 // 78 // Interrupts any calls to Wait. Multiple calls to Close are valid, but subsequent 79 // calls will return os.ErrClosed. 80 func (p *Poller) Close() error { 81 runtime.SetFinalizer(p, nil) 82 83 // Interrupt Wait() via the closeEvent fd if it's currently blocked. 
84 if err := p.wakeWaitForClose(); err != nil { 85 return err 86 } 87 88 // Acquire the lock. This ensures that Wait isn't running. 89 p.epollMu.Lock() 90 defer p.epollMu.Unlock() 91 92 // Prevent other calls to Close(). 93 p.eventMu.Lock() 94 defer p.eventMu.Unlock() 95 96 if p.epollFd != -1 { 97 unix.Close(p.epollFd) 98 p.epollFd = -1 99 } 100 101 if p.closeEvent != nil { 102 p.closeEvent.close() 103 p.closeEvent = nil 104 } 105 106 if p.flushEvent != nil { 107 p.flushEvent.close() 108 p.flushEvent = nil 109 } 110 111 return nil 112 } 113 114 // Add an fd to the poller. 115 // 116 // id is returned by Wait in the unix.EpollEvent.Pad field any may be zero. It 117 // must not exceed math.MaxInt32. 118 // 119 // Add is blocked by Wait. 120 func (p *Poller) Add(fd int, id int) error { 121 if int64(id) > math.MaxInt32 { 122 return fmt.Errorf("unsupported id: %d", id) 123 } 124 125 p.epollMu.Lock() 126 defer p.epollMu.Unlock() 127 128 if p.epollFd == -1 { 129 return fmt.Errorf("epoll add: %w", os.ErrClosed) 130 } 131 132 // The representation of EpollEvent isn't entirely accurate. 133 // Pad is fully usable, not just padding. Hence we stuff the 134 // id in there, which allows us to identify the event later (e.g., 135 // in case of perf events, which CPU sent it). 136 event := unix.EpollEvent{ 137 Events: unix.EPOLLIN, 138 Fd: int32(fd), 139 Pad: int32(id), 140 } 141 142 if err := unix.EpollCtl(p.epollFd, unix.EPOLL_CTL_ADD, fd, &event); err != nil { 143 return fmt.Errorf("add fd to epoll: %v", err) 144 } 145 146 return nil 147 } 148 149 // Wait for events. 150 // 151 // Returns the number of pending events and any errors. 152 // 153 // - [os.ErrClosed] if interrupted by [Close]. 154 // - [ErrFlushed] if interrupted by [Flush]. 155 // - [os.ErrDeadlineExceeded] if deadline is reached. 
func (p *Poller) Wait(events []unix.EpollEvent, deadline time.Time) (int, error) {
	p.epollMu.Lock()
	defer p.epollMu.Unlock()

	if p.epollFd == -1 {
		return 0, fmt.Errorf("epoll wait: %w", os.ErrClosed)
	}

	for {
		// -1 makes epoll_wait block until an event or interruption occurs.
		timeout := int(-1)
		if !deadline.IsZero() {
			msec := time.Until(deadline).Milliseconds()
			// Deadline is in the past, don't block.
			msec = max(msec, 0)
			// Deadline is too far in the future.
			msec = min(msec, math.MaxInt)

			timeout = int(msec)
		}

		n, err := unix.EpollWait(p.epollFd, events, timeout)
		if temp, ok := err.(temporaryError); ok && temp.Temporary() {
			// Retry the syscall if we were interrupted, see https://github.com/golang/go/issues/20400
			continue
		}

		if err != nil {
			return 0, err
		}

		if n == 0 {
			// epoll_wait returned without any ready fds: the timeout expired.
			return 0, fmt.Errorf("epoll wait: %w", os.ErrDeadlineExceeded)
		}

		// Filter the poller's internal eventfds out of the results so that
		// callers only see their own fds. Note that this compacts the
		// caller-supplied events slice in place via slices.Delete.
		for i := 0; i < n; {
			event := events[i]
			if int(event.Fd) == p.closeEvent.raw {
				// Since we don't read p.closeEvent the event is never cleared and
				// we'll keep getting this wakeup until Close() acquires the
				// lock and sets p.epollFd = -1.
				return 0, fmt.Errorf("epoll wait: %w", os.ErrClosed)
			}
			if int(event.Fd) == p.flushEvent.raw {
				// read event to prevent it from continuing to wake
				p.flushEvent.read()
				err = ErrFlushed
				events = slices.Delete(events, i, i+1)
				n -= 1
				// Don't advance i: the next event now occupies index i.
				continue
			}
			i++
		}

		// err is nil unless a flush wakeup was consumed above, in which case
		// it is ErrFlushed and n counts the remaining real events.
		return n, err
	}
}

// temporaryError is satisfied by syscall errors that may succeed if retried,
// e.g. EINTR.
type temporaryError interface {
	Temporary() bool
}

// wakeWaitForClose unblocks Wait if it's blocked in epoll_wait.
218 func (p *Poller) wakeWaitForClose() error { 219 p.eventMu.Lock() 220 defer p.eventMu.Unlock() 221 222 if p.closeEvent == nil { 223 return fmt.Errorf("epoll wake: %w", os.ErrClosed) 224 } 225 226 return p.closeEvent.add(1) 227 } 228 229 // Flush unblocks Wait if it's epoll_wait, for purposes of reading pending samples 230 func (p *Poller) Flush() error { 231 p.eventMu.Lock() 232 defer p.eventMu.Unlock() 233 234 if p.flushEvent == nil { 235 return fmt.Errorf("epoll wake: %w", os.ErrClosed) 236 } 237 238 return p.flushEvent.add(1) 239 } 240 241 // eventFd wraps a Linux eventfd. 242 // 243 // An eventfd acts like a counter: writes add to the counter, reads retrieve 244 // the counter and reset it to zero. Reads also block if the counter is zero. 245 // 246 // See man 2 eventfd. 247 type eventFd struct { 248 file *os.File 249 // prefer raw over file.Fd(), since the latter puts the file into blocking 250 // mode. 251 raw int 252 } 253 254 func newEventFd() (*eventFd, error) { 255 fd, err := unix.Eventfd(0, unix.O_CLOEXEC|unix.O_NONBLOCK) 256 if err != nil { 257 return nil, err 258 } 259 file := os.NewFile(uintptr(fd), "event") 260 return &eventFd{file, fd}, nil 261 } 262 263 func (efd *eventFd) close() error { 264 return efd.file.Close() 265 } 266 267 func (efd *eventFd) add(n uint64) error { 268 var buf [8]byte 269 internal.NativeEndian.PutUint64(buf[:], n) 270 _, err := efd.file.Write(buf[:]) 271 return err 272 } 273 274 func (efd *eventFd) read() (uint64, error) { 275 var buf [8]byte 276 _, err := efd.file.Read(buf[:]) 277 return internal.NativeEndian.Uint64(buf[:]), err 278 }