github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/vfs/epoll.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package vfs 16 17 import ( 18 "github.com/SagerNet/gvisor/pkg/abi/linux" 19 "github.com/SagerNet/gvisor/pkg/context" 20 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 21 "github.com/SagerNet/gvisor/pkg/sync" 22 "github.com/SagerNet/gvisor/pkg/syserror" 23 "github.com/SagerNet/gvisor/pkg/waiter" 24 ) 25 26 // epollCycleMu serializes attempts to register EpollInstances with other 27 // EpollInstances in order to check for cycles. 28 var epollCycleMu sync.Mutex 29 30 // EpollInstance represents an epoll instance, as described by epoll(7). 31 // 32 // +stateify savable 33 type EpollInstance struct { 34 vfsfd FileDescription 35 FileDescriptionDefaultImpl 36 DentryMetadataFileDescriptionImpl 37 NoLockFD 38 39 // q holds waiters on this EpollInstance. 40 q waiter.Queue 41 42 // interest is the set of file descriptors that are registered with the 43 // EpollInstance for monitoring. interest is protected by interestMu. 44 interestMu sync.Mutex `state:"nosave"` 45 interest map[epollInterestKey]*epollInterest 46 47 // mu protects fields in registered epollInterests. 48 mu sync.Mutex `state:"nosave"` 49 50 // ready is the set of file descriptors that may be "ready" for I/O. Note 51 // that this must be an ordered list, not a map: "If more than maxevents 52 // file descriptors are ready when epoll_wait() is called, then successive 53 // epoll_wait() calls will round robin through the set of ready file 54 // descriptors. This behavior helps avoid starvation scenarios, where a 55 // process fails to notice that additional file descriptors are ready 56 // because it focuses on a set of file descriptors that are already known 57 // to be ready." - epoll_wait(2) 58 ready epollInterestList 59 } 60 61 // +stateify savable 62 type epollInterestKey struct { 63 // file is the registered FileDescription. No reference is held on file; 64 // instead, when the last reference is dropped, FileDescription.DecRef() 65 // removes the FileDescription from all EpollInstances. file is immutable. 66 file *FileDescription 67 68 // num is the file descriptor number with which this entry was registered. 69 // num is immutable. 70 num int32 71 } 72 73 // epollInterest represents an EpollInstance's interest in a file descriptor. 74 // 75 // +stateify savable 76 type epollInterest struct { 77 // epoll is the owning EpollInstance. epoll is immutable. 78 epoll *EpollInstance `state:"wait"` 79 80 // key is the file to which this epollInterest applies. key is immutable. 81 key epollInterestKey 82 83 // waiter is registered with key.file. entry is protected by epoll.mu. 84 waiter waiter.Entry 85 86 // mask is the event mask associated with this registration, including 87 // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu. 88 mask uint32 89 90 // ready is true if epollInterestEntry is linked into epoll.ready. ready 91 // and epollInterestEntry are protected by epoll.mu. 92 ready bool 93 epollInterestEntry 94 95 // userData is the struct epoll_event::data associated with this 96 // epollInterest. userData is protected by epoll.mu. 97 userData [2]int32 98 } 99 100 // NewEpollInstanceFD returns a FileDescription representing a new epoll 101 // instance. A reference is taken on the returned FileDescription. 102 func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) { 103 vd := vfs.NewAnonVirtualDentry("[eventpoll]") 104 defer vd.DecRef(ctx) 105 ep := &EpollInstance{ 106 interest: make(map[epollInterestKey]*epollInterest), 107 } 108 if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ 109 DenyPRead: true, 110 DenyPWrite: true, 111 UseDentryMetadata: true, 112 }); err != nil { 113 return nil, err 114 } 115 return &ep.vfsfd, nil 116 } 117 118 // Release implements FileDescriptionImpl.Release. 119 func (ep *EpollInstance) Release(ctx context.Context) { 120 // Unregister all polled fds. 121 ep.interestMu.Lock() 122 defer ep.interestMu.Unlock() 123 for key, epi := range ep.interest { 124 file := key.file 125 file.epollMu.Lock() 126 delete(file.epolls, epi) 127 file.epollMu.Unlock() 128 file.EventUnregister(&epi.waiter) 129 } 130 ep.interest = nil 131 } 132 133 // Readiness implements waiter.Waitable.Readiness. 134 func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask { 135 if mask&waiter.ReadableEvents == 0 { 136 return 0 137 } 138 ep.mu.Lock() 139 for epi := ep.ready.Front(); epi != nil; epi = epi.Next() { 140 wmask := waiter.EventMaskFromLinux(epi.mask) 141 if epi.key.file.Readiness(wmask)&wmask != 0 { 142 ep.mu.Unlock() 143 return waiter.ReadableEvents 144 } 145 } 146 ep.mu.Unlock() 147 return 0 148 } 149 150 // EventRegister implements waiter.Waitable.EventRegister. 151 func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) { 152 ep.q.EventRegister(e, mask) 153 } 154 155 // EventUnregister implements waiter.Waitable.EventUnregister. 156 func (ep *EpollInstance) EventUnregister(e *waiter.Entry) { 157 ep.q.EventUnregister(e) 158 } 159 160 // Seek implements FileDescriptionImpl.Seek. 161 func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 162 // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek 163 return 0, nil 164 } 165 166 // AddInterest implements the semantics of EPOLL_CTL_ADD. 167 // 168 // Preconditions: A reference must be held on file. 169 func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error { 170 // Check for cyclic polling if necessary. 171 subep, _ := file.impl.(*EpollInstance) 172 if subep != nil { 173 epollCycleMu.Lock() 174 // epollCycleMu must be locked for the rest of AddInterest to ensure 175 // that cyclic polling is not introduced after the check. 176 defer epollCycleMu.Unlock() 177 if subep.mightPoll(ep) { 178 return linuxerr.ELOOP 179 } 180 } 181 182 ep.interestMu.Lock() 183 defer ep.interestMu.Unlock() 184 185 // Fail if the key is already registered. 186 key := epollInterestKey{ 187 file: file, 188 num: num, 189 } 190 if _, ok := ep.interest[key]; ok { 191 return syserror.EEXIST 192 } 193 194 // Register interest in file. 195 mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP 196 epi := &epollInterest{ 197 epoll: ep, 198 key: key, 199 mask: mask, 200 userData: event.Data, 201 } 202 epi.waiter.Callback = epi 203 ep.interest[key] = epi 204 wmask := waiter.EventMaskFromLinux(mask) 205 file.EventRegister(&epi.waiter, wmask) 206 207 // Check if the file is already ready. 208 if m := file.Readiness(wmask) & wmask; m != 0 { 209 epi.Callback(nil, m) 210 } 211 212 // Add epi to file.epolls so that it is removed when the last 213 // FileDescription reference is dropped. 214 file.epollMu.Lock() 215 if file.epolls == nil { 216 file.epolls = make(map[*epollInterest]struct{}) 217 } 218 file.epolls[epi] = struct{}{} 219 file.epollMu.Unlock() 220 221 return nil 222 } 223 224 func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool { 225 return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS 226 } 227 228 func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool { 229 ep.interestMu.Lock() 230 defer ep.interestMu.Unlock() 231 for key := range ep.interest { 232 nextep, ok := key.file.impl.(*EpollInstance) 233 if !ok { 234 continue 235 } 236 if nextep == ep2 { 237 return true 238 } 239 if remainingRecursion == 0 { 240 return true 241 } 242 if nextep.mightPollRecursive(ep2, remainingRecursion-1) { 243 return true 244 } 245 } 246 return false 247 } 248 249 // ModifyInterest implements the semantics of EPOLL_CTL_MOD. 250 // 251 // Preconditions: A reference must be held on file. 252 func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error { 253 ep.interestMu.Lock() 254 defer ep.interestMu.Unlock() 255 256 // Fail if the key is not already registered. 257 epi, ok := ep.interest[epollInterestKey{ 258 file: file, 259 num: num, 260 }] 261 if !ok { 262 return syserror.ENOENT 263 } 264 265 // Update epi for the next call to ep.ReadEvents(). 266 mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP 267 ep.mu.Lock() 268 epi.mask = mask 269 epi.userData = event.Data 270 ep.mu.Unlock() 271 272 // Re-register with the new mask. 273 file.EventUnregister(&epi.waiter) 274 wmask := waiter.EventMaskFromLinux(mask) 275 file.EventRegister(&epi.waiter, wmask) 276 277 // Check if the file is already ready with the new mask. 278 if m := file.Readiness(wmask) & wmask; m != 0 { 279 epi.Callback(nil, m) 280 } 281 282 return nil 283 } 284 285 // DeleteInterest implements the semantics of EPOLL_CTL_DEL. 286 // 287 // Preconditions: A reference must be held on file. 288 func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error { 289 ep.interestMu.Lock() 290 defer ep.interestMu.Unlock() 291 292 // Fail if the key is not already registered. 293 epi, ok := ep.interest[epollInterestKey{ 294 file: file, 295 num: num, 296 }] 297 if !ok { 298 return syserror.ENOENT 299 } 300 301 // Unregister from the file so that epi will no longer be readied. 302 file.EventUnregister(&epi.waiter) 303 304 // Forget about epi. 305 ep.removeLocked(epi) 306 307 file.epollMu.Lock() 308 delete(file.epolls, epi) 309 file.epollMu.Unlock() 310 311 return nil 312 } 313 314 // Callback implements waiter.EntryCallback.Callback. 315 func (epi *epollInterest) Callback(*waiter.Entry, waiter.EventMask) { 316 newReady := false 317 epi.epoll.mu.Lock() 318 if !epi.ready { 319 newReady = true 320 epi.ready = true 321 epi.epoll.ready.PushBack(epi) 322 } 323 epi.epoll.mu.Unlock() 324 if newReady { 325 epi.epoll.q.Notify(waiter.ReadableEvents) 326 } 327 } 328 329 // Preconditions: ep.interestMu must be locked. 330 func (ep *EpollInstance) removeLocked(epi *epollInterest) { 331 delete(ep.interest, epi.key) 332 ep.mu.Lock() 333 if epi.ready { 334 epi.ready = false 335 ep.ready.Remove(epi) 336 } 337 ep.mu.Unlock() 338 } 339 340 // ReadEvents appends up to maxReady events to events and returns the updated 341 // slice of events. 342 func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent { 343 i := 0 344 // Hot path: avoid defer. 345 ep.mu.Lock() 346 var next *epollInterest 347 var requeue epollInterestList 348 for epi := ep.ready.Front(); epi != nil; epi = next { 349 next = epi.Next() 350 // Regardless of what else happens, epi is initially removed from the 351 // ready list. 352 ep.ready.Remove(epi) 353 wmask := waiter.EventMaskFromLinux(epi.mask) 354 ievents := epi.key.file.Readiness(wmask) & wmask 355 if ievents == 0 { 356 // Leave epi off the ready list. 357 epi.ready = false 358 continue 359 } 360 // Determine what we should do with epi. 361 switch { 362 case epi.mask&linux.EPOLLONESHOT != 0: 363 // Clear all events from the mask; they must be re-added by 364 // EPOLL_CTL_MOD. 365 epi.mask &= linux.EP_PRIVATE_BITS 366 fallthrough 367 case epi.mask&linux.EPOLLET != 0: 368 // Leave epi off the ready list. 369 epi.ready = false 370 default: 371 // Queue epi to be moved to the end of the ready list. 372 requeue.PushBack(epi) 373 } 374 // Report ievents. 375 events = append(events, linux.EpollEvent{ 376 Events: ievents.ToLinux(), 377 Data: epi.userData, 378 }) 379 i++ 380 if i == maxEvents { 381 break 382 } 383 } 384 ep.ready.PushBackList(&requeue) 385 ep.mu.Unlock() 386 return events 387 }