github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/vfs/epoll.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package vfs 16 17 import ( 18 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 19 "github.com/nicocha30/gvisor-ligolo/pkg/context" 20 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 21 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 22 "github.com/nicocha30/gvisor-ligolo/pkg/waiter" 23 ) 24 25 // epollCycleMu serializes attempts to register EpollInstances with other 26 // EpollInstances in order to check for cycles. 27 var epollCycleMu sync.Mutex 28 29 // EpollInstance represents an epoll instance, as described by epoll(7). 30 // 31 // +stateify savable 32 type EpollInstance struct { 33 vfsfd FileDescription 34 FileDescriptionDefaultImpl 35 DentryMetadataFileDescriptionImpl 36 NoLockFD 37 38 // q holds waiters on this EpollInstance. 39 q waiter.Queue 40 41 // interestMu protects interest and most fields in registered 42 // epollInterests. interestMu is analogous to Linux's struct 43 // eventpoll::mtx. 44 interestMu sync.Mutex `state:"nosave"` 45 46 // interest is the set of file descriptors that are registered with the 47 // EpollInstance for monitoring. 48 interest map[epollInterestKey]*epollInterest 49 50 // readyMu protects ready, readySeq, epollInterest.ready, and 51 // epollInterest.epollInterestEntry. ready is analogous to Linux's struct 52 // eventpoll::lock. 53 readyMu epollReadyInstanceMutex `state:"nosave"` 54 55 // ready is the set of file descriptors that may be "ready" for I/O. Note 56 // that this must be an ordered list, not a map: "If more than maxevents 57 // file descriptors are ready when epoll_wait() is called, then successive 58 // epoll_wait() calls will round robin through the set of ready file 59 // descriptors. This behavior helps avoid starvation scenarios, where a 60 // process fails to notice that additional file descriptors are ready 61 // because it focuses on a set of file descriptors that are already known 62 // to be ready." - epoll_wait(2) 63 ready epollInterestList 64 65 // readySeq is used to detect calls to epollInterest.NotifyEvent() while 66 // Readiness() or ReadEvents() are running with readyMu unlocked. readySeq 67 // is protected by both interestMu and readyMu; reading requires either 68 // mutex to be locked, but mutation requires both mutexes to be locked. 69 readySeq uint32 70 } 71 72 // +stateify savable 73 type epollInterestKey struct { 74 // file is the registered FileDescription. No reference is held on file; 75 // instead, when the last reference is dropped, FileDescription.DecRef() 76 // removes the FileDescription from all EpollInstances. file is immutable. 77 file *FileDescription 78 79 // num is the file descriptor number with which this entry was registered. 80 // num is immutable. 81 num int32 82 } 83 84 // epollInterest represents an EpollInstance's interest in a file descriptor. 85 // 86 // +stateify savable 87 type epollInterest struct { 88 // epoll is the owning EpollInstance. epoll is immutable. 89 epoll *EpollInstance `state:"wait"` 90 91 // key is the file to which this epollInterest applies. key is immutable. 92 key epollInterestKey 93 94 // waiter is registered with key.file. entry is protected by 95 // epoll.interestMu. 96 waiter waiter.Entry 97 98 // mask is the event mask associated with this registration, including 99 // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.interestMu. 100 mask uint32 101 102 // ready is true if epollInterestEntry is linked into epoll.ready. readySeq 103 // is the value of epoll.readySeq when NotifyEvent() was last called. 104 // ready, epollInterestEntry, and readySeq are protected by epoll.readyMu. 105 ready bool 106 epollInterestEntry 107 readySeq uint32 108 109 // userData is the struct epoll_event::data associated with this 110 // epollInterest. userData is protected by epoll.interestMu. 111 userData [2]int32 112 } 113 114 // NewEpollInstanceFD returns a FileDescription representing a new epoll 115 // instance. A reference is taken on the returned FileDescription. 116 func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) { 117 vd := vfs.NewAnonVirtualDentry("[eventpoll]") 118 defer vd.DecRef(ctx) 119 ep := &EpollInstance{ 120 interest: make(map[epollInterestKey]*epollInterest), 121 } 122 if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ 123 DenyPRead: true, 124 DenyPWrite: true, 125 UseDentryMetadata: true, 126 }); err != nil { 127 return nil, err 128 } 129 return &ep.vfsfd, nil 130 } 131 132 // Release implements FileDescriptionImpl.Release. 133 func (ep *EpollInstance) Release(ctx context.Context) { 134 // Unregister all polled fds. 135 ep.interestMu.Lock() 136 defer ep.interestMu.Unlock() 137 for key, epi := range ep.interest { 138 file := key.file 139 file.epollMu.Lock() 140 delete(file.epolls, epi) 141 file.epollMu.Unlock() 142 file.EventUnregister(&epi.waiter) 143 } 144 ep.interest = nil 145 } 146 147 // Readiness implements waiter.Waitable.Readiness. 148 func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask { 149 if mask&waiter.ReadableEvents == 0 { 150 return 0 151 } 152 153 // We can't call FileDescription.Readiness() while holding ep.readyMu. 154 // Instead, hold ep.interestMu to prevent changes to the set of 155 // epollInterests, then temporarily move all epollInterests already on 156 // ep.ready to a local list that we can iterate without holding ep.readyMu. 157 // epollInterest.ready is left set to true so that 158 // epollInterest.NotifyEvent() doesn't touch epollInterestEntry. 159 ep.interestMu.Lock() 160 defer ep.interestMu.Unlock() 161 var ( 162 ready epollInterestList 163 notReady epollInterestList 164 ) 165 ep.readyMu.Lock() 166 ready.PushBackList(&ep.ready) 167 ep.readySeq++ 168 ep.readyMu.Unlock() 169 if ready.Empty() { 170 return 0 171 } 172 defer func() { 173 notify := false 174 ep.readyMu.Lock() 175 ep.ready.PushFrontList(&ready) 176 var next *epollInterest 177 for epi := notReady.Front(); epi != nil; epi = next { 178 next = epi.Next() 179 if epi.readySeq == ep.readySeq { 180 // epi.NotifyEvent() was called while we were running. 181 notReady.Remove(epi) 182 ep.ready.PushBack(epi) 183 notify = true 184 } else { 185 epi.ready = false 186 } 187 } 188 ep.readyMu.Unlock() 189 if notify { 190 ep.q.Notify(waiter.ReadableEvents) 191 } 192 }() 193 194 var next *epollInterest 195 for epi := ready.Front(); epi != nil; epi = next { 196 next = epi.Next() 197 wmask := waiter.EventMaskFromLinux(epi.mask) 198 if epi.key.file.Readiness(wmask)&wmask != 0 { 199 return waiter.ReadableEvents 200 } 201 // epi.key.file was readied spuriously; leave it off of ep.ready. 202 ready.Remove(epi) 203 notReady.PushBack(epi) 204 } 205 return 0 206 } 207 208 // EventRegister implements waiter.Waitable.EventRegister. 209 func (ep *EpollInstance) EventRegister(e *waiter.Entry) error { 210 ep.q.EventRegister(e) 211 return nil 212 } 213 214 // EventUnregister implements waiter.Waitable.EventUnregister. 215 func (ep *EpollInstance) EventUnregister(e *waiter.Entry) { 216 ep.q.EventUnregister(e) 217 } 218 219 // Epollable implements FileDescriptionImpl.Epollable. 220 func (ep *EpollInstance) Epollable() bool { 221 return true 222 } 223 224 // Seek implements FileDescriptionImpl.Seek. 225 func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 226 // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek 227 return 0, nil 228 } 229 230 // AddInterest implements the semantics of EPOLL_CTL_ADD. 231 // 232 // Preconditions: A reference must be held on file. 233 func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error { 234 if !file.Epollable() { 235 return linuxerr.EPERM 236 } 237 238 // Check for cyclic polling if necessary. 239 subep, _ := file.impl.(*EpollInstance) 240 if subep != nil { 241 epollCycleMu.Lock() 242 // epollCycleMu must be locked for the rest of AddInterest to ensure 243 // that cyclic polling is not introduced after the check. 244 defer epollCycleMu.Unlock() 245 if subep.mightPoll(ep) { 246 return linuxerr.ELOOP 247 } 248 } 249 250 ep.interestMu.Lock() 251 defer ep.interestMu.Unlock() 252 253 // Fail if the key is already registered. 254 key := epollInterestKey{ 255 file: file, 256 num: num, 257 } 258 if _, ok := ep.interest[key]; ok { 259 return linuxerr.EEXIST 260 } 261 262 // Register interest in file. 263 mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP 264 epi := &epollInterest{ 265 epoll: ep, 266 key: key, 267 mask: mask, 268 userData: event.Data, 269 } 270 ep.interest[key] = epi 271 wmask := waiter.EventMaskFromLinux(mask) 272 epi.waiter.Init(epi, wmask) 273 if err := file.EventRegister(&epi.waiter); err != nil { 274 return err 275 } 276 277 // Check if the file is already ready. 278 if m := file.Readiness(wmask) & wmask; m != 0 { 279 epi.NotifyEvent(m) 280 } 281 282 // Add epi to file.epolls so that it is removed when the last 283 // FileDescription reference is dropped. 284 file.epollMu.Lock() 285 if file.epolls == nil { 286 file.epolls = make(map[*epollInterest]struct{}) 287 } 288 file.epolls[epi] = struct{}{} 289 file.epollMu.Unlock() 290 291 return nil 292 } 293 294 func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool { 295 return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS 296 } 297 298 func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool { 299 ep.interestMu.Lock() 300 defer ep.interestMu.Unlock() 301 for key := range ep.interest { 302 nextep, ok := key.file.impl.(*EpollInstance) 303 if !ok { 304 continue 305 } 306 if nextep == ep2 { 307 return true 308 } 309 if remainingRecursion == 0 { 310 return true 311 } 312 if nextep.mightPollRecursive(ep2, remainingRecursion-1) { 313 return true 314 } 315 } 316 return false 317 } 318 319 // ModifyInterest implements the semantics of EPOLL_CTL_MOD. 320 // 321 // Preconditions: A reference must be held on file. 322 func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error { 323 ep.interestMu.Lock() 324 defer ep.interestMu.Unlock() 325 326 // Fail if the key is not already registered. 327 epi, ok := ep.interest[epollInterestKey{ 328 file: file, 329 num: num, 330 }] 331 if !ok { 332 return linuxerr.ENOENT 333 } 334 335 // Update epi for the next call to ep.ReadEvents(). 336 mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP 337 epi.mask = mask 338 epi.userData = event.Data 339 340 // Re-register with the new mask. 341 file.EventUnregister(&epi.waiter) 342 wmask := waiter.EventMaskFromLinux(mask) 343 epi.waiter.Init(epi, wmask) 344 if err := file.EventRegister(&epi.waiter); err != nil { 345 return err 346 } 347 348 // Check if the file is already ready with the new mask. 349 if m := file.Readiness(wmask) & wmask; m != 0 { 350 epi.NotifyEvent(m) 351 } 352 353 return nil 354 } 355 356 // DeleteInterest implements the semantics of EPOLL_CTL_DEL. 357 // 358 // Preconditions: A reference must be held on file. 359 func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error { 360 ep.interestMu.Lock() 361 defer ep.interestMu.Unlock() 362 363 // Fail if the key is not already registered. 364 epi, ok := ep.interest[epollInterestKey{ 365 file: file, 366 num: num, 367 }] 368 if !ok { 369 return linuxerr.ENOENT 370 } 371 372 // Unregister from the file so that epi will no longer be readied. 373 file.EventUnregister(&epi.waiter) 374 375 // Forget about epi. 376 ep.removeLocked(epi) 377 378 file.epollMu.Lock() 379 delete(file.epolls, epi) 380 file.epollMu.Unlock() 381 382 return nil 383 } 384 385 // NotifyEvent implements waiter.EventListener.NotifyEvent. 386 func (epi *epollInterest) NotifyEvent(waiter.EventMask) { 387 newReady := false 388 epi.epoll.readyMu.Lock() 389 if !epi.ready { 390 newReady = true 391 epi.ready = true 392 epi.epoll.ready.PushBack(epi) 393 } 394 epi.readySeq = epi.epoll.readySeq 395 epi.epoll.readyMu.Unlock() 396 if newReady { 397 epi.epoll.q.Notify(waiter.ReadableEvents) 398 } 399 } 400 401 // Preconditions: ep.interestMu must be locked. 402 func (ep *EpollInstance) removeLocked(epi *epollInterest) { 403 delete(ep.interest, epi.key) 404 ep.readyMu.Lock() 405 if epi.ready { 406 epi.ready = false 407 ep.ready.Remove(epi) 408 } 409 ep.readyMu.Unlock() 410 } 411 412 // ReadEvents appends up to maxReady events to events and returns the updated 413 // slice of events. 414 func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent { 415 // We can't call FileDescription.Readiness() while holding ep.readyMu. 416 // Instead, hold ep.interestMu to prevent changes to the set of 417 // epollInterests, then temporarily move all epollInterests already on 418 // ep.ready to a local list that we can iterate without holding ep.readyMu. 419 // epollInterest.ready is left set to true so that 420 // epollInterest.NotifyEvent() doesn't touch epollInterestEntry. 421 ep.interestMu.Lock() 422 defer ep.interestMu.Unlock() 423 var ( 424 ready epollInterestList 425 notReady epollInterestList 426 requeue epollInterestList 427 ) 428 ep.readyMu.Lock() 429 ready.PushBackList(&ep.ready) 430 ep.readySeq++ 431 ep.readyMu.Unlock() 432 if ready.Empty() { 433 return nil 434 } 435 defer func() { 436 notify := false 437 ep.readyMu.Lock() 438 // epollInterests that we never checked are re-inserted at the start of 439 // ep.ready. epollInterests that were ready are re-inserted at the end 440 // for reasons described by EpollInstance.ready. 441 ep.ready.PushFrontList(&ready) 442 var next *epollInterest 443 for epi := notReady.Front(); epi != nil; epi = next { 444 next = epi.Next() 445 if epi.readySeq == ep.readySeq { 446 // epi.NotifyEvent() was called while we were running. 447 notReady.Remove(epi) 448 ep.ready.PushBack(epi) 449 notify = true 450 } else { 451 epi.ready = false 452 } 453 } 454 ep.ready.PushBackList(&requeue) 455 ep.readyMu.Unlock() 456 if notify { 457 ep.q.Notify(waiter.ReadableEvents) 458 } 459 }() 460 461 i := 0 462 var next *epollInterest 463 for epi := ready.Front(); epi != nil; epi = next { 464 next = epi.Next() 465 // Regardless of what else happens, epi is initially removed from the 466 // ready list. 467 ready.Remove(epi) 468 wmask := waiter.EventMaskFromLinux(epi.mask) 469 ievents := epi.key.file.Readiness(wmask) & wmask 470 if ievents == 0 { 471 // Leave epi off the ready list. 472 notReady.PushBack(epi) 473 continue 474 } 475 // Determine what we should do with epi. 476 switch { 477 case epi.mask&linux.EPOLLONESHOT != 0: 478 // Clear all events from the mask; they must be re-added by 479 // EPOLL_CTL_MOD. 480 epi.mask &= linux.EP_PRIVATE_BITS 481 fallthrough 482 case epi.mask&linux.EPOLLET != 0: 483 // Leave epi off the ready list. 484 notReady.PushBack(epi) 485 default: 486 // Queue epi to be moved to the end of the ready list. 487 requeue.PushBack(epi) 488 } 489 // Report ievents. 490 events = append(events, linux.EpollEvent{ 491 Events: ievents.ToLinux(), 492 Data: epi.userData, 493 }) 494 i++ 495 if i == maxEvents { 496 break 497 } 498 } 499 return events 500 }