github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/epoll/epoll.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package epoll provides an implementation of Linux's IO event notification 16 // facility. See epoll(7) for more details. 17 // 18 // Lock order: 19 // EventPoll.mu 20 // fdnotifier.notifier.mu 21 // EventPoll.listsMu 22 // unix.baseEndpoint.Mutex 23 package epoll 24 25 import ( 26 "fmt" 27 28 "golang.org/x/sys/unix" 29 "github.com/SagerNet/gvisor/pkg/abi/linux" 30 "github.com/SagerNet/gvisor/pkg/context" 31 "github.com/SagerNet/gvisor/pkg/refs" 32 "github.com/SagerNet/gvisor/pkg/sentry/fs" 33 "github.com/SagerNet/gvisor/pkg/sentry/fs/anon" 34 "github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil" 35 "github.com/SagerNet/gvisor/pkg/sync" 36 "github.com/SagerNet/gvisor/pkg/usermem" 37 "github.com/SagerNet/gvisor/pkg/waiter" 38 ) 39 40 // EntryFlags is a bitmask that holds an entry's flags. 41 type EntryFlags int 42 43 // Valid entry flags. 44 const ( 45 OneShot EntryFlags = 1 << iota 46 EdgeTriggered 47 ) 48 49 // FileIdentifier identifies a file. We cannot use just the FD because it could 50 // potentially be reassigned. We also cannot use just the file pointer because 51 // it is possible to have multiple entries for the same file object as long as 52 // they are created with different FDs (i.e., the FDs point to the same file). 53 // 54 // +stateify savable 55 type FileIdentifier struct { 56 File *fs.File `state:"wait"` 57 Fd int32 58 } 59 60 // pollEntry holds all the state associated with an event poll entry, that is, 61 // a file being observed by an event poll object. 62 // 63 // +stateify savable 64 type pollEntry struct { 65 pollEntryEntry 66 file *refs.WeakRef `state:"manual"` 67 id FileIdentifier `state:"wait"` 68 userData [2]int32 69 waiter waiter.Entry `state:"manual"` 70 mask waiter.EventMask 71 flags EntryFlags 72 73 epoll *EventPoll 74 75 // We cannot save the current list pointer as it points into EventPoll 76 // struct, while state framework currently does not support such 77 // in-struct pointers. Instead, EventPoll will properly set this field 78 // in its loading logic. 79 curList *pollEntryList `state:"nosave"` 80 } 81 82 // WeakRefGone implements refs.WeakRefUser.WeakRefGone. 83 // weakReferenceGone is called when the file in the weak reference is destroyed. 84 // The poll entry is removed in response to this. 85 func (p *pollEntry) WeakRefGone(ctx context.Context) { 86 p.epoll.RemoveEntry(ctx, p.id) 87 } 88 89 // EventPoll holds all the state associated with an event poll object, that is, 90 // collection of files to observe and their current state. 91 // 92 // +stateify savable 93 type EventPoll struct { 94 fsutil.FilePipeSeek `state:"zerovalue"` 95 fsutil.FileNotDirReaddir `state:"zerovalue"` 96 fsutil.FileNoFsync `state:"zerovalue"` 97 fsutil.FileNoopFlush `state:"zerovalue"` 98 fsutil.FileNoIoctl `state:"zerovalue"` 99 fsutil.FileNoMMap `state:"zerovalue"` 100 fsutil.FileNoSplice `state:"nosave"` 101 fsutil.FileUseInodeUnstableAttr `state:"nosave"` 102 103 // Wait queue is used to notify interested parties when the event poll 104 // object itself becomes readable or writable. 105 waiter.Queue `state:"zerovalue"` 106 107 // files is the map of all the files currently being observed, it is 108 // protected by mu. 109 mu sync.Mutex `state:"nosave"` 110 files map[FileIdentifier]*pollEntry 111 112 // listsMu protects manipulation of the lists below. It needs to be a 113 // different lock to avoid circular lock acquisition order involving 114 // the wait queue mutexes and mu. The full order is mu, observed file 115 // wait queue mutex, then listsMu; this allows listsMu to be acquired 116 // when (*pollEntry).Callback is called. 117 // 118 // An entry is always in one of the following lists: 119 // readyList -- when there's a chance that it's ready to have 120 // events delivered to epoll waiters. Given that being 121 // ready is a transient state, the Readiness() and 122 // readEvents() functions always call the entry's file 123 // Readiness() function to confirm it's ready. 124 // waitingList -- when there's no chance that the entry is ready, 125 // so it's waiting for the (*pollEntry).Callback to be called 126 // on it before it gets moved to the readyList. 127 // disabledList -- when the entry is disabled. This happens when 128 // a one-shot entry gets delivered via readEvents(). 129 listsMu sync.Mutex `state:"nosave"` 130 readyList pollEntryList 131 waitingList pollEntryList 132 disabledList pollEntryList 133 } 134 135 // cycleMu is used to serialize all the cycle checks. This is only used when 136 // an event poll file is added as an entry to another event poll. Such checks 137 // are serialized to avoid lock acquisition order inversion: if a thread is 138 // adding A to B, and another thread is adding B to A, each would acquire A's 139 // and B's mutexes in reverse order, and could cause deadlocks. Having this 140 // lock prevents this by allowing only one check at a time to happen. 141 // 142 // We do the cycle check to prevent callers from introducing potentially 143 // infinite recursions. If a caller were to add A to B and then B to A, for 144 // event poll A to know if it's readable, it would need to check event poll B, 145 // which in turn would need event poll A and so on indefinitely. 146 var cycleMu sync.Mutex 147 148 // NewEventPoll allocates and initializes a new event poll object. 149 func NewEventPoll(ctx context.Context) *fs.File { 150 // name matches fs/eventpoll.c:epoll_create1. 151 dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) 152 // Release the initial dirent reference after NewFile takes a reference. 153 defer dirent.DecRef(ctx) 154 return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ 155 files: make(map[FileIdentifier]*pollEntry), 156 }) 157 } 158 159 // Release implements fs.FileOperations.Release. 160 func (e *EventPoll) Release(ctx context.Context) { 161 // We need to take the lock now because files may be attempting to 162 // remove entries in parallel if they get destroyed. 163 e.mu.Lock() 164 defer e.mu.Unlock() 165 166 // Go through all entries and clean up. 167 for _, entry := range e.files { 168 entry.id.File.EventUnregister(&entry.waiter) 169 entry.file.Drop(ctx) 170 } 171 e.files = nil 172 } 173 174 // Read implements fs.FileOperations.Read. 175 func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { 176 return 0, unix.ENOSYS 177 } 178 179 // Write implements fs.FileOperations.Write. 180 func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { 181 return 0, unix.ENOSYS 182 } 183 184 // eventsAvailable determines if 'e' has events available for delivery. 185 func (e *EventPoll) eventsAvailable() bool { 186 e.listsMu.Lock() 187 188 for it := e.readyList.Front(); it != nil; { 189 entry := it 190 it = it.Next() 191 192 // If the entry is ready, we know 'e' has at least one entry 193 // ready for delivery. 194 ready := entry.id.File.Readiness(entry.mask) 195 if ready != 0 { 196 e.listsMu.Unlock() 197 return true 198 } 199 200 // Entry is not ready, so move it to waiting list. 201 e.readyList.Remove(entry) 202 e.waitingList.PushBack(entry) 203 entry.curList = &e.waitingList 204 } 205 206 e.listsMu.Unlock() 207 208 return false 209 } 210 211 // Readiness determines if the event poll object is currently readable (i.e., 212 // if there are pending events for delivery). 213 func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { 214 ready := waiter.EventMask(0) 215 216 if (mask&waiter.ReadableEvents) != 0 && e.eventsAvailable() { 217 ready |= waiter.ReadableEvents 218 } 219 220 return ready 221 } 222 223 // ReadEvents returns up to max available events. 224 func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { 225 var local pollEntryList 226 var ret []linux.EpollEvent 227 228 e.listsMu.Lock() 229 230 // Go through all entries we believe may be ready. 231 for it := e.readyList.Front(); it != nil && len(ret) < max; { 232 entry := it 233 it = it.Next() 234 235 // Check the entry's readiness. It it's not really ready, we 236 // just put it back in the waiting list and move on to the next 237 // entry. 238 ready := entry.id.File.Readiness(entry.mask) & entry.mask 239 if ready == 0 { 240 e.readyList.Remove(entry) 241 e.waitingList.PushBack(entry) 242 entry.curList = &e.waitingList 243 244 continue 245 } 246 247 // Add event to the array that will be returned to caller. 248 ret = append(ret, linux.EpollEvent{ 249 Events: uint32(ready), 250 Data: entry.userData, 251 }) 252 253 // The entry is consumed, so we must move it to the disabled 254 // list in case it's one-shot, or back to the wait list if it's 255 // edge-triggered. If it's neither, we leave it in the ready 256 // list so that its readiness can be checked the next time 257 // around; however, we must move it to the end of the list so 258 // that other events can be delivered as well. 259 e.readyList.Remove(entry) 260 if entry.flags&OneShot != 0 { 261 e.disabledList.PushBack(entry) 262 entry.curList = &e.disabledList 263 } else if entry.flags&EdgeTriggered != 0 { 264 e.waitingList.PushBack(entry) 265 entry.curList = &e.waitingList 266 } else { 267 local.PushBack(entry) 268 } 269 } 270 271 e.readyList.PushBackList(&local) 272 273 e.listsMu.Unlock() 274 275 return ret 276 } 277 278 // Callback implements waiter.EntryCallback.Callback. 279 // 280 // Callback is called when one of the files we're polling becomes ready. It 281 // moves said file to the readyList if it's currently in the waiting list. 282 func (p *pollEntry) Callback(*waiter.Entry, waiter.EventMask) { 283 e := p.epoll 284 285 e.listsMu.Lock() 286 287 if p.curList == &e.waitingList { 288 e.waitingList.Remove(p) 289 e.readyList.PushBack(p) 290 p.curList = &e.readyList 291 e.listsMu.Unlock() 292 293 e.Notify(waiter.ReadableEvents) 294 return 295 } 296 297 e.listsMu.Unlock() 298 } 299 300 // initEntryReadiness initializes the entry's state with regards to its 301 // readiness by placing it in the appropriate list and registering for 302 // notifications. 303 func (e *EventPoll) initEntryReadiness(entry *pollEntry) { 304 // A new entry starts off in the waiting list. 305 e.listsMu.Lock() 306 e.waitingList.PushBack(entry) 307 entry.curList = &e.waitingList 308 e.listsMu.Unlock() 309 310 // Register for event notifications. 311 f := entry.id.File 312 f.EventRegister(&entry.waiter, entry.mask) 313 314 // Check if the file happens to already be in a ready state. 315 if ready := f.Readiness(entry.mask) & entry.mask; ready != 0 { 316 entry.Callback(&entry.waiter, ready) 317 } 318 } 319 320 // observes checks if event poll object e is directly or indirectly observing 321 // event poll object ep. It uses a bounded recursive depth-first search. 322 func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool { 323 // If we reached the maximum depth, we'll consider that we found it 324 // because we don't want to allow chains that are too long. 325 if depthLeft <= 0 { 326 return true 327 } 328 329 e.mu.Lock() 330 defer e.mu.Unlock() 331 332 // Go through each observed file and check if it is or observes ep. 333 for id := range e.files { 334 f, ok := id.File.FileOperations.(*EventPoll) 335 if !ok { 336 continue 337 } 338 339 if f == ep || f.observes(ep, depthLeft-1) { 340 return true 341 } 342 } 343 344 return false 345 } 346 347 // AddEntry adds a new file to the collection of files observed by e. 348 func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { 349 // Acquire cycle check lock if another event poll is being added. 350 ep, ok := id.File.FileOperations.(*EventPoll) 351 if ok { 352 cycleMu.Lock() 353 defer cycleMu.Unlock() 354 } 355 356 e.mu.Lock() 357 defer e.mu.Unlock() 358 359 // Fail if the file already has an entry. 360 if _, ok := e.files[id]; ok { 361 return unix.EEXIST 362 } 363 364 // Check if a cycle would be created. We use 4 as the limit because 365 // that's the value used by linux and we want to emulate it. 366 if ep != nil { 367 if e == ep { 368 return unix.EINVAL 369 } 370 371 if ep.observes(e, 4) { 372 return unix.ELOOP 373 } 374 } 375 376 // Create new entry and add it to map. 377 // 378 // N.B. Even though we are creating a weak reference here, we know it 379 // won't trigger a callback because we hold a reference to the file 380 // throughout the execution of this function. 381 entry := &pollEntry{ 382 id: id, 383 userData: data, 384 epoll: e, 385 flags: flags, 386 mask: mask, 387 } 388 entry.waiter.Callback = entry 389 e.files[id] = entry 390 entry.file = refs.NewWeakRef(id.File, entry) 391 392 // Initialize the readiness state of the new entry. 393 e.initEntryReadiness(entry) 394 395 return nil 396 } 397 398 // UpdateEntry updates the flags, mask and user data associated with a file that 399 // is already part of the collection of observed files. 400 func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { 401 e.mu.Lock() 402 defer e.mu.Unlock() 403 404 // Fail if the file doesn't have an entry. 405 entry, ok := e.files[id] 406 if !ok { 407 return unix.ENOENT 408 } 409 410 // Unregister the old mask and remove entry from the list it's in, so 411 // (*pollEntry).Callback is guaranteed to not be called on this entry anymore. 412 entry.id.File.EventUnregister(&entry.waiter) 413 414 // Remove entry from whatever list it's in. This ensure that no other 415 // threads have access to this entry as the only way left to find it 416 // is via e.files, but we hold e.mu, which prevents that. 417 e.listsMu.Lock() 418 entry.curList.Remove(entry) 419 e.listsMu.Unlock() 420 421 // Initialize new readiness state. 422 entry.flags = flags 423 entry.mask = mask 424 entry.userData = data 425 e.initEntryReadiness(entry) 426 427 return nil 428 } 429 430 // RemoveEntry a files from the collection of observed files. 431 func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error { 432 e.mu.Lock() 433 defer e.mu.Unlock() 434 435 // Fail if the file doesn't have an entry. 436 entry, ok := e.files[id] 437 if !ok { 438 return unix.ENOENT 439 } 440 441 // Unregister from file first so that no concurrent attempts will be 442 // made to manipulate the file. 443 entry.id.File.EventUnregister(&entry.waiter) 444 445 // Remove from the current list. 446 e.listsMu.Lock() 447 entry.curList.Remove(entry) 448 entry.curList = nil 449 e.listsMu.Unlock() 450 451 // Remove file from map, and drop weak reference. 452 delete(e.files, id) 453 entry.file.Drop(ctx) 454 455 return nil 456 } 457 458 // UnregisterEpollWaiters removes the epoll waiter objects from the waiting 459 // queues. This is different from Release() as the file is not dereferenced. 460 func (e *EventPoll) UnregisterEpollWaiters() { 461 e.mu.Lock() 462 defer e.mu.Unlock() 463 464 for _, entry := range e.files { 465 entry.id.File.EventUnregister(&entry.waiter) 466 } 467 }