gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/vfs/epoll.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs
    16  
    17  import (
    18  	"gvisor.dev/gvisor/pkg/abi/linux"
    19  	"gvisor.dev/gvisor/pkg/context"
    20  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    21  	"gvisor.dev/gvisor/pkg/sync"
    22  	"gvisor.dev/gvisor/pkg/waiter"
    23  )
    24  
// epollCycleMu serializes attempts to register EpollInstances with other
// EpollInstances in order to check for cycles. AddInterest acquires it before
// any EpollInstance.interestMu, and only when the file being registered is
// itself an EpollInstance.
var epollCycleMu sync.Mutex
    28  
// EpollInstance represents an epoll instance, as described by epoll(7).
//
// Lock order, as used by the methods in this file: epollCycleMu (when taken)
// is acquired before interestMu, and interestMu is acquired before both
// readyMu and FileDescription.epollMu.
//
// +stateify savable
type EpollInstance struct {
	vfsfd FileDescription
	FileDescriptionDefaultImpl
	DentryMetadataFileDescriptionImpl
	NoLockFD

	// q holds waiters on this EpollInstance. It is notified with
	// waiter.ReadableEvents when an epollInterest is newly added to ready.
	q waiter.Queue

	// interestMu protects interest and most fields in registered
	// epollInterests. interestMu is analogous to Linux's struct
	// eventpoll::mtx.
	interestMu sync.Mutex `state:"nosave"`

	// interest is the set of file descriptors that are registered with the
	// EpollInstance for monitoring. It is set to nil by Release.
	interest map[epollInterestKey]*epollInterest

	// readyMu protects ready, readySeq, epollInterest.ready, and
	// epollInterest.epollInterestEntry. readyMu is analogous to Linux's
	// struct eventpoll::lock.
	readyMu epollReadyInstanceMutex `state:"nosave"`

	// ready is the set of file descriptors that may be "ready" for I/O. Note
	// that this must be an ordered list, not a map: "If more than maxevents
	// file descriptors are ready when epoll_wait() is called, then successive
	// epoll_wait() calls will round robin through the set of ready file
	// descriptors. This behavior helps avoid starvation scenarios, where a
	// process fails to notice that additional file descriptors are ready
	// because it focuses on a set of file descriptors that are already known
	// to be ready." - epoll_wait(2)
	ready epollInterestList

	// readySeq is used to detect calls to epollInterest.NotifyEvent() while
	// Readiness() or ReadEvents() are running with readyMu unlocked. readySeq
	// is protected by both interestMu and readyMu; reading requires either
	// mutex to be locked, but mutation requires both mutexes to be locked.
	readySeq uint32
}
    71  
// epollInterestKey identifies a registration within an EpollInstance: the
// pair of a FileDescription and the file descriptor number it was registered
// under. It is the key type of EpollInstance.interest.
//
// +stateify savable
type epollInterestKey struct {
	// file is the registered FileDescription. No reference is held on file;
	// instead, when the last reference is dropped, FileDescription.DecRef()
	// removes the FileDescription from all EpollInstances. file is immutable.
	file *FileDescription

	// num is the file descriptor number with which this entry was registered.
	// num is immutable.
	num int32
}
    83  
// epollInterest represents an EpollInstance's interest in a file descriptor.
// It also serves as the waiter.EventListener for its own waiter entry (see
// NotifyEvent).
//
// +stateify savable
type epollInterest struct {
	// epoll is the owning EpollInstance. epoll is immutable.
	epoll *EpollInstance `state:"wait"`

	// key is the file to which this epollInterest applies. key is immutable.
	key epollInterestKey

	// waiter is registered with key.file. waiter is protected by
	// epoll.interestMu.
	waiter waiter.Entry

	// mask is the event mask associated with this registration, including
	// flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.interestMu.
	mask uint32

	// ready is true if epollInterestEntry is linked into epoll.ready. readySeq
	// is the value of epoll.readySeq when NotifyEvent() was last called.
	// ready, epollInterestEntry, and readySeq are protected by epoll.readyMu.
	ready bool
	epollInterestEntry
	readySeq uint32

	// userData is the struct epoll_event::data associated with this
	// epollInterest. userData is protected by epoll.interestMu.
	userData [2]int32
}
   113  
   114  // NewEpollInstanceFD returns a FileDescription representing a new epoll
   115  // instance. A reference is taken on the returned FileDescription.
   116  func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) {
   117  	vd := vfs.NewAnonVirtualDentry("[eventpoll]")
   118  	defer vd.DecRef(ctx)
   119  	ep := &EpollInstance{
   120  		interest: make(map[epollInterestKey]*epollInterest),
   121  	}
   122  	if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
   123  		DenyPRead:         true,
   124  		DenyPWrite:        true,
   125  		UseDentryMetadata: true,
   126  	}); err != nil {
   127  		return nil, err
   128  	}
   129  	return &ep.vfsfd, nil
   130  }
   131  
   132  // Release implements FileDescriptionImpl.Release.
   133  func (ep *EpollInstance) Release(ctx context.Context) {
   134  	// Unregister all polled fds.
   135  	ep.interestMu.Lock()
   136  	defer ep.interestMu.Unlock()
   137  	for key, epi := range ep.interest {
   138  		file := key.file
   139  		file.epollMu.Lock()
   140  		delete(file.epolls, epi)
   141  		file.epollMu.Unlock()
   142  		file.EventUnregister(&epi.waiter)
   143  	}
   144  	ep.interest = nil
   145  }
   146  
// Readiness implements waiter.Waitable.Readiness.
//
// The result is either 0 or waiter.ReadableEvents: the instance is readable
// if at least one epollInterest on the ready list refers to a file that
// currently has events matching its registered mask. If mask contains no
// readable events, 0 is returned without inspecting the ready list.
//
// As a side effect, epollInterests that turn out not to be ready are dropped
// from ep.ready, unless NotifyEvent() was called for them while Readiness was
// running.
func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask {
	if mask&waiter.ReadableEvents == 0 {
		return 0
	}

	// We can't call FileDescription.Readiness() while holding ep.readyMu.
	// Instead, hold ep.interestMu to prevent changes to the set of
	// epollInterests, then temporarily move all epollInterests already on
	// ep.ready to a local list that we can iterate without holding ep.readyMu.
	// epollInterest.ready is left set to true so that
	// epollInterest.NotifyEvent() doesn't touch epollInterestEntry.
	ep.interestMu.Lock()
	defer ep.interestMu.Unlock()
	var (
		// ready holds the entries taken from ep.ready that have not been
		// ruled out yet; notReady collects entries whose file reported no
		// matching events.
		ready    epollInterestList
		notReady epollInterestList
	)
	ep.readyMu.Lock()
	ready.PushBackList(&ep.ready)
	// Start a new sequence so that NotifyEvent() calls made from here on can
	// be told apart from earlier ones (see the deferred function below).
	ep.readySeq++
	ep.readyMu.Unlock()
	if ready.Empty() {
		return 0
	}
	// Deferred functions run in LIFO order, so this runs before the deferred
	// ep.interestMu.Unlock() above, i.e. with ep.interestMu still held.
	defer func() {
		notify := false
		ep.readyMu.Lock()
		// Entries still on the local ready list (those not ruled out by the
		// loop below) go back to the front of ep.ready.
		ep.ready.PushFrontList(&ready)
		var next *epollInterest
		for epi := notReady.Front(); epi != nil; epi = next {
			// Fetch the successor first: epi may be unlinked from notReady.
			next = epi.Next()
			if epi.readySeq == ep.readySeq {
				// epi.NotifyEvent() was called while we were running.
				notReady.Remove(epi)
				ep.ready.PushBack(epi)
				notify = true
			} else {
				epi.ready = false
			}
		}
		ep.readyMu.Unlock()
		// Notify waiters only after releasing ep.readyMu.
		if notify {
			ep.q.Notify(waiter.ReadableEvents)
		}
	}()

	var next *epollInterest
	for epi := ready.Front(); epi != nil; epi = next {
		// Fetch the successor first: epi may be moved to notReady below.
		next = epi.Next()
		wmask := waiter.EventMaskFromLinux(epi.mask)
		if epi.key.file.Readiness(wmask)&wmask != 0 {
			// One ready file is enough; epi and everything after it stay on
			// the local ready list and are restored by the deferred function.
			return waiter.ReadableEvents
		}
		// epi.key.file was readied spuriously; leave it off of ep.ready.
		ready.Remove(epi)
		notReady.PushBack(epi)
	}
	return 0
}
   207  
// EventRegister implements waiter.Waitable.EventRegister.
//
// e is added to ep.q, the queue that is notified with waiter.ReadableEvents
// when a registered file is newly marked ready. The returned error is always
// nil.
func (ep *EpollInstance) EventRegister(e *waiter.Entry) error {
	ep.q.EventRegister(e)
	return nil
}
   213  
// EventUnregister implements waiter.Waitable.EventUnregister.
//
// e is removed from ep.q, undoing a previous EventRegister.
func (ep *EpollInstance) EventUnregister(e *waiter.Entry) {
	ep.q.EventUnregister(e)
}
   218  
// Epollable implements FileDescriptionImpl.Epollable.
//
// It always returns true: an epoll instance may itself be registered with
// another epoll instance, subject to the cycle check in AddInterest.
func (ep *EpollInstance) Epollable() bool {
	return true
}
   223  
// Seek implements FileDescriptionImpl.Seek.
//
// Seeking an epoll instance is a no-op: offset and whence are ignored and the
// result is always offset 0 with a nil error.
func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	// Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek
	return 0, nil
}
   229  
   230  // AddInterest implements the semantics of EPOLL_CTL_ADD.
   231  //
   232  // Preconditions: A reference must be held on file.
   233  func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
   234  	if !file.Epollable() {
   235  		return linuxerr.EPERM
   236  	}
   237  
   238  	// Check for cyclic polling if necessary.
   239  	subep, _ := file.impl.(*EpollInstance)
   240  	if subep != nil {
   241  		epollCycleMu.Lock()
   242  		// epollCycleMu must be locked for the rest of AddInterest to ensure
   243  		// that cyclic polling is not introduced after the check.
   244  		defer epollCycleMu.Unlock()
   245  		if subep.mightPoll(ep) {
   246  			return linuxerr.ELOOP
   247  		}
   248  	}
   249  
   250  	ep.interestMu.Lock()
   251  	defer ep.interestMu.Unlock()
   252  
   253  	// Fail if the key is already registered.
   254  	key := epollInterestKey{
   255  		file: file,
   256  		num:  num,
   257  	}
   258  	if _, ok := ep.interest[key]; ok {
   259  		return linuxerr.EEXIST
   260  	}
   261  
   262  	// Register interest in file.
   263  	mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
   264  	epi := &epollInterest{
   265  		epoll:    ep,
   266  		key:      key,
   267  		mask:     mask,
   268  		userData: event.Data,
   269  	}
   270  	ep.interest[key] = epi
   271  	wmask := waiter.EventMaskFromLinux(mask)
   272  	epi.waiter.Init(epi, wmask)
   273  	if err := file.EventRegister(&epi.waiter); err != nil {
   274  		return err
   275  	}
   276  
   277  	// Check if the file is already ready.
   278  	if m := file.Readiness(wmask) & wmask; m != 0 {
   279  		epi.NotifyEvent(m)
   280  	}
   281  
   282  	// Add epi to file.epolls so that it is removed when the last
   283  	// FileDescription reference is dropped.
   284  	file.epollMu.Lock()
   285  	if file.epolls == nil {
   286  		file.epolls = make(map[*epollInterest]struct{})
   287  	}
   288  	file.epolls[epi] = struct{}{}
   289  	file.epollMu.Unlock()
   290  
   291  	return nil
   292  }
   293  
// mightPoll returns true if ep might poll ep2, either directly or through
// nested EpollInstances registered with ep. The search follows at most 4
// levels of nesting; a chain that would need to be followed further is
// conservatively reported as true.
//
// mightPoll locks the interestMu of each EpollInstance it visits.
func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool {
	return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS
}
   297  
   298  func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool {
   299  	ep.interestMu.Lock()
   300  	defer ep.interestMu.Unlock()
   301  	for key := range ep.interest {
   302  		nextep, ok := key.file.impl.(*EpollInstance)
   303  		if !ok {
   304  			continue
   305  		}
   306  		if nextep == ep2 {
   307  			return true
   308  		}
   309  		if remainingRecursion == 0 {
   310  			return true
   311  		}
   312  		if nextep.mightPollRecursive(ep2, remainingRecursion-1) {
   313  			return true
   314  		}
   315  	}
   316  	return false
   317  }
   318  
   319  // ModifyInterest implements the semantics of EPOLL_CTL_MOD.
   320  //
   321  // Preconditions: A reference must be held on file.
   322  func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
   323  	ep.interestMu.Lock()
   324  	defer ep.interestMu.Unlock()
   325  
   326  	// Fail if the key is not already registered.
   327  	epi, ok := ep.interest[epollInterestKey{
   328  		file: file,
   329  		num:  num,
   330  	}]
   331  	if !ok {
   332  		return linuxerr.ENOENT
   333  	}
   334  
   335  	// Update epi for the next call to ep.ReadEvents().
   336  	mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
   337  	epi.mask = mask
   338  	epi.userData = event.Data
   339  
   340  	// Re-register with the new mask.
   341  	file.EventUnregister(&epi.waiter)
   342  	wmask := waiter.EventMaskFromLinux(mask)
   343  	epi.waiter.Init(epi, wmask)
   344  	if err := file.EventRegister(&epi.waiter); err != nil {
   345  		return err
   346  	}
   347  
   348  	// Check if the file is already ready with the new mask.
   349  	if m := file.Readiness(wmask) & wmask; m != 0 {
   350  		epi.NotifyEvent(m)
   351  	}
   352  
   353  	return nil
   354  }
   355  
   356  // DeleteInterest implements the semantics of EPOLL_CTL_DEL.
   357  //
   358  // Preconditions: A reference must be held on file.
   359  func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error {
   360  	ep.interestMu.Lock()
   361  	defer ep.interestMu.Unlock()
   362  
   363  	// Fail if the key is not already registered.
   364  	epi, ok := ep.interest[epollInterestKey{
   365  		file: file,
   366  		num:  num,
   367  	}]
   368  	if !ok {
   369  		return linuxerr.ENOENT
   370  	}
   371  
   372  	// Unregister from the file so that epi will no longer be readied.
   373  	file.EventUnregister(&epi.waiter)
   374  
   375  	// Forget about epi.
   376  	ep.removeLocked(epi)
   377  
   378  	file.epollMu.Lock()
   379  	delete(file.epolls, epi)
   380  	file.epollMu.Unlock()
   381  
   382  	return nil
   383  }
   384  
   385  // NotifyEvent implements waiter.EventListener.NotifyEvent.
   386  func (epi *epollInterest) NotifyEvent(waiter.EventMask) {
   387  	newReady := false
   388  	epi.epoll.readyMu.Lock()
   389  	if !epi.ready {
   390  		newReady = true
   391  		epi.ready = true
   392  		epi.epoll.ready.PushBack(epi)
   393  	}
   394  	epi.readySeq = epi.epoll.readySeq
   395  	epi.epoll.readyMu.Unlock()
   396  	if newReady {
   397  		epi.epoll.q.Notify(waiter.ReadableEvents)
   398  	}
   399  }
   400  
   401  // Preconditions: ep.interestMu must be locked.
   402  func (ep *EpollInstance) removeLocked(epi *epollInterest) {
   403  	delete(ep.interest, epi.key)
   404  	ep.readyMu.Lock()
   405  	if epi.ready {
   406  		epi.ready = false
   407  		ep.ready.Remove(epi)
   408  	}
   409  	ep.readyMu.Unlock()
   410  }
   411  
   412  // ReadEvents appends up to maxReady events to events and returns the updated
   413  // slice of events.
   414  func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent {
   415  	// We can't call FileDescription.Readiness() while holding ep.readyMu.
   416  	// Instead, hold ep.interestMu to prevent changes to the set of
   417  	// epollInterests, then temporarily move all epollInterests already on
   418  	// ep.ready to a local list that we can iterate without holding ep.readyMu.
   419  	// epollInterest.ready is left set to true so that
   420  	// epollInterest.NotifyEvent() doesn't touch epollInterestEntry.
   421  	ep.interestMu.Lock()
   422  	defer ep.interestMu.Unlock()
   423  	var (
   424  		ready    epollInterestList
   425  		notReady epollInterestList
   426  		requeue  epollInterestList
   427  	)
   428  	ep.readyMu.Lock()
   429  	ready.PushBackList(&ep.ready)
   430  	ep.readySeq++
   431  	ep.readyMu.Unlock()
   432  	if ready.Empty() {
   433  		return nil
   434  	}
   435  	defer func() {
   436  		notify := false
   437  		ep.readyMu.Lock()
   438  		// epollInterests that we never checked are re-inserted at the start of
   439  		// ep.ready. epollInterests that were ready are re-inserted at the end
   440  		// for reasons described by EpollInstance.ready.
   441  		ep.ready.PushFrontList(&ready)
   442  		var next *epollInterest
   443  		for epi := notReady.Front(); epi != nil; epi = next {
   444  			next = epi.Next()
   445  			if epi.readySeq == ep.readySeq {
   446  				// epi.NotifyEvent() was called while we were running.
   447  				notReady.Remove(epi)
   448  				ep.ready.PushBack(epi)
   449  				notify = true
   450  			} else {
   451  				epi.ready = false
   452  			}
   453  		}
   454  		ep.ready.PushBackList(&requeue)
   455  		ep.readyMu.Unlock()
   456  		if notify {
   457  			ep.q.Notify(waiter.ReadableEvents)
   458  		}
   459  	}()
   460  
   461  	i := 0
   462  	var next *epollInterest
   463  	for epi := ready.Front(); epi != nil; epi = next {
   464  		next = epi.Next()
   465  		// Regardless of what else happens, epi is initially removed from the
   466  		// ready list.
   467  		ready.Remove(epi)
   468  		wmask := waiter.EventMaskFromLinux(epi.mask)
   469  		ievents := epi.key.file.Readiness(wmask) & wmask
   470  		if ievents == 0 {
   471  			// Leave epi off the ready list.
   472  			notReady.PushBack(epi)
   473  			continue
   474  		}
   475  		// Determine what we should do with epi.
   476  		switch {
   477  		case epi.mask&linux.EPOLLONESHOT != 0:
   478  			// Clear all events from the mask; they must be re-added by
   479  			// EPOLL_CTL_MOD.
   480  			epi.mask &= linux.EP_PRIVATE_BITS
   481  			fallthrough
   482  		case epi.mask&linux.EPOLLET != 0:
   483  			// Leave epi off the ready list.
   484  			notReady.PushBack(epi)
   485  		default:
   486  			// Queue epi to be moved to the end of the ready list.
   487  			requeue.PushBack(epi)
   488  		}
   489  		// Report ievents.
   490  		events = append(events, linux.EpollEvent{
   491  			Events: ievents.ToLinux(),
   492  			Data:   epi.userData,
   493  		})
   494  		i++
   495  		if i == maxEvents {
   496  			break
   497  		}
   498  	}
   499  	return events
   500  }