github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/vfs/epoll.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs
    16  
    17  import (
    18  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    19  	"github.com/SagerNet/gvisor/pkg/context"
    20  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    21  	"github.com/SagerNet/gvisor/pkg/sync"
    22  	"github.com/SagerNet/gvisor/pkg/syserror"
    23  	"github.com/SagerNet/gvisor/pkg/waiter"
    24  )
    25  
// epollCycleMu serializes attempts to register EpollInstances with other
// EpollInstances in order to check for cycles. AddInterest acquires it, and
// keeps it locked until it returns, whenever the file being registered is
// itself an EpollInstance.
var epollCycleMu sync.Mutex
    29  
// EpollInstance represents an epoll instance, as described by epoll(7).
//
// Lock ordering, as used in this file: interestMu is acquired before mu
// whenever both are held.
//
// +stateify savable
type EpollInstance struct {
	// vfsfd is the FileDescription for this instance. NewEpollInstanceFD
	// initializes it with the EpollInstance as its implementation and returns
	// a pointer to it.
	vfsfd FileDescription
	FileDescriptionDefaultImpl
	DentryMetadataFileDescriptionImpl
	NoLockFD

	// q holds waiters on this EpollInstance. It is notified with
	// waiter.ReadableEvents when a registration is newly linked into ready.
	q waiter.Queue

	// interest is the set of file descriptors that are registered with the
	// EpollInstance for monitoring. interest is protected by interestMu.
	// Release sets interest to nil after unregistering every entry.
	interestMu sync.Mutex `state:"nosave"`
	interest   map[epollInterestKey]*epollInterest

	// mu protects fields in registered epollInterests. The methods in this
	// file also hold mu whenever they read or modify ready.
	mu sync.Mutex `state:"nosave"`

	// ready is the set of file descriptors that may be "ready" for I/O. Note
	// that this must be an ordered list, not a map: "If more than maxevents
	// file descriptors are ready when epoll_wait() is called, then successive
	// epoll_wait() calls will round robin through the set of ready file
	// descriptors. This behavior helps avoid starvation scenarios, where a
	// process fails to notice that additional file descriptors are ready
	// because it focuses on a set of file descriptors that are already known
	// to be ready." - epoll_wait(2)
	ready epollInterestList
}
    60  
// epollInterestKey identifies one registration within an EpollInstance by the
// pair (file, num). It is the key type of EpollInstance.interest.
//
// +stateify savable
type epollInterestKey struct {
	// file is the registered FileDescription. No reference is held on file;
	// instead, when the last reference is dropped, FileDescription.DecRef()
	// removes the FileDescription from all EpollInstances. file is immutable.
	file *FileDescription

	// num is the file descriptor number with which this entry was registered.
	// num is immutable.
	num int32
}
    72  
// epollInterest represents an EpollInstance's interest in a file descriptor.
// It also serves as the waiter callback for its own waiter entry (see
// Callback), which is how readiness of key.file links it into epoll.ready.
//
// +stateify savable
type epollInterest struct {
	// epoll is the owning EpollInstance. epoll is immutable.
	epoll *EpollInstance `state:"wait"`

	// key is the file to which this epollInterest applies. key is immutable.
	key epollInterestKey

	// waiter is registered with key.file. In this file, waiter is registered
	// and unregistered only while epoll.interestMu is held.
	// NOTE(review): this comment previously read "entry is protected by
	// epoll.mu"; no visible code touches waiter under epoll.mu — confirm the
	// intended lock.
	waiter waiter.Entry

	// mask is the event mask associated with this registration, including
	// flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu.
	mask uint32

	// ready is true if epollInterestEntry is linked into epoll.ready. ready
	// and epollInterestEntry are protected by epoll.mu.
	ready bool
	epollInterestEntry

	// userData is the struct epoll_event::data associated with this
	// epollInterest. userData is protected by epoll.mu.
	userData [2]int32
}
    99  
   100  // NewEpollInstanceFD returns a FileDescription representing a new epoll
   101  // instance. A reference is taken on the returned FileDescription.
   102  func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) {
   103  	vd := vfs.NewAnonVirtualDentry("[eventpoll]")
   104  	defer vd.DecRef(ctx)
   105  	ep := &EpollInstance{
   106  		interest: make(map[epollInterestKey]*epollInterest),
   107  	}
   108  	if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
   109  		DenyPRead:         true,
   110  		DenyPWrite:        true,
   111  		UseDentryMetadata: true,
   112  	}); err != nil {
   113  		return nil, err
   114  	}
   115  	return &ep.vfsfd, nil
   116  }
   117  
   118  // Release implements FileDescriptionImpl.Release.
   119  func (ep *EpollInstance) Release(ctx context.Context) {
   120  	// Unregister all polled fds.
   121  	ep.interestMu.Lock()
   122  	defer ep.interestMu.Unlock()
   123  	for key, epi := range ep.interest {
   124  		file := key.file
   125  		file.epollMu.Lock()
   126  		delete(file.epolls, epi)
   127  		file.epollMu.Unlock()
   128  		file.EventUnregister(&epi.waiter)
   129  	}
   130  	ep.interest = nil
   131  }
   132  
   133  // Readiness implements waiter.Waitable.Readiness.
   134  func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask {
   135  	if mask&waiter.ReadableEvents == 0 {
   136  		return 0
   137  	}
   138  	ep.mu.Lock()
   139  	for epi := ep.ready.Front(); epi != nil; epi = epi.Next() {
   140  		wmask := waiter.EventMaskFromLinux(epi.mask)
   141  		if epi.key.file.Readiness(wmask)&wmask != 0 {
   142  			ep.mu.Unlock()
   143  			return waiter.ReadableEvents
   144  		}
   145  	}
   146  	ep.mu.Unlock()
   147  	return 0
   148  }
   149  
// EventRegister implements waiter.Waitable.EventRegister.
//
// The entry is added to ep.q, the queue of waiters on this EpollInstance.
func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	ep.q.EventRegister(e, mask)
}
   154  
// EventUnregister implements waiter.Waitable.EventUnregister.
//
// The entry is removed from ep.q, the queue of waiters on this EpollInstance.
func (ep *EpollInstance) EventUnregister(e *waiter.Entry) {
	ep.q.EventUnregister(e)
}
   159  
// Seek implements FileDescriptionImpl.Seek.
//
// All arguments are ignored; the call always succeeds and reports offset 0.
func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	// Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek
	return 0, nil
}
   165  
   166  // AddInterest implements the semantics of EPOLL_CTL_ADD.
   167  //
   168  // Preconditions: A reference must be held on file.
   169  func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
   170  	// Check for cyclic polling if necessary.
   171  	subep, _ := file.impl.(*EpollInstance)
   172  	if subep != nil {
   173  		epollCycleMu.Lock()
   174  		// epollCycleMu must be locked for the rest of AddInterest to ensure
   175  		// that cyclic polling is not introduced after the check.
   176  		defer epollCycleMu.Unlock()
   177  		if subep.mightPoll(ep) {
   178  			return linuxerr.ELOOP
   179  		}
   180  	}
   181  
   182  	ep.interestMu.Lock()
   183  	defer ep.interestMu.Unlock()
   184  
   185  	// Fail if the key is already registered.
   186  	key := epollInterestKey{
   187  		file: file,
   188  		num:  num,
   189  	}
   190  	if _, ok := ep.interest[key]; ok {
   191  		return syserror.EEXIST
   192  	}
   193  
   194  	// Register interest in file.
   195  	mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
   196  	epi := &epollInterest{
   197  		epoll:    ep,
   198  		key:      key,
   199  		mask:     mask,
   200  		userData: event.Data,
   201  	}
   202  	epi.waiter.Callback = epi
   203  	ep.interest[key] = epi
   204  	wmask := waiter.EventMaskFromLinux(mask)
   205  	file.EventRegister(&epi.waiter, wmask)
   206  
   207  	// Check if the file is already ready.
   208  	if m := file.Readiness(wmask) & wmask; m != 0 {
   209  		epi.Callback(nil, m)
   210  	}
   211  
   212  	// Add epi to file.epolls so that it is removed when the last
   213  	// FileDescription reference is dropped.
   214  	file.epollMu.Lock()
   215  	if file.epolls == nil {
   216  		file.epolls = make(map[*epollInterest]struct{})
   217  	}
   218  	file.epolls[epi] = struct{}{}
   219  	file.epollMu.Unlock()
   220  
   221  	return nil
   222  }
   223  
// mightPoll reports whether ep might poll ep2 through its registered
// EpollInstances. It delegates to mightPollRecursive with a recursion budget
// of 4.
func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool {
	return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS
}
   227  
   228  func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool {
   229  	ep.interestMu.Lock()
   230  	defer ep.interestMu.Unlock()
   231  	for key := range ep.interest {
   232  		nextep, ok := key.file.impl.(*EpollInstance)
   233  		if !ok {
   234  			continue
   235  		}
   236  		if nextep == ep2 {
   237  			return true
   238  		}
   239  		if remainingRecursion == 0 {
   240  			return true
   241  		}
   242  		if nextep.mightPollRecursive(ep2, remainingRecursion-1) {
   243  			return true
   244  		}
   245  	}
   246  	return false
   247  }
   248  
   249  // ModifyInterest implements the semantics of EPOLL_CTL_MOD.
   250  //
   251  // Preconditions: A reference must be held on file.
   252  func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
   253  	ep.interestMu.Lock()
   254  	defer ep.interestMu.Unlock()
   255  
   256  	// Fail if the key is not already registered.
   257  	epi, ok := ep.interest[epollInterestKey{
   258  		file: file,
   259  		num:  num,
   260  	}]
   261  	if !ok {
   262  		return syserror.ENOENT
   263  	}
   264  
   265  	// Update epi for the next call to ep.ReadEvents().
   266  	mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
   267  	ep.mu.Lock()
   268  	epi.mask = mask
   269  	epi.userData = event.Data
   270  	ep.mu.Unlock()
   271  
   272  	// Re-register with the new mask.
   273  	file.EventUnregister(&epi.waiter)
   274  	wmask := waiter.EventMaskFromLinux(mask)
   275  	file.EventRegister(&epi.waiter, wmask)
   276  
   277  	// Check if the file is already ready with the new mask.
   278  	if m := file.Readiness(wmask) & wmask; m != 0 {
   279  		epi.Callback(nil, m)
   280  	}
   281  
   282  	return nil
   283  }
   284  
   285  // DeleteInterest implements the semantics of EPOLL_CTL_DEL.
   286  //
   287  // Preconditions: A reference must be held on file.
   288  func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error {
   289  	ep.interestMu.Lock()
   290  	defer ep.interestMu.Unlock()
   291  
   292  	// Fail if the key is not already registered.
   293  	epi, ok := ep.interest[epollInterestKey{
   294  		file: file,
   295  		num:  num,
   296  	}]
   297  	if !ok {
   298  		return syserror.ENOENT
   299  	}
   300  
   301  	// Unregister from the file so that epi will no longer be readied.
   302  	file.EventUnregister(&epi.waiter)
   303  
   304  	// Forget about epi.
   305  	ep.removeLocked(epi)
   306  
   307  	file.epollMu.Lock()
   308  	delete(file.epolls, epi)
   309  	file.epollMu.Unlock()
   310  
   311  	return nil
   312  }
   313  
   314  // Callback implements waiter.EntryCallback.Callback.
   315  func (epi *epollInterest) Callback(*waiter.Entry, waiter.EventMask) {
   316  	newReady := false
   317  	epi.epoll.mu.Lock()
   318  	if !epi.ready {
   319  		newReady = true
   320  		epi.ready = true
   321  		epi.epoll.ready.PushBack(epi)
   322  	}
   323  	epi.epoll.mu.Unlock()
   324  	if newReady {
   325  		epi.epoll.q.Notify(waiter.ReadableEvents)
   326  	}
   327  }
   328  
   329  // Preconditions: ep.interestMu must be locked.
   330  func (ep *EpollInstance) removeLocked(epi *epollInterest) {
   331  	delete(ep.interest, epi.key)
   332  	ep.mu.Lock()
   333  	if epi.ready {
   334  		epi.ready = false
   335  		ep.ready.Remove(epi)
   336  	}
   337  	ep.mu.Unlock()
   338  }
   339  
   340  // ReadEvents appends up to maxReady events to events and returns the updated
   341  // slice of events.
   342  func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent {
   343  	i := 0
   344  	// Hot path: avoid defer.
   345  	ep.mu.Lock()
   346  	var next *epollInterest
   347  	var requeue epollInterestList
   348  	for epi := ep.ready.Front(); epi != nil; epi = next {
   349  		next = epi.Next()
   350  		// Regardless of what else happens, epi is initially removed from the
   351  		// ready list.
   352  		ep.ready.Remove(epi)
   353  		wmask := waiter.EventMaskFromLinux(epi.mask)
   354  		ievents := epi.key.file.Readiness(wmask) & wmask
   355  		if ievents == 0 {
   356  			// Leave epi off the ready list.
   357  			epi.ready = false
   358  			continue
   359  		}
   360  		// Determine what we should do with epi.
   361  		switch {
   362  		case epi.mask&linux.EPOLLONESHOT != 0:
   363  			// Clear all events from the mask; they must be re-added by
   364  			// EPOLL_CTL_MOD.
   365  			epi.mask &= linux.EP_PRIVATE_BITS
   366  			fallthrough
   367  		case epi.mask&linux.EPOLLET != 0:
   368  			// Leave epi off the ready list.
   369  			epi.ready = false
   370  		default:
   371  			// Queue epi to be moved to the end of the ready list.
   372  			requeue.PushBack(epi)
   373  		}
   374  		// Report ievents.
   375  		events = append(events, linux.EpollEvent{
   376  			Events: ievents.ToLinux(),
   377  			Data:   epi.userData,
   378  		})
   379  		i++
   380  		if i == maxEvents {
   381  			break
   382  		}
   383  	}
   384  	ep.ready.PushBackList(&requeue)
   385  	ep.mu.Unlock()
   386  	return events
   387  }