github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/epoll/epoll.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package epoll provides an implementation of Linux's IO event notification
    16  // facility. See epoll(7) for more details.
    17  //
    18  // Lock order:
    19  // EventPoll.mu
    20  //   fdnotifier.notifier.mu
    21  //     EventPoll.listsMu
    22  //       unix.baseEndpoint.Mutex
    23  package epoll
    24  
    25  import (
    26  	"fmt"
    27  
    28  	"golang.org/x/sys/unix"
    29  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    30  	"github.com/SagerNet/gvisor/pkg/context"
    31  	"github.com/SagerNet/gvisor/pkg/refs"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/fs/anon"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil"
    35  	"github.com/SagerNet/gvisor/pkg/sync"
    36  	"github.com/SagerNet/gvisor/pkg/usermem"
    37  	"github.com/SagerNet/gvisor/pkg/waiter"
    38  )
    39  
    40  // EntryFlags is a bitmask that holds an entry's flags.
    41  type EntryFlags int
    42  
    43  // Valid entry flags.
    44  const (
    45  	OneShot EntryFlags = 1 << iota
    46  	EdgeTriggered
    47  )
    48  
// FileIdentifier identifies a file. We cannot use just the FD because it could
// potentially be reassigned. We also cannot use just the file pointer because
// it is possible to have multiple entries for the same file object as long as
// they are created with different FDs (i.e., the FDs point to the same file).
//
// It is used as the key of EventPoll.files.
//
// +stateify savable
type FileIdentifier struct {
	// File is the observed file object.
	File *fs.File `state:"wait"`
	// Fd is the file descriptor number that, together with File,
	// distinguishes one entry from another.
	Fd   int32
}
    59  
// pollEntry holds all the state associated with an event poll entry, that is,
// a file being observed by an event poll object.
//
// +stateify savable
type pollEntry struct {
	// pollEntryEntry is embedded so that the entry can be linked into one
	// of the owning EventPoll's pollEntryList lists (presumably generated
	// list linkage; its definition is outside this file).
	pollEntryEntry
	// file is the weak reference to the observed file. It is created in
	// AddEntry and dropped when the entry is removed or the event poll is
	// released.
	file     *refs.WeakRef  `state:"manual"`
	// id is the (file, FD) pair this entry is registered under.
	id       FileIdentifier `state:"wait"`
	// userData is the caller-supplied data returned in EpollEvent.Data.
	userData [2]int32
	// waiter is the wait queue entry registered with the observed file;
	// its Callback is set to this pollEntry.
	waiter   waiter.Entry `state:"manual"`
	// mask is the set of events the entry is interested in.
	mask     waiter.EventMask
	// flags holds the OneShot / EdgeTriggered options for the entry.
	flags    EntryFlags

	// epoll is the event poll object that owns this entry.
	epoll *EventPoll

	// We cannot save the current list pointer as it points into EventPoll
	// struct, while state framework currently does not support such
	// in-struct pointers. Instead, EventPoll will properly set this field
	// in its loading logic.
	curList *pollEntryList `state:"nosave"`
}
    81  
    82  // WeakRefGone implements refs.WeakRefUser.WeakRefGone.
    83  // weakReferenceGone is called when the file in the weak reference is destroyed.
    84  // The poll entry is removed in response to this.
    85  func (p *pollEntry) WeakRefGone(ctx context.Context) {
    86  	p.epoll.RemoveEntry(ctx, p.id)
    87  }
    88  
// EventPoll holds all the state associated with an event poll object, that is,
// collection of files to observe and their current state.
//
// +stateify savable
type EventPoll struct {
	// Embedded fsutil types; presumably these supply the fs.FileOperations
	// methods that are not implemented explicitly in this file.
	fsutil.FilePipeSeek             `state:"zerovalue"`
	fsutil.FileNotDirReaddir        `state:"zerovalue"`
	fsutil.FileNoFsync              `state:"zerovalue"`
	fsutil.FileNoopFlush            `state:"zerovalue"`
	fsutil.FileNoIoctl              `state:"zerovalue"`
	fsutil.FileNoMMap               `state:"zerovalue"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`

	// Wait queue is used to notify interested parties when the event poll
	// object itself becomes readable or writable.
	waiter.Queue `state:"zerovalue"`

	// files is the map of all the files currently being observed, it is
	// protected by mu.
	mu    sync.Mutex `state:"nosave"`
	files map[FileIdentifier]*pollEntry

	// listsMu protects manipulation of the lists below. It needs to be a
	// different lock to avoid circular lock acquisition order involving
	// the wait queue mutexes and mu. The full order is mu, observed file
	// wait queue mutex, then listsMu; this allows listsMu to be acquired
	// when (*pollEntry).Callback is called.
	//
	// An entry is always in one of the following lists:
	//	readyList -- when there's a chance that it's ready to have
	//		events delivered to epoll waiters. Given that being
	//		ready is a transient state, the Readiness() and
	//		ReadEvents() functions always call the entry's file
	//		Readiness() function to confirm it's ready.
	//	waitingList -- when there's no chance that the entry is ready,
	//		so it's waiting for the (*pollEntry).Callback to be called
	//		on it before it gets moved to the readyList.
	//	disabledList -- when the entry is disabled. This happens when
	//		a one-shot entry gets delivered via ReadEvents().
	listsMu      sync.Mutex `state:"nosave"`
	readyList    pollEntryList
	waitingList  pollEntryList
	disabledList pollEntryList
}
   134  
// cycleMu is used to serialize all the cycle checks. This is only used when
// an event poll file is added as an entry to another event poll. Such checks
// are serialized to avoid lock acquisition order inversion: if a thread is
// adding A to B, and another thread is adding B to A, each would acquire A's
// and B's mutexes in reverse order, and could cause deadlocks. Having this
// lock prevents this by allowing only one check at a time to happen.
//
// We do the cycle check to prevent callers from introducing potentially
// infinite recursions. If a caller were to add A to B and then B to A, for
// event poll A to know if it's readable, it would need to check event poll B,
// which in turn would need event poll A and so on indefinitely.
//
// AddEntry acquires cycleMu before taking the event poll's own mu, and only
// when the file being added is itself an *EventPoll.
var cycleMu sync.Mutex
   147  
   148  // NewEventPoll allocates and initializes a new event poll object.
   149  func NewEventPoll(ctx context.Context) *fs.File {
   150  	// name matches fs/eventpoll.c:epoll_create1.
   151  	dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
   152  	// Release the initial dirent reference after NewFile takes a reference.
   153  	defer dirent.DecRef(ctx)
   154  	return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
   155  		files: make(map[FileIdentifier]*pollEntry),
   156  	})
   157  }
   158  
   159  // Release implements fs.FileOperations.Release.
   160  func (e *EventPoll) Release(ctx context.Context) {
   161  	// We need to take the lock now because files may be attempting to
   162  	// remove entries in parallel if they get destroyed.
   163  	e.mu.Lock()
   164  	defer e.mu.Unlock()
   165  
   166  	// Go through all entries and clean up.
   167  	for _, entry := range e.files {
   168  		entry.id.File.EventUnregister(&entry.waiter)
   169  		entry.file.Drop(ctx)
   170  	}
   171  	e.files = nil
   172  }
   173  
// Read implements fs.FileOperations.Read.
//
// Reading from an event poll file is not supported: it always reports zero
// bytes read and ENOSYS.
func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
	return 0, unix.ENOSYS
}
   178  
// Write implements fs.FileOperations.Write.
//
// Writing to an event poll file is not supported: it always reports zero
// bytes written and ENOSYS.
func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
	return 0, unix.ENOSYS
}
   183  
   184  // eventsAvailable determines if 'e' has events available for delivery.
   185  func (e *EventPoll) eventsAvailable() bool {
   186  	e.listsMu.Lock()
   187  
   188  	for it := e.readyList.Front(); it != nil; {
   189  		entry := it
   190  		it = it.Next()
   191  
   192  		// If the entry is ready, we know 'e' has at least one entry
   193  		// ready for delivery.
   194  		ready := entry.id.File.Readiness(entry.mask)
   195  		if ready != 0 {
   196  			e.listsMu.Unlock()
   197  			return true
   198  		}
   199  
   200  		// Entry is not ready, so move it to waiting list.
   201  		e.readyList.Remove(entry)
   202  		e.waitingList.PushBack(entry)
   203  		entry.curList = &e.waitingList
   204  	}
   205  
   206  	e.listsMu.Unlock()
   207  
   208  	return false
   209  }
   210  
   211  // Readiness determines if the event poll object is currently readable (i.e.,
   212  // if there are pending events for delivery).
   213  func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask {
   214  	ready := waiter.EventMask(0)
   215  
   216  	if (mask&waiter.ReadableEvents) != 0 && e.eventsAvailable() {
   217  		ready |= waiter.ReadableEvents
   218  	}
   219  
   220  	return ready
   221  }
   222  
   223  // ReadEvents returns up to max available events.
   224  func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
   225  	var local pollEntryList
   226  	var ret []linux.EpollEvent
   227  
   228  	e.listsMu.Lock()
   229  
   230  	// Go through all entries we believe may be ready.
   231  	for it := e.readyList.Front(); it != nil && len(ret) < max; {
   232  		entry := it
   233  		it = it.Next()
   234  
   235  		// Check the entry's readiness. It it's not really ready, we
   236  		// just put it back in the waiting list and move on to the next
   237  		// entry.
   238  		ready := entry.id.File.Readiness(entry.mask) & entry.mask
   239  		if ready == 0 {
   240  			e.readyList.Remove(entry)
   241  			e.waitingList.PushBack(entry)
   242  			entry.curList = &e.waitingList
   243  
   244  			continue
   245  		}
   246  
   247  		// Add event to the array that will be returned to caller.
   248  		ret = append(ret, linux.EpollEvent{
   249  			Events: uint32(ready),
   250  			Data:   entry.userData,
   251  		})
   252  
   253  		// The entry is consumed, so we must move it to the disabled
   254  		// list in case it's one-shot, or back to the wait list if it's
   255  		// edge-triggered. If it's neither, we leave it in the ready
   256  		// list so that its readiness can be checked the next time
   257  		// around; however, we must move it to the end of the list so
   258  		// that other events can be delivered as well.
   259  		e.readyList.Remove(entry)
   260  		if entry.flags&OneShot != 0 {
   261  			e.disabledList.PushBack(entry)
   262  			entry.curList = &e.disabledList
   263  		} else if entry.flags&EdgeTriggered != 0 {
   264  			e.waitingList.PushBack(entry)
   265  			entry.curList = &e.waitingList
   266  		} else {
   267  			local.PushBack(entry)
   268  		}
   269  	}
   270  
   271  	e.readyList.PushBackList(&local)
   272  
   273  	e.listsMu.Unlock()
   274  
   275  	return ret
   276  }
   277  
   278  // Callback implements waiter.EntryCallback.Callback.
   279  //
   280  // Callback is called when one of the files we're polling becomes ready. It
   281  // moves said file to the readyList if it's currently in the waiting list.
   282  func (p *pollEntry) Callback(*waiter.Entry, waiter.EventMask) {
   283  	e := p.epoll
   284  
   285  	e.listsMu.Lock()
   286  
   287  	if p.curList == &e.waitingList {
   288  		e.waitingList.Remove(p)
   289  		e.readyList.PushBack(p)
   290  		p.curList = &e.readyList
   291  		e.listsMu.Unlock()
   292  
   293  		e.Notify(waiter.ReadableEvents)
   294  		return
   295  	}
   296  
   297  	e.listsMu.Unlock()
   298  }
   299  
   300  // initEntryReadiness initializes the entry's state with regards to its
   301  // readiness by placing it in the appropriate list and registering for
   302  // notifications.
   303  func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
   304  	// A new entry starts off in the waiting list.
   305  	e.listsMu.Lock()
   306  	e.waitingList.PushBack(entry)
   307  	entry.curList = &e.waitingList
   308  	e.listsMu.Unlock()
   309  
   310  	// Register for event notifications.
   311  	f := entry.id.File
   312  	f.EventRegister(&entry.waiter, entry.mask)
   313  
   314  	// Check if the file happens to already be in a ready state.
   315  	if ready := f.Readiness(entry.mask) & entry.mask; ready != 0 {
   316  		entry.Callback(&entry.waiter, ready)
   317  	}
   318  }
   319  
   320  // observes checks if event poll object e is directly or indirectly observing
   321  // event poll object ep. It uses a bounded recursive depth-first search.
   322  func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool {
   323  	// If we reached the maximum depth, we'll consider that we found it
   324  	// because we don't want to allow chains that are too long.
   325  	if depthLeft <= 0 {
   326  		return true
   327  	}
   328  
   329  	e.mu.Lock()
   330  	defer e.mu.Unlock()
   331  
   332  	// Go through each observed file and check if it is or observes ep.
   333  	for id := range e.files {
   334  		f, ok := id.File.FileOperations.(*EventPoll)
   335  		if !ok {
   336  			continue
   337  		}
   338  
   339  		if f == ep || f.observes(ep, depthLeft-1) {
   340  			return true
   341  		}
   342  	}
   343  
   344  	return false
   345  }
   346  
   347  // AddEntry adds a new file to the collection of files observed by e.
   348  func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
   349  	// Acquire cycle check lock if another event poll is being added.
   350  	ep, ok := id.File.FileOperations.(*EventPoll)
   351  	if ok {
   352  		cycleMu.Lock()
   353  		defer cycleMu.Unlock()
   354  	}
   355  
   356  	e.mu.Lock()
   357  	defer e.mu.Unlock()
   358  
   359  	// Fail if the file already has an entry.
   360  	if _, ok := e.files[id]; ok {
   361  		return unix.EEXIST
   362  	}
   363  
   364  	// Check if a cycle would be created. We use 4 as the limit because
   365  	// that's the value used by linux and we want to emulate it.
   366  	if ep != nil {
   367  		if e == ep {
   368  			return unix.EINVAL
   369  		}
   370  
   371  		if ep.observes(e, 4) {
   372  			return unix.ELOOP
   373  		}
   374  	}
   375  
   376  	// Create new entry and add it to map.
   377  	//
   378  	// N.B. Even though we are creating a weak reference here, we know it
   379  	//      won't trigger a callback because we hold a reference to the file
   380  	//      throughout the execution of this function.
   381  	entry := &pollEntry{
   382  		id:       id,
   383  		userData: data,
   384  		epoll:    e,
   385  		flags:    flags,
   386  		mask:     mask,
   387  	}
   388  	entry.waiter.Callback = entry
   389  	e.files[id] = entry
   390  	entry.file = refs.NewWeakRef(id.File, entry)
   391  
   392  	// Initialize the readiness state of the new entry.
   393  	e.initEntryReadiness(entry)
   394  
   395  	return nil
   396  }
   397  
   398  // UpdateEntry updates the flags, mask and user data associated with a file that
   399  // is already part of the collection of observed files.
   400  func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
   401  	e.mu.Lock()
   402  	defer e.mu.Unlock()
   403  
   404  	// Fail if the file doesn't have an entry.
   405  	entry, ok := e.files[id]
   406  	if !ok {
   407  		return unix.ENOENT
   408  	}
   409  
   410  	// Unregister the old mask and remove entry from the list it's in, so
   411  	// (*pollEntry).Callback is guaranteed to not be called on this entry anymore.
   412  	entry.id.File.EventUnregister(&entry.waiter)
   413  
   414  	// Remove entry from whatever list it's in. This ensure that no other
   415  	// threads have access to this entry as the only way left to find it
   416  	// is via e.files, but we hold e.mu, which prevents that.
   417  	e.listsMu.Lock()
   418  	entry.curList.Remove(entry)
   419  	e.listsMu.Unlock()
   420  
   421  	// Initialize new readiness state.
   422  	entry.flags = flags
   423  	entry.mask = mask
   424  	entry.userData = data
   425  	e.initEntryReadiness(entry)
   426  
   427  	return nil
   428  }
   429  
   430  // RemoveEntry a files from the collection of observed files.
   431  func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error {
   432  	e.mu.Lock()
   433  	defer e.mu.Unlock()
   434  
   435  	// Fail if the file doesn't have an entry.
   436  	entry, ok := e.files[id]
   437  	if !ok {
   438  		return unix.ENOENT
   439  	}
   440  
   441  	// Unregister from file first so that no concurrent attempts will be
   442  	// made to manipulate the file.
   443  	entry.id.File.EventUnregister(&entry.waiter)
   444  
   445  	// Remove from the current list.
   446  	e.listsMu.Lock()
   447  	entry.curList.Remove(entry)
   448  	entry.curList = nil
   449  	e.listsMu.Unlock()
   450  
   451  	// Remove file from map, and drop weak reference.
   452  	delete(e.files, id)
   453  	entry.file.Drop(ctx)
   454  
   455  	return nil
   456  }
   457  
   458  // UnregisterEpollWaiters removes the epoll waiter objects from the waiting
   459  // queues. This is different from Release() as the file is not dereferenced.
   460  func (e *EventPoll) UnregisterEpollWaiters() {
   461  	e.mu.Lock()
   462  	defer e.mu.Unlock()
   463  
   464  	for _, entry := range e.files {
   465  		entry.id.File.EventUnregister(&entry.waiter)
   466  	}
   467  }