github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/aio_context.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mm
    16  
    17  import (
    18  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    19  	"github.com/SagerNet/gvisor/pkg/context"
    20  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    21  	"github.com/SagerNet/gvisor/pkg/hostarch"
    22  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    23  	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    25  	"github.com/SagerNet/gvisor/pkg/sync"
    26  	"github.com/SagerNet/gvisor/pkg/syserror"
    27  	"github.com/SagerNet/gvisor/pkg/usermem"
    28  )
    29  
    30  // aioManager creates and manages asynchronous I/O contexts.
    31  //
    32  // +stateify savable
    33  type aioManager struct {
    34  	// mu protects below.
    35  	mu sync.Mutex `state:"nosave"`
    36  
    37  	// aioContexts is the set of asynchronous I/O contexts.
    38  	contexts map[uint64]*AIOContext
    39  }
    40  
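        // destroyAIOManager destroys every remaining AIO context belonging to mm,
        // unmapping each context's ring buffer.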
    41  func (mm *MemoryManager) destroyAIOManager(ctx context.Context) {
    42  	mm.aioManager.mu.Lock()
    43  	defer mm.aioManager.mu.Unlock()
    44  
    45  	for id := range mm.aioManager.contexts {
    46  		mm.destroyAIOContextLocked(ctx, id)
    47  	}
    48  }
    49  
    50  // newAIOContext creates a new context for asynchronous I/O.
    51  //
    52  // Returns false if 'id' is currently in use.
    53  func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
    54  	a.mu.Lock()
    55  	defer a.mu.Unlock()
    56  
    57  	if _, ok := a.contexts[id]; ok {
    58  		return false
    59  	}
    60  
    61  	a.contexts[id] = &AIOContext{
    62  		requestReady:   make(chan struct{}, 1),
    63  		maxOutstanding: events,
    64  	}
    65  	return true
    66  }
    67  
    68  // destroyAIOContext destroys an asynchronous I/O context. It doesn't wait
    69  // for pending requests to complete. Returns the destroyed AIOContext so it
    70  // can be drained.
    71  //
    72  // Nil is returned if the context does not exist.
    73  //
    74  // Precondition: mm.aioManager.mu is locked.
    75  func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) *AIOContext {
    76  	aioCtx, ok := mm.aioManager.contexts[id]
    77  	if !ok {
    78  		return nil
    79  	}
    80  
    81  	// Only unmap after it is assured that the address is a valid AIO
    82  	// context, to prevent random memory from being unmapped.
    83  	//
    84  	// Note: It's possible to unmap this address and map something else into
    85  	// the same address. Then it would be unmapping memory that it doesn't own.
    86  	// This is, however, the way Linux implements AIO. Keep the same [weird]
    87  	// semantics in case anyone relies on it.
    88  	mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize)
    89  
    90  	delete(mm.aioManager.contexts, id)
    91  	aioCtx.destroy()
    92  	return aioCtx
    93  }
    94  
    95  // lookupAIOContext looks up the given context.
    96  //
    97  // Returns false if the context does not exist.
    98  func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) {
    99  	a.mu.Lock()
   100  	defer a.mu.Unlock()
   101  	ctx, ok := a.contexts[id]
   102  	return ctx, ok
   103  }
   104  
   105  // ioResult is a completed I/O operation.
   106  //
   107  // +stateify savable
   108  type ioResult struct {
   109  	data interface{}
   110  	ioEntry
   111  }
   112  
   113  // AIOContext is a single asynchronous I/O context.
   114  //
   115  // +stateify savable
   116  type AIOContext struct {
   117  	// requestReady is the notification channel used for all requests.
   118  	requestReady chan struct{} `state:"nosave"`
   119  
   120  	// mu protects below.
   121  	mu sync.Mutex `state:"nosave"`
   122  
   123  	// results is the set of completed requests.
   124  	results ioList
   125  
   126  	// maxOutstanding is the maximum number of outstanding entries; this value
   127  	// is immutable.
   128  	maxOutstanding uint32
   129  
   130  	// outstanding is the number of requests outstanding; this is effectively
   131  	// the number of entries that are in the result list or are expected to
   132  	// be added to it.
   133  	outstanding uint32
   134  
   135  	// dead is set when the context is destroyed.
   136  	dead bool `state:"zerovalue"`
   137  }
   138  
   139  // destroy marks the context dead.
   140  func (ctx *AIOContext) destroy() {
   141  	ctx.mu.Lock()
   142  	defer ctx.mu.Unlock()
   143  	ctx.dead = true
   144  	ctx.checkForDone()
   145  }
   146  
   147  // Preconditions: ctx.mu must be held by caller.
   148  func (ctx *AIOContext) checkForDone() {
   149  	if ctx.dead && ctx.outstanding == 0 {
   150  		close(ctx.requestReady)
   151  		ctx.requestReady = nil
   152  	}
   153  }
   154  
   155  // Prepare reserves space for a new request, returning nil if space is
   156  // available. Returns EAGAIN if the context is busy and EINVAL if it is dead.
   157  func (ctx *AIOContext) Prepare() error {
   158  	ctx.mu.Lock()
   159  	defer ctx.mu.Unlock()
   160  	if ctx.dead {
   161  		// Context died after the caller looked it up.
   162  		return linuxerr.EINVAL
   163  	}
   164  	if ctx.outstanding >= ctx.maxOutstanding {
   165  		// Context is busy.
   166  		return linuxerr.EAGAIN
   167  	}
   168  	ctx.outstanding++
   169  	return nil
   170  }
   171  
   172  // PopRequest pops a completed request if one is available; this function
   173  // does not block. Returns false if no request is available.
   174  func (ctx *AIOContext) PopRequest() (interface{}, bool) {
   175  	ctx.mu.Lock()
   176  	defer ctx.mu.Unlock()
   177  
   178  	// Is there anything ready?
   179  	if e := ctx.results.Front(); e != nil {
   180  		if ctx.outstanding == 0 {
   181  			panic("AIOContext outstanding is going negative")
   182  		}
   183  		ctx.outstanding--
   184  		ctx.results.Remove(e)
   185  		ctx.checkForDone()
   186  		return e.data, true
   187  	}
   188  	return nil, false
   189  }
   190  
   191  // FinishRequest finishes a pending request. It queues up the data
   192  // and notifies listeners.
   193  func (ctx *AIOContext) FinishRequest(data interface{}) {
   194  	ctx.mu.Lock()
   195  	defer ctx.mu.Unlock()
   196  
   197  	// Push to the list and notify opportunistically. The channel notify
   198  	// here is guaranteed to be safe because outstanding must be non-zero.
   199  	// The requestReady channel is only closed when outstanding reaches zero.
   200  	ctx.results.PushBack(&ioResult{data: data})
   201  
   202  	select {
   203  	case ctx.requestReady <- struct{}{}:
   204  	default:
   205  	}
   206  }
   207  
   208  // WaitChannel returns a channel that is notified when an AIO request is
   209  // completed. Returns nil if the context is destroyed and there are no more
   210  // outstanding requests.
   211  func (ctx *AIOContext) WaitChannel() chan struct{} {
   212  	ctx.mu.Lock()
   213  	defer ctx.mu.Unlock()
   214  	return ctx.requestReady
   215  }
   216  
   217  // Dead returns true if the context has been destroyed.
   218  func (ctx *AIOContext) Dead() bool {
   219  	ctx.mu.Lock()
   220  	defer ctx.mu.Unlock()
   221  	return ctx.dead
   222  }
   223  
   224  // CancelPendingRequest forgets about a request that hasn't yet completed.
   225  func (ctx *AIOContext) CancelPendingRequest() {
   226  	ctx.mu.Lock()
   227  	defer ctx.mu.Unlock()
   228  
   229  	if ctx.outstanding == 0 {
   230  		panic("AIOContext outstanding is going negative")
   231  	}
   232  	ctx.outstanding--
   233  	ctx.checkForDone()
   234  }
   235  
   236  // Drain drops all completed requests. Pending requests remain untouched.
   237  func (ctx *AIOContext) Drain() {
   238  	ctx.mu.Lock()
   239  	defer ctx.mu.Unlock()
   240  
   241  	if ctx.outstanding == 0 {
   242  		return
   243  	}
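        	// Drop every queued completion at once and deduct the whole batch from
        	// outstanding; requests still in flight remain counted.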
   244  	size := uint32(ctx.results.Len())
   245  	if ctx.outstanding < size {
   246  		panic("AIOContext outstanding is going negative")
   247  	}
   248  	ctx.outstanding -= size
   249  	ctx.results.Reset()
   250  	ctx.checkForDone()
   251  }
   252  
   253  // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
   254  // ring buffers.
   255  //
   256  // +stateify savable
   257  type aioMappable struct {
   258  	aioMappableRefs
   259  
   260  	mfp pgalloc.MemoryFileProvider
   261  	fr  memmap.FileRange
   262  }
   263  
   264  var aioRingBufferSize = uint64(hostarch.Addr(linux.AIORingSize).MustRoundUp())
   265  
   266  func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
   267  	fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous)
   268  	if err != nil {
   269  		return nil, err
   270  	}
   271  	m := aioMappable{mfp: mfp, fr: fr}
   272  	m.InitRefs()
   273  	return &m, nil
   274  }
   275  
   276  // DecRef implements refs.RefCounter.DecRef.
   277  func (m *aioMappable) DecRef(ctx context.Context) {
   278  	m.aioMappableRefs.DecRef(func() {
   279  		m.mfp.MemoryFile().DecRef(m.fr)
   280  	})
   281  }
   282  
   283  // MappedName implements memmap.MappingIdentity.MappedName.
   284  func (m *aioMappable) MappedName(ctx context.Context) string {
   285  	return "[aio]"
   286  }
   287  
   288  // DeviceID implements memmap.MappingIdentity.DeviceID.
   289  func (m *aioMappable) DeviceID() uint64 {
   290  	return 0
   291  }
   292  
   293  // InodeID implements memmap.MappingIdentity.InodeID.
   294  func (m *aioMappable) InodeID() uint64 {
   295  	return 0
   296  }
   297  
   298  // Msync implements memmap.MappingIdentity.Msync.
   299  func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
   300  	// Linux: aio_ring_fops.fsync == NULL
   301  	return linuxerr.EINVAL
   302  }
   303  
   304  // AddMapping implements memmap.Mappable.AddMapping.
   305  func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, _ bool) error {
   306  	// Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
   307  	// sets VM_DONTEXPAND).
   308  	if offset != 0 || uint64(ar.Length()) != aioRingBufferSize {
   309  		return syserror.EFAULT
   310  	}
   311  	return nil
   312  }
   313  
   314  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   315  func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) {
   316  }
   317  
   318  // CopyMapping implements memmap.Mappable.CopyMapping.
   319  func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, _ bool) error {
   320  	// Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
   321  	// sets VM_DONTEXPAND).
   322  	if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize {
   323  		return syserror.EFAULT
   324  	}
   325  	// Require that the mapping correspond to a live AIOContext. Compare
   326  	// Linux's fs/aio.c:aio_ring_mremap().
   327  	mm, ok := ms.(*MemoryManager)
   328  	if !ok {
   329  		return linuxerr.EINVAL
   330  	}
   331  	am := &mm.aioManager
   332  	am.mu.Lock()
   333  	defer am.mu.Unlock()
   334  	oldID := uint64(srcAR.Start)
   335  	aioCtx, ok := am.contexts[oldID]
   336  	if !ok {
   337  		return linuxerr.EINVAL
   338  	}
   339  	aioCtx.mu.Lock()
   340  	defer aioCtx.mu.Unlock()
   341  	if aioCtx.dead {
   342  		return linuxerr.EINVAL
   343  	}
   344  	// Use the new ID for the AIOContext.
   345  	am.contexts[uint64(dstAR.Start)] = aioCtx
   346  	delete(am.contexts, oldID)
   347  	return nil
   348  }
   349  
   350  // Translate implements memmap.Mappable.Translate.
   351  func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   352  	var err error
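        	// Faults beyond the ring buffer report a bus error, but any portion of
        	// the request that overlaps the buffer is still translated below.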
   353  	if required.End > m.fr.Length() {
   354  		err = &memmap.BusError{syserror.EFAULT}
   355  	}
   356  	if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
   357  		return []memmap.Translation{
   358  			{
   359  				Source: source,
   360  				File:   m.mfp.MemoryFile(),
   361  				Offset: m.fr.Start + source.Start,
   362  				Perms:  hostarch.AnyAccess,
   363  			},
   364  		}, err
   365  	}
   366  	return nil, err
   367  }
   368  
   369  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
   370  func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error {
   371  	return nil
   372  }
   373  
   374  // NewAIOContext creates a new context for asynchronous I/O.
   375  //
   376  // NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc().
   377  func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) {
   378  	// libaio get_ioevents() expects the context "handle" to be a valid
   379  	// address; libaio peeks inside it looking for a magic number. This
   380  	// function allocates a page per context and keeps it zeroed so that it
   381  	// never matches AIO_RING_MAGIC, which keeps libaio happy.
   382  	m, err := newAIOMappable(mm.mfp)
   383  	if err != nil {
   384  		return 0, err
   385  	}
   386  	defer m.DecRef(ctx)
   387  	addr, err := mm.MMap(ctx, memmap.MMapOpts{
   388  		Length:          aioRingBufferSize,
   389  		MappingIdentity: m,
   390  		Mappable:        m,
   391  		// Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
   392  		// fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
   393  		// user mode should not write to this page.
   394  		Perms:    hostarch.Read,
   395  		MaxPerms: hostarch.Read,
   396  	})
   397  	if err != nil {
   398  		return 0, err
   399  	}
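        	// The context ID handed back to userspace is the address of the ring
        	// buffer mapping itself.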
   400  	id := uint64(addr)
   401  	if !mm.aioManager.newAIOContext(events, id) {
   402  		mm.MUnmap(ctx, addr, aioRingBufferSize)
   403  		return 0, linuxerr.EINVAL
   404  	}
   405  	return id, nil
   406  }
   407  
   408  // DestroyAIOContext destroys an asynchronous I/O context. It returns the
   409  // destroyed context, or nil if the context does not exist.
   410  func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
   411  	if !mm.isValidAddr(ctx, id) {
   412  		return nil
   413  	}
   414  
   415  	mm.aioManager.mu.Lock()
   416  	defer mm.aioManager.mu.Unlock()
   417  	return mm.destroyAIOContextLocked(ctx, id)
   418  }
   419  
   420  // LookupAIOContext looks up the given context. It returns false if the context
   421  // does not exist.
   422  func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) {
   423  	aioCtx, ok := mm.aioManager.lookupAIOContext(id)
   424  	if !ok {
   425  		return nil, false
   426  	}
   427  
   428  	// Protect against 'id' that is inaccessible.
   429  	if !mm.isValidAddr(ctx, id) {
   430  		return nil, false
   431  	}
   432  
   433  	return aioCtx, true
   434  }
   435  
   436  // isValidAddr determines if the address `id` is valid. (Linux also reads 4
   437  // bytes from id).
   438  func (mm *MemoryManager) isValidAddr(ctx context.Context, id uint64) bool {
   439  	var buf [4]byte
   440  	_, err := mm.CopyIn(ctx, hostarch.Addr(id), buf[:], usermem.IOOpts{})
   441  	return err == nil
   442  }
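
        // The sketch below is illustrative only: it shows how a hypothetical
        // caller might drive a single request through an AIOContext using the
        // API above (Prepare reserves a slot, FinishRequest queues the
        // completion, WaitChannel waits for it, and PopRequest retrieves it).
        // Real callers live in the syscall layer; error handling is elided.
        func exampleAIORequestLifecycle(aioCtx *AIOContext, data interface{}) (interface{}, bool) {
        	// Reserve a slot; this fails with EAGAIN when maxOutstanding is
        	// reached, or EINVAL if the context has been destroyed.
        	if err := aioCtx.Prepare(); err != nil {
        		return nil, false
        	}
        	// Normally the I/O would complete asynchronously; completing it
        	// immediately keeps this sketch self-contained.
        	aioCtx.FinishRequest(data)
        	// Wait for the completion notification, then pop the result.
        	if ch := aioCtx.WaitChannel(); ch != nil {
        		<-ch
        	}
        	return aioCtx.PopRequest()
        }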