github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/mm/aio_context.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mm
    16  
    17  import (
    18  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    19  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    26  )
    27  
    28  // aioManager creates and manages asynchronous I/O contexts.
    29  //
    30  // +stateify savable
    31  type aioManager struct {
    32  	// mu protects below.
    33  	mu aioManagerMutex `state:"nosave"`
    34  
    35  	// aioContexts is the set of asynchronous I/O contexts.
    36  	contexts map[uint64]*AIOContext
    37  }
    38  
    39  func (mm *MemoryManager) destroyAIOManager(ctx context.Context) {
    40  	mm.aioManager.mu.Lock()
    41  	defer mm.aioManager.mu.Unlock()
    42  
    43  	for id := range mm.aioManager.contexts {
    44  		mm.destroyAIOContextLocked(ctx, id)
    45  	}
    46  }
    47  
    48  // newAIOContext creates a new context for asynchronous I/O.
    49  //
    50  // Returns false if 'id' is currently in use.
    51  func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
    52  	a.mu.Lock()
    53  	defer a.mu.Unlock()
    54  
    55  	if _, ok := a.contexts[id]; ok {
    56  		return false
    57  	}
    58  
    59  	a.contexts[id] = &AIOContext{
    60  		requestReady:   make(chan struct{}, 1),
    61  		maxOutstanding: events,
    62  	}
    63  	return true
    64  }
    65  
    66  // destroyAIOContext destroys an asynchronous I/O context. It does not wait
    67  // for pending requests to complete. Returns the destroyed AIOContext so it
    68  // can be drained.
    69  //
    70  // Nil is returned if the context does not exist.
    71  //
    72  // Precondition: mm.aioManager.mu is locked.
    73  func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) *AIOContext {
    74  	aioCtx, ok := mm.aioManager.contexts[id]
    75  	if !ok {
    76  		return nil
    77  	}
    78  
    79  	delete(mm.aioManager.contexts, id)
    80  	aioCtx.destroy()
    81  	return aioCtx
    82  }
    83  
    84  // lookupAIOContext looks up the given context.
    85  //
    86  // Returns false if the context does not exist.
    87  func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) {
    88  	a.mu.Lock()
    89  	defer a.mu.Unlock()
    90  	ctx, ok := a.contexts[id]
    91  	return ctx, ok
    92  }
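
// The sketch below is illustrative only and is not part of the original file:
// it shows how the aioManager's id-keyed map is driven, assuming an in-package
// caller. The id value and event count are arbitrary.
func exampleAIOManagerUsage() {
	a := aioManager{contexts: make(map[uint64]*AIOContext)}

	// Register a context keyed by its (hypothetical) ring address. A second
	// newAIOContext call with the same id would return false.
	const id = uint64(0x7f0000000000)
	if !a.newAIOContext(128 /* events */, id) {
		return
	}

	// Later calls find the same context by id.
	if aioCtx, ok := a.lookupAIOContext(id); ok {
		_ = aioCtx
	}
}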
    93  
    94  // ioResult is a completed I/O operation.
    95  //
    96  // +stateify savable
    97  type ioResult struct {
    98  	data any
    99  	ioEntry
   100  }
   101  
   102  // AIOContext is a single asynchronous I/O context.
   103  //
   104  // +stateify savable
   105  type AIOContext struct {
   106  	// requestReady is the notification channel used for all requests.
   107  	requestReady chan struct{} `state:"nosave"`
   108  
   109  	// mu protects below.
   110  	mu aioContextMutex `state:"nosave"`
   111  
   112  	// results is the set of completed requests.
   113  	results ioList
   114  
   115  	// maxOutstanding is the maximum number of outstanding entries; this value
   116  	// is immutable.
   117  	maxOutstanding uint32
   118  
   119  	// outstanding is the number of requests outstanding; this is effectively
   120  	// the number of entries in the results list plus the number of requests
   121  	// that are still expected to be added to it.
   122  	outstanding uint32
   123  
   124  	// dead is set when the context is destroyed.
   125  	dead bool `state:"zerovalue"`
   126  }
   127  
   128  // destroy marks the context dead.
   129  func (ctx *AIOContext) destroy() {
   130  	ctx.mu.Lock()
   131  	defer ctx.mu.Unlock()
   132  	ctx.dead = true
   133  	ctx.checkForDone()
   134  }
   135  
   136  // Preconditions: ctx.mu must be held by caller.
   137  func (ctx *AIOContext) checkForDone() {
   138  	if ctx.dead && ctx.outstanding == 0 {
   139  		close(ctx.requestReady)
   140  		ctx.requestReady = nil
   141  	}
   142  }
   143  
   144  // Prepare reserves space for a new request, returning nil if space is available.
   145  // Returns EAGAIN if the context is busy and EINVAL if the context is dead.
   146  func (ctx *AIOContext) Prepare() error {
   147  	ctx.mu.Lock()
   148  	defer ctx.mu.Unlock()
   149  	if ctx.dead {
   150  		// Context died after the caller looked it up.
   151  		return linuxerr.EINVAL
   152  	}
   153  	if ctx.outstanding >= ctx.maxOutstanding {
   154  		// Context is busy.
   155  		return linuxerr.EAGAIN
   156  	}
   157  	ctx.outstanding++
   158  	return nil
   159  }
   160  
   161  // PopRequest pops a completed request if one is available; it does not
   162  // block. Returns false if no request is available.
   163  func (ctx *AIOContext) PopRequest() (any, bool) {
   164  	ctx.mu.Lock()
   165  	defer ctx.mu.Unlock()
   166  
   167  	// Is there anything ready?
   168  	if e := ctx.results.Front(); e != nil {
   169  		if ctx.outstanding == 0 {
   170  			panic("AIOContext outstanding is going negative")
   171  		}
   172  		ctx.outstanding--
   173  		ctx.results.Remove(e)
   174  		ctx.checkForDone()
   175  		return e.data, true
   176  	}
   177  	return nil, false
   178  }
   179  
   180  // FinishRequest finishes a pending request. It queues up the data
   181  // and notifies listeners.
   182  func (ctx *AIOContext) FinishRequest(data any) {
   183  	ctx.mu.Lock()
   184  	defer ctx.mu.Unlock()
   185  
   186  	// Push to the list and notify opportunistically. The channel notify
   187  	// here is guaranteed to be safe because outstanding must be non-zero.
   188  	// The requestReady channel is only closed when outstanding reaches zero.
   189  	ctx.results.PushBack(&ioResult{data: data})
   190  
   191  	select {
   192  	case ctx.requestReady <- struct{}{}:
   193  	default:
   194  	}
   195  }
   196  
   197  // WaitChannel returns a channel that is notified when an AIO request is
   198  // completed. Returns nil if the context is destroyed and there are no more
   199  // outstanding requests.
   200  func (ctx *AIOContext) WaitChannel() chan struct{} {
   201  	ctx.mu.Lock()
   202  	defer ctx.mu.Unlock()
   203  	return ctx.requestReady
   204  }
   205  
   206  // Dead returns true if the context has been destroyed.
   207  func (ctx *AIOContext) Dead() bool {
   208  	ctx.mu.Lock()
   209  	defer ctx.mu.Unlock()
   210  	return ctx.dead
   211  }
   212  
   213  // CancelPendingRequest forgets about a request that hasn't yet completed.
   214  func (ctx *AIOContext) CancelPendingRequest() {
   215  	ctx.mu.Lock()
   216  	defer ctx.mu.Unlock()
   217  
   218  	if ctx.outstanding == 0 {
   219  		panic("AIOContext outstanding is going negative")
   220  	}
   221  	ctx.outstanding--
   222  	ctx.checkForDone()
   223  }
   224  
   225  // Drain drops all completed requests. Pending requests remain untouched.
   226  func (ctx *AIOContext) Drain() {
   227  	ctx.mu.Lock()
   228  	defer ctx.mu.Unlock()
   229  
   230  	if ctx.outstanding == 0 {
   231  		return
   232  	}
   233  	size := uint32(ctx.results.Len())
   234  	if ctx.outstanding < size {
   235  		panic("AIOContext outstanding is going negative")
   236  	}
   237  	ctx.outstanding -= size
   238  	ctx.results.Reset()
   239  	ctx.checkForDone()
   240  }
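
// The sketch below is illustrative only and is not part of the original file:
// it walks an AIOContext through the submit/complete/reap cycle described by
// the comments on Prepare, FinishRequest, WaitChannel, and PopRequest. The
// payload value is arbitrary.
func exampleAIOContextLifecycle() {
	aioCtx := &AIOContext{
		requestReady:   make(chan struct{}, 1),
		maxOutstanding: 1,
	}

	// Reserve a slot before submitting a request. EAGAIN would mean the
	// context is already at maxOutstanding; EINVAL would mean it is dead.
	if err := aioCtx.Prepare(); err != nil {
		return
	}

	// When the request completes, queue its result and notify any waiter.
	go aioCtx.FinishRequest("completion data")

	// Wait for the completion notification, then reap the result.
	if ch := aioCtx.WaitChannel(); ch != nil {
		<-ch
	}
	if data, ok := aioCtx.PopRequest(); ok {
		_ = data
	}
}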
   241  
   242  // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
   243  // ring buffers.
   244  //
   245  // +stateify savable
   246  type aioMappable struct {
   247  	aioMappableRefs
   248  
   249  	mfp pgalloc.MemoryFileProvider
   250  	fr  memmap.FileRange
   251  }
   252  
   253  var aioRingBufferSize = uint64(hostarch.Addr(linux.AIORingSize).MustRoundUp())
   254  
   255  func newAIOMappable(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
   256  	fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, pgalloc.AllocOpts{Kind: usage.Anonymous, MemCgID: pgalloc.MemoryCgroupIDFromContext(ctx)})
   257  	if err != nil {
   258  		return nil, err
   259  	}
   260  	m := aioMappable{mfp: mfp, fr: fr}
   261  	m.InitRefs()
   262  	return &m, nil
   263  }
   264  
   265  // DecRef implements refs.RefCounter.DecRef.
   266  func (m *aioMappable) DecRef(ctx context.Context) {
   267  	m.aioMappableRefs.DecRef(func() {
   268  		m.mfp.MemoryFile().DecRef(m.fr)
   269  	})
   270  }
   271  
   272  // MappedName implements memmap.MappingIdentity.MappedName.
   273  func (m *aioMappable) MappedName(ctx context.Context) string {
   274  	return "[aio]"
   275  }
   276  
   277  // DeviceID implements memmap.MappingIdentity.DeviceID.
   278  func (m *aioMappable) DeviceID() uint64 {
   279  	return 0
   280  }
   281  
   282  // InodeID implements memmap.MappingIdentity.InodeID.
   283  func (m *aioMappable) InodeID() uint64 {
   284  	return 0
   285  }
   286  
   287  // Msync implements memmap.MappingIdentity.Msync.
   288  func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
   289  	// Linux: aio_ring_fops.fsync == NULL
   290  	return linuxerr.EINVAL
   291  }
   292  
   293  // AddMapping implements memmap.Mappable.AddMapping.
   294  func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, _ bool) error {
   295  	// Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
   296  	// sets VM_DONTEXPAND).
   297  	if offset != 0 || uint64(ar.Length()) != aioRingBufferSize {
   298  		return linuxerr.EFAULT
   299  	}
   300  	return nil
   301  }
   302  
   303  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   304  func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) {
   305  }
   306  
   307  // CopyMapping implements memmap.Mappable.CopyMapping.
   308  func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, _ bool) error {
   309  	// Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
   310  	// sets VM_DONTEXPAND).
   311  	if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize {
   312  		return linuxerr.EFAULT
   313  	}
   314  	// Require that the mapping correspond to a live AIOContext. Compare
   315  	// Linux's fs/aio.c:aio_ring_mremap().
   316  	mm, ok := ms.(*MemoryManager)
   317  	if !ok {
   318  		return linuxerr.EINVAL
   319  	}
   320  	am := &mm.aioManager
   321  	am.mu.Lock()
   322  	defer am.mu.Unlock()
   323  	oldID := uint64(srcAR.Start)
   324  	aioCtx, ok := am.contexts[oldID]
   325  	if !ok {
   326  		return linuxerr.EINVAL
   327  	}
   328  	aioCtx.mu.Lock()
   329  	defer aioCtx.mu.Unlock()
   330  	if aioCtx.dead {
   331  		return linuxerr.EINVAL
   332  	}
   333  	// Use the new ID for the AIOContext.
   334  	am.contexts[uint64(dstAR.Start)] = aioCtx
   335  	delete(am.contexts, oldID)
   336  	return nil
   337  }
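
// Illustrative note, not part of the original file: the observable effect of
// the re-keying above is that, after the ring is moved by mremap, the context
// must be looked up under the new ring address (addresses hypothetical):
//
//	_, ok := mm.LookupAIOContext(ctx, oldRingAddr) // ok is now false
//	_, ok = mm.LookupAIOContext(ctx, newRingAddr)  // ok is true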
   338  
   339  // Translate implements memmap.Mappable.Translate.
   340  func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   341  	var err error
   342  	if required.End > m.fr.Length() {
   343  		err = &memmap.BusError{linuxerr.EFAULT}
   344  	}
   345  	if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
   346  		return []memmap.Translation{
   347  			{
   348  				Source: source,
   349  				File:   m.mfp.MemoryFile(),
   350  				Offset: m.fr.Start + source.Start,
   351  				Perms:  hostarch.AnyAccess,
   352  			},
   353  		}, err
   354  	}
   355  	return nil, err
   356  }
   357  
   358  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
   359  func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error {
   360  	return nil
   361  }
   362  
   363  // NewAIOContext creates a new context for asynchronous I/O.
   364  //
   365  // NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc().
   366  func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) {
   367  	// libaio get_ioevents() expects the context "handle" to be a valid
   368  	// address and peeks inside looking for a magic number. This function
   369  	// allocates a page per context and keeps it zeroed so that it never
   370  	// matches AIO_RING_MAGIC, keeping libaio on the io_getevents syscall path.
   371  	m, err := newAIOMappable(ctx, mm.mfp)
   372  	if err != nil {
   373  		return 0, err
   374  	}
   375  	defer m.DecRef(ctx)
   376  	addr, err := mm.MMap(ctx, memmap.MMapOpts{
   377  		Length:          aioRingBufferSize,
   378  		MappingIdentity: m,
   379  		Mappable:        m,
   380  		// Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
   381  		// fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
   382  		// user mode should not write to this page.
   383  		Perms:    hostarch.Read,
   384  		MaxPerms: hostarch.Read,
   385  	})
   386  	if err != nil {
   387  		return 0, err
   388  	}
   389  	id := uint64(addr)
   390  	if !mm.aioManager.newAIOContext(events, id) {
   391  		mm.MUnmap(ctx, addr, aioRingBufferSize)
   392  		return 0, linuxerr.EINVAL
   393  	}
   394  	return id, nil
   395  }
   396  
   397  // DestroyAIOContext destroys an asynchronous I/O context. It returns the
   398  // destroyed context, or nil if the context does not exist.
   399  func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
   400  	if !mm.isValidAddr(ctx, id) {
   401  		return nil
   402  	}
   403  
   404  	// Only unmap after the address has been verified to be a valid AIO
   405  	// context, to prevent random memory from being unmapped.
   406  	//
   407  	// Note: it's possible to unmap this address and then map something else
   408  	// into the same address, in which case this would be unmapping memory
   409  	// that it doesn't own. This is, however, the way Linux implements AIO;
   410  	// keep the same [weird] semantics in case anyone relies on them.
   411  	mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize)
   412  
   413  	mm.aioManager.mu.Lock()
   414  	defer mm.aioManager.mu.Unlock()
   415  	return mm.destroyAIOContextLocked(ctx, id)
   416  }
   417  
   418  // LookupAIOContext looks up the given context. It returns false if the context
   419  // does not exist.
   420  func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) {
   421  	aioCtx, ok := mm.aioManager.lookupAIOContext(id)
   422  	if !ok {
   423  		return nil, false
   424  	}
   425  
   426  	// Protect against 'id' that is inaccessible.
   427  	if !mm.isValidAddr(ctx, id) {
   428  		return nil, false
   429  	}
   430  
   431  	return aioCtx, true
   432  }
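
// The sketch below is illustrative only and is not part of the original file:
// it shows how a syscall-layer caller might drive NewAIOContext,
// LookupAIOContext, and DestroyAIOContext in an io_setup/io_destroy style
// flow. The event count is arbitrary.
func exampleAIOSyscallFlow(ctx context.Context, mm *MemoryManager) error {
	// io_setup: the returned id doubles as the address of the read-only ring
	// page mapped by NewAIOContext.
	id, err := mm.NewAIOContext(ctx, 128 /* nr_events */)
	if err != nil {
		return err
	}

	// io_submit/io_getevents would look the context up by id on each call.
	aioCtx, ok := mm.LookupAIOContext(ctx, id)
	if !ok {
		return linuxerr.EINVAL
	}
	_ = aioCtx

	// io_destroy: unmap the ring page and drain whatever has completed on the
	// returned (now dead) context.
	if deadCtx := mm.DestroyAIOContext(ctx, id); deadCtx != nil {
		deadCtx.Drain()
	}
	return nil
}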
   433  
   434  // isValidAddr determines if the address `id` is valid. (Linux also reads 4
   435  // bytes from id).
   436  func (mm *MemoryManager) isValidAddr(ctx context.Context, id uint64) bool {
   437  	var buf [4]byte
   438  	_, err := mm.CopyIn(ctx, hostarch.Addr(id), buf[:], usermem.IOOpts{})
   439  	return err == nil
   440  }