github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/syscalls/linux/sys_aio.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    19  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/marshal/primitive"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/eventfd"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    26  	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/mm"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    30  )
    31  
    32  // IoSetup implements linux syscall io_setup(2).
    33  func IoSetup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    34  	nrEvents := args[0].Int()
    35  	idAddr := args[1].Pointer()
    36  
    37  	// Linux uses the native long as the aio ID.
    38  	//
    39  	// The context pointer _must_ be zero initially.
    40  	var idIn uint64
    41  	if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil {
    42  		return 0, nil, err
    43  	}
    44  	if idIn != 0 {
    45  		return 0, nil, linuxerr.EINVAL
    46  	}
    47  
    48  	id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents))
    49  	if err != nil {
    50  		return 0, nil, err
    51  	}
    52  
    53  	// Copy out the new ID.
    54  	if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil {
    55  		t.MemoryManager().DestroyAIOContext(t, id)
    56  		return 0, nil, err
    57  	}
    58  
    59  	return 0, nil, nil
    60  }
    61  
    62  // IoDestroy implements linux syscall io_destroy(2).
    63  func IoDestroy(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    64  	id := args[0].Uint64()
    65  
    66  	ctx := t.MemoryManager().DestroyAIOContext(t, id)
    67  	if ctx == nil {
    68  		// Does not exist.
    69  		return 0, nil, linuxerr.EINVAL
    70  	}
    71  
    72  	// Drain completed requests amd wait for pending requests until there are no
    73  	// more.
    74  	for {
    75  		ctx.Drain()
    76  
    77  		ch := ctx.WaitChannel()
    78  		if ch == nil {
    79  			// No more requests, we're done.
    80  			return 0, nil, nil
    81  		}
    82  		// The task cannot be interrupted during the wait. Equivalent to
    83  		// TASK_UNINTERRUPTIBLE in Linux.
    84  		t.UninterruptibleSleepStart(true /* deactivate */)
    85  		<-ch
    86  		t.UninterruptibleSleepFinish(true /* activate */)
    87  	}
    88  }
    89  
    90  // IoGetevents implements linux syscall io_getevents(2).
    91  func IoGetevents(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    92  	id := args[0].Uint64()
    93  	minEvents := args[1].Int()
    94  	events := args[2].Int()
    95  	eventsAddr := args[3].Pointer()
    96  	timespecAddr := args[4].Pointer()
    97  
    98  	// Sanity check arguments.
    99  	if minEvents < 0 || minEvents > events {
   100  		return 0, nil, linuxerr.EINVAL
   101  	}
   102  
   103  	ctx, ok := t.MemoryManager().LookupAIOContext(t, id)
   104  	if !ok {
   105  		return 0, nil, linuxerr.EINVAL
   106  	}
   107  
   108  	// Setup the timeout.
   109  	var haveDeadline bool
   110  	var deadline ktime.Time
   111  	if timespecAddr != 0 {
   112  		d, err := copyTimespecIn(t, timespecAddr)
   113  		if err != nil {
   114  			return 0, nil, err
   115  		}
   116  		if !d.Valid() {
   117  			return 0, nil, linuxerr.EINVAL
   118  		}
   119  		deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration())
   120  		haveDeadline = true
   121  	}
   122  
   123  	// Loop over all requests.
   124  	for count := int32(0); count < events; count++ {
   125  		// Get a request, per semantics.
   126  		var v any
   127  		if count >= minEvents {
   128  			var ok bool
   129  			v, ok = ctx.PopRequest()
   130  			if !ok {
   131  				return uintptr(count), nil, nil
   132  			}
   133  		} else {
   134  			var err error
   135  			v, err = waitForRequest(ctx, t, haveDeadline, deadline)
   136  			if err != nil {
   137  				if count > 0 || linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   138  					return uintptr(count), nil, nil
   139  				}
   140  				return 0, nil, linuxerr.ConvertIntr(err, linuxerr.EINTR)
   141  			}
   142  		}
   143  
   144  		ev := v.(*linux.IOEvent)
   145  
   146  		// Copy out the result.
   147  		if _, err := ev.CopyOut(t, eventsAddr); err != nil {
   148  			if count > 0 {
   149  				return uintptr(count), nil, nil
   150  			}
   151  			// Nothing done.
   152  			return 0, nil, err
   153  		}
   154  
   155  		// Keep rolling.
   156  		eventsAddr += hostarch.Addr(linux.IOEventSize)
   157  	}
   158  
   159  	// Everything finished.
   160  	return uintptr(events), nil, nil
   161  }
   162  
   163  func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (any, error) {
   164  	for {
   165  		if v, ok := ctx.PopRequest(); ok {
   166  			// Request was readily available. Just return it.
   167  			return v, nil
   168  		}
   169  
   170  		// Need to wait for request completion.
   171  		done := ctx.WaitChannel()
   172  		if done == nil {
   173  			// Context has been destroyed.
   174  			return nil, linuxerr.EINVAL
   175  		}
   176  		if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil {
   177  			return nil, err
   178  		}
   179  	}
   180  }
   181  
   182  // memoryFor returns appropriate memory for the given callback.
   183  func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
   184  	bytes := int(cb.Bytes)
   185  	if bytes < 0 {
   186  		// Linux also requires that this field fit in ssize_t.
   187  		return usermem.IOSequence{}, linuxerr.EINVAL
   188  	}
   189  
   190  	// Since this I/O will be asynchronous with respect to t's task goroutine,
   191  	// we have no guarantee that t's AddressSpace will be active during the
   192  	// I/O.
   193  	switch cb.OpCode {
   194  	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
   195  		return t.SingleIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{
   196  			AddressSpaceActive: false,
   197  		})
   198  
   199  	case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
   200  		return t.IovecsIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{
   201  			AddressSpaceActive: false,
   202  		})
   203  
   204  	case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
   205  		return usermem.IOSequence{}, nil
   206  
   207  	default:
   208  		// Not a supported command.
   209  		return usermem.IOSequence{}, linuxerr.EINVAL
   210  	}
   211  }
   212  
   213  // IoCancel implements linux syscall io_cancel(2).
   214  //
   215  // It is not presently supported (ENOSYS indicates no support on this
   216  // architecture).
   217  func IoCancel(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   218  	return 0, nil, linuxerr.ENOSYS
   219  }
   220  
   221  // IoSubmit implements linux syscall io_submit(2).
   222  func IoSubmit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   223  	id := args[0].Uint64()
   224  	nrEvents := args[1].Int()
   225  	addr := args[2].Pointer()
   226  
   227  	if nrEvents < 0 {
   228  		return 0, nil, linuxerr.EINVAL
   229  	}
   230  
   231  	for i := int32(0); i < nrEvents; i++ {
   232  		// Copy in the callback address.
   233  		var cbAddr hostarch.Addr
   234  		switch t.Arch().Width() {
   235  		case 8:
   236  			var cbAddrP primitive.Uint64
   237  			if _, err := cbAddrP.CopyIn(t, addr); err != nil {
   238  				if i > 0 {
   239  					// Some successful.
   240  					return uintptr(i), nil, nil
   241  				}
   242  				// Nothing done.
   243  				return 0, nil, err
   244  			}
   245  			cbAddr = hostarch.Addr(cbAddrP)
   246  		default:
   247  			return 0, nil, linuxerr.ENOSYS
   248  		}
   249  
   250  		// Copy in this callback.
   251  		var cb linux.IOCallback
   252  		if _, err := cb.CopyIn(t, cbAddr); err != nil {
   253  			if i > 0 {
   254  				// Some have been successful.
   255  				return uintptr(i), nil, nil
   256  			}
   257  			// Nothing done.
   258  			return 0, nil, err
   259  		}
   260  
   261  		// Process this callback.
   262  		if err := submitCallback(t, id, &cb, cbAddr); err != nil {
   263  			if i > 0 {
   264  				// Partial success.
   265  				return uintptr(i), nil, nil
   266  			}
   267  			// Nothing done.
   268  			return 0, nil, err
   269  		}
   270  
   271  		// Advance to the next one.
   272  		addr += hostarch.Addr(t.Arch().Width())
   273  	}
   274  
   275  	return uintptr(nrEvents), nil, nil
   276  }
   277  
   278  // submitCallback processes a single callback.
   279  func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr hostarch.Addr) error {
   280  	if cb.Reserved2 != 0 {
   281  		return linuxerr.EINVAL
   282  	}
   283  
   284  	fd := t.GetFile(cb.FD)
   285  	if fd == nil {
   286  		return linuxerr.EBADF
   287  	}
   288  	defer fd.DecRef(t)
   289  
   290  	// Was there an eventFD? Extract it.
   291  	var eventFD *vfs.FileDescription
   292  	if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
   293  		eventFD = t.GetFile(cb.ResFD)
   294  		if eventFD == nil {
   295  			return linuxerr.EBADF
   296  		}
   297  		defer eventFD.DecRef(t)
   298  
   299  		// Check that it is an eventfd.
   300  		if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok {
   301  			return linuxerr.EINVAL
   302  		}
   303  	}
   304  
   305  	ioseq, err := memoryFor(t, cb)
   306  	if err != nil {
   307  		return err
   308  	}
   309  
   310  	// Check offset for reads/writes.
   311  	switch cb.OpCode {
   312  	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
   313  		if cb.Offset < 0 {
   314  			return linuxerr.EINVAL
   315  		}
   316  	}
   317  
   318  	// Prepare the request.
   319  	aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id)
   320  	if !ok {
   321  		return linuxerr.EINVAL
   322  	}
   323  	if err := aioCtx.Prepare(); err != nil {
   324  		return err
   325  	}
   326  
   327  	if eventFD != nil {
   328  		// The request is set. Make sure there's a ref on the file.
   329  		//
   330  		// This is necessary when the callback executes on completion,
   331  		// which is also what will release this reference.
   332  		eventFD.IncRef()
   333  	}
   334  
   335  	// Perform the request asynchronously.
   336  	fd.IncRef()
   337  	t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx))
   338  	return nil
   339  }
   340  
   341  func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr hostarch.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback {
   342  	return func(ctx context.Context) {
   343  		// Release references after completing the callback.
   344  		defer fd.DecRef(ctx)
   345  		if eventFD != nil {
   346  			defer eventFD.DecRef(ctx)
   347  		}
   348  
   349  		if aioCtx.Dead() {
   350  			aioCtx.CancelPendingRequest()
   351  			return
   352  		}
   353  		ev := &linux.IOEvent{
   354  			Data: cb.Data,
   355  			Obj:  uint64(cbAddr),
   356  		}
   357  
   358  		var err error
   359  		switch cb.OpCode {
   360  		case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
   361  			ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{})
   362  		case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
   363  			ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{})
   364  		case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC:
   365  			err = fd.Sync(ctx)
   366  		}
   367  
   368  		// Update the result.
   369  		if err != nil {
   370  			err = HandleIOError(ctx, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd)
   371  			ev.Result = -int64(kernel.ExtractErrno(err, 0))
   372  		}
   373  
   374  		// Queue the result for delivery.
   375  		aioCtx.FinishRequest(ev)
   376  
   377  		// Notify the event file if one was specified. This needs to happen
   378  		// *after* queueing the result to avoid racing with the thread we may
   379  		// wake up.
   380  		if eventFD != nil {
   381  			eventFD.Impl().(*eventfd.EventFileDescription).Signal(1)
   382  		}
   383  	}
   384  }