github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/vfs2/read_write.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs2
    16  
    17  import (
    18  	"time"
    19  
    20  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    21  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    22  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    23  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    24  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/socket"
    26  	slinux "github.com/SagerNet/gvisor/pkg/sentry/syscalls/linux"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    28  	"github.com/SagerNet/gvisor/pkg/syserror"
    29  	"github.com/SagerNet/gvisor/pkg/usermem"
    30  	"github.com/SagerNet/gvisor/pkg/waiter"
    31  )
    32  
    33  const (
    34  	eventMaskRead  = waiter.EventRdNorm | waiter.EventIn | waiter.EventHUp | waiter.EventErr
    35  	eventMaskWrite = waiter.EventWrNorm | waiter.EventOut | waiter.EventHUp | waiter.EventErr
    36  )
    37  
    38  // Read implements Linux syscall read(2).
    39  func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    40  	fd := args[0].Int()
    41  	addr := args[1].Pointer()
    42  	size := args[2].SizeT()
    43  
    44  	file := t.GetFileVFS2(fd)
    45  	if file == nil {
    46  		return 0, nil, linuxerr.EBADF
    47  	}
    48  	defer file.DecRef(t)
    49  
    50  	// Check that the size is legitimate.
    51  	si := int(size)
    52  	if si < 0 {
    53  		return 0, nil, linuxerr.EINVAL
    54  	}
    55  
    56  	// Get the destination of the read.
    57  	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
    58  		AddressSpaceActive: true,
    59  	})
    60  	if err != nil {
    61  		return 0, nil, err
    62  	}
    63  
    64  	n, err := read(t, file, dst, vfs.ReadOptions{})
    65  	t.IOUsage().AccountReadSyscall(n)
    66  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "read", file)
    67  }
    68  
    69  // Readv implements Linux syscall readv(2).
    70  func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    71  	fd := args[0].Int()
    72  	addr := args[1].Pointer()
    73  	iovcnt := int(args[2].Int())
    74  
    75  	file := t.GetFileVFS2(fd)
    76  	if file == nil {
    77  		return 0, nil, linuxerr.EBADF
    78  	}
    79  	defer file.DecRef(t)
    80  
    81  	// Get the destination of the read.
    82  	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
    83  		AddressSpaceActive: true,
    84  	})
    85  	if err != nil {
    86  		return 0, nil, err
    87  	}
    88  
    89  	n, err := read(t, file, dst, vfs.ReadOptions{})
    90  	t.IOUsage().AccountReadSyscall(n)
    91  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "readv", file)
    92  }
    93  
    94  func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
    95  	n, err := file.Read(t, dst, opts)
    96  	if err != syserror.ErrWouldBlock {
    97  		return n, err
    98  	}
    99  
   100  	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
   101  	if !allowBlock {
   102  		return n, err
   103  	}
   104  
   105  	// Register for notifications.
   106  	w, ch := waiter.NewChannelEntry(nil)
   107  	file.EventRegister(&w, eventMaskRead)
   108  
   109  	total := n
   110  	for {
   111  		// Shorten dst to reflect bytes previously read.
   112  		dst = dst.DropFirst(int(n))
   113  
   114  		// Issue the request and break out if it completes with anything other than
   115  		// "would block".
   116  		n, err = file.Read(t, dst, opts)
   117  		total += n
   118  		if err != syserror.ErrWouldBlock {
   119  			break
   120  		}
   121  
   122  		// Wait for a notification that we should retry.
   123  		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
   124  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   125  				err = syserror.ErrWouldBlock
   126  			}
   127  			break
   128  		}
   129  	}
   130  	file.EventUnregister(&w)
   131  
   132  	return total, err
   133  }
   134  
   135  // Pread64 implements Linux syscall pread64(2).
   136  func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   137  	fd := args[0].Int()
   138  	addr := args[1].Pointer()
   139  	size := args[2].SizeT()
   140  	offset := args[3].Int64()
   141  
   142  	file := t.GetFileVFS2(fd)
   143  	if file == nil {
   144  		return 0, nil, linuxerr.EBADF
   145  	}
   146  	defer file.DecRef(t)
   147  
   148  	// Check that the offset is legitimate and does not overflow.
   149  	if offset < 0 || offset+int64(size) < 0 {
   150  		return 0, nil, linuxerr.EINVAL
   151  	}
   152  
   153  	// Check that the size is legitimate.
   154  	si := int(size)
   155  	if si < 0 {
   156  		return 0, nil, linuxerr.EINVAL
   157  	}
   158  
   159  	// Get the destination of the read.
   160  	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
   161  		AddressSpaceActive: true,
   162  	})
   163  	if err != nil {
   164  		return 0, nil, err
   165  	}
   166  
   167  	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
   168  	t.IOUsage().AccountReadSyscall(n)
   169  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file)
   170  }
   171  
   172  // Preadv implements Linux syscall preadv(2).
   173  func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   174  	fd := args[0].Int()
   175  	addr := args[1].Pointer()
   176  	iovcnt := int(args[2].Int())
   177  	offset := args[3].Int64()
   178  
   179  	file := t.GetFileVFS2(fd)
   180  	if file == nil {
   181  		return 0, nil, linuxerr.EBADF
   182  	}
   183  	defer file.DecRef(t)
   184  
   185  	// Check that the offset is legitimate.
   186  	if offset < 0 {
   187  		return 0, nil, linuxerr.EINVAL
   188  	}
   189  
   190  	// Get the destination of the read.
   191  	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
   192  		AddressSpaceActive: true,
   193  	})
   194  	if err != nil {
   195  		return 0, nil, err
   196  	}
   197  
   198  	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
   199  	t.IOUsage().AccountReadSyscall(n)
   200  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file)
   201  }
   202  
   203  // Preadv2 implements Linux syscall preadv2(2).
   204  func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   205  	// While the glibc signature is
   206  	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
   207  	// the actual syscall
   208  	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142)
   209  	// splits the offset argument into a high/low value for compatibility with
   210  	// 32-bit architectures. The flags argument is the 6th argument (index 5).
   211  	fd := args[0].Int()
   212  	addr := args[1].Pointer()
   213  	iovcnt := int(args[2].Int())
   214  	offset := args[3].Int64()
   215  	flags := args[5].Int()
   216  
   217  	file := t.GetFileVFS2(fd)
   218  	if file == nil {
   219  		return 0, nil, linuxerr.EBADF
   220  	}
   221  	defer file.DecRef(t)
   222  
   223  	// Check that the offset is legitimate.
   224  	if offset < -1 {
   225  		return 0, nil, linuxerr.EINVAL
   226  	}
   227  
   228  	// Get the destination of the read.
   229  	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
   230  		AddressSpaceActive: true,
   231  	})
   232  	if err != nil {
   233  		return 0, nil, err
   234  	}
   235  
   236  	opts := vfs.ReadOptions{
   237  		Flags: uint32(flags),
   238  	}
   239  	var n int64
   240  	if offset == -1 {
   241  		n, err = read(t, file, dst, opts)
   242  	} else {
   243  		n, err = pread(t, file, dst, offset, opts)
   244  	}
   245  	t.IOUsage().AccountReadSyscall(n)
   246  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file)
   247  }
   248  
   249  func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   250  	n, err := file.PRead(t, dst, offset, opts)
   251  	if err != syserror.ErrWouldBlock {
   252  		return n, err
   253  	}
   254  
   255  	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
   256  	if !allowBlock {
   257  		return n, err
   258  	}
   259  
   260  	// Register for notifications.
   261  	w, ch := waiter.NewChannelEntry(nil)
   262  	file.EventRegister(&w, eventMaskRead)
   263  
   264  	total := n
   265  	for {
   266  		// Shorten dst to reflect bytes previously read.
   267  		dst = dst.DropFirst(int(n))
   268  
   269  		// Issue the request and break out if it completes with anything other than
   270  		// "would block".
   271  		n, err = file.PRead(t, dst, offset+total, opts)
   272  		total += n
   273  		if err != syserror.ErrWouldBlock {
   274  			break
   275  		}
   276  
   277  		// Wait for a notification that we should retry.
   278  		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
   279  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   280  				err = syserror.ErrWouldBlock
   281  			}
   282  			break
   283  		}
   284  	}
   285  	file.EventUnregister(&w)
   286  	return total, err
   287  }
   288  
   289  // Write implements Linux syscall write(2).
   290  func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   291  	fd := args[0].Int()
   292  	addr := args[1].Pointer()
   293  	size := args[2].SizeT()
   294  
   295  	file := t.GetFileVFS2(fd)
   296  	if file == nil {
   297  		return 0, nil, linuxerr.EBADF
   298  	}
   299  	defer file.DecRef(t)
   300  
   301  	// Check that the size is legitimate.
   302  	si := int(size)
   303  	if si < 0 {
   304  		return 0, nil, linuxerr.EINVAL
   305  	}
   306  
   307  	// Get the source of the write.
   308  	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
   309  		AddressSpaceActive: true,
   310  	})
   311  	if err != nil {
   312  		return 0, nil, err
   313  	}
   314  
   315  	n, err := write(t, file, src, vfs.WriteOptions{})
   316  	t.IOUsage().AccountWriteSyscall(n)
   317  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "write", file)
   318  }
   319  
   320  // Writev implements Linux syscall writev(2).
   321  func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   322  	fd := args[0].Int()
   323  	addr := args[1].Pointer()
   324  	iovcnt := int(args[2].Int())
   325  
   326  	file := t.GetFileVFS2(fd)
   327  	if file == nil {
   328  		return 0, nil, linuxerr.EBADF
   329  	}
   330  	defer file.DecRef(t)
   331  
   332  	// Get the source of the write.
   333  	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
   334  		AddressSpaceActive: true,
   335  	})
   336  	if err != nil {
   337  		return 0, nil, err
   338  	}
   339  
   340  	n, err := write(t, file, src, vfs.WriteOptions{})
   341  	t.IOUsage().AccountWriteSyscall(n)
   342  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "writev", file)
   343  }
   344  
   345  func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   346  	n, err := file.Write(t, src, opts)
   347  	if err != syserror.ErrWouldBlock {
   348  		return n, err
   349  	}
   350  
   351  	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
   352  	if !allowBlock {
   353  		return n, err
   354  	}
   355  
   356  	// Register for notifications.
   357  	w, ch := waiter.NewChannelEntry(nil)
   358  	file.EventRegister(&w, eventMaskWrite)
   359  
   360  	total := n
   361  	for {
   362  		// Shorten src to reflect bytes previously written.
   363  		src = src.DropFirst(int(n))
   364  
   365  		// Issue the request and break out if it completes with anything other than
   366  		// "would block".
   367  		n, err = file.Write(t, src, opts)
   368  		total += n
   369  		if err != syserror.ErrWouldBlock {
   370  			break
   371  		}
   372  
   373  		// Wait for a notification that we should retry.
   374  		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
   375  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   376  				err = syserror.ErrWouldBlock
   377  			}
   378  			break
   379  		}
   380  	}
   381  	file.EventUnregister(&w)
   382  	return total, err
   383  }
   384  
   385  // Pwrite64 implements Linux syscall pwrite64(2).
   386  func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   387  	fd := args[0].Int()
   388  	addr := args[1].Pointer()
   389  	size := args[2].SizeT()
   390  	offset := args[3].Int64()
   391  
   392  	file := t.GetFileVFS2(fd)
   393  	if file == nil {
   394  		return 0, nil, linuxerr.EBADF
   395  	}
   396  	defer file.DecRef(t)
   397  
   398  	// Check that the offset is legitimate and does not overflow.
   399  	if offset < 0 || offset+int64(size) < 0 {
   400  		return 0, nil, linuxerr.EINVAL
   401  	}
   402  
   403  	// Check that the size is legitimate.
   404  	si := int(size)
   405  	if si < 0 {
   406  		return 0, nil, linuxerr.EINVAL
   407  	}
   408  
   409  	// Get the source of the write.
   410  	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
   411  		AddressSpaceActive: true,
   412  	})
   413  	if err != nil {
   414  		return 0, nil, err
   415  	}
   416  
   417  	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
   418  	t.IOUsage().AccountWriteSyscall(n)
   419  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file)
   420  }
   421  
   422  // Pwritev implements Linux syscall pwritev(2).
   423  func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   424  	fd := args[0].Int()
   425  	addr := args[1].Pointer()
   426  	iovcnt := int(args[2].Int())
   427  	offset := args[3].Int64()
   428  
   429  	file := t.GetFileVFS2(fd)
   430  	if file == nil {
   431  		return 0, nil, linuxerr.EBADF
   432  	}
   433  	defer file.DecRef(t)
   434  
   435  	// Check that the offset is legitimate.
   436  	if offset < 0 {
   437  		return 0, nil, linuxerr.EINVAL
   438  	}
   439  
   440  	// Get the source of the write.
   441  	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
   442  		AddressSpaceActive: true,
   443  	})
   444  	if err != nil {
   445  		return 0, nil, err
   446  	}
   447  
   448  	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
   449  	t.IOUsage().AccountReadSyscall(n)
   450  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file)
   451  }
   452  
   453  // Pwritev2 implements Linux syscall pwritev2(2).
   454  func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   455  	// While the glibc signature is
   456  	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
   457  	// the actual syscall
   458  	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
   459  	// splits the offset argument into a high/low value for compatibility with
   460  	// 32-bit architectures. The flags argument is the 6th argument (index 5).
   461  	fd := args[0].Int()
   462  	addr := args[1].Pointer()
   463  	iovcnt := int(args[2].Int())
   464  	offset := args[3].Int64()
   465  	flags := args[5].Int()
   466  
   467  	file := t.GetFileVFS2(fd)
   468  	if file == nil {
   469  		return 0, nil, linuxerr.EBADF
   470  	}
   471  	defer file.DecRef(t)
   472  
   473  	// Check that the offset is legitimate.
   474  	if offset < -1 {
   475  		return 0, nil, linuxerr.EINVAL
   476  	}
   477  
   478  	// Get the source of the write.
   479  	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
   480  		AddressSpaceActive: true,
   481  	})
   482  	if err != nil {
   483  		return 0, nil, err
   484  	}
   485  
   486  	opts := vfs.WriteOptions{
   487  		Flags: uint32(flags),
   488  	}
   489  	var n int64
   490  	if offset == -1 {
   491  		n, err = write(t, file, src, opts)
   492  	} else {
   493  		n, err = pwrite(t, file, src, offset, opts)
   494  	}
   495  	t.IOUsage().AccountWriteSyscall(n)
   496  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file)
   497  }
   498  
   499  func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   500  	n, err := file.PWrite(t, src, offset, opts)
   501  	if err != syserror.ErrWouldBlock {
   502  		return n, err
   503  	}
   504  
   505  	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
   506  	if !allowBlock {
   507  		return n, err
   508  	}
   509  
   510  	// Register for notifications.
   511  	w, ch := waiter.NewChannelEntry(nil)
   512  	file.EventRegister(&w, eventMaskWrite)
   513  
   514  	total := n
   515  	for {
   516  		// Shorten src to reflect bytes previously written.
   517  		src = src.DropFirst(int(n))
   518  
   519  		// Issue the request and break out if it completes with anything other than
   520  		// "would block".
   521  		n, err = file.PWrite(t, src, offset+total, opts)
   522  		total += n
   523  		if err != syserror.ErrWouldBlock {
   524  			break
   525  		}
   526  
   527  		// Wait for a notification that we should retry.
   528  		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
   529  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   530  				err = syserror.ErrWouldBlock
   531  			}
   532  			break
   533  		}
   534  	}
   535  	file.EventUnregister(&w)
   536  	return total, err
   537  }
   538  
   539  func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) {
   540  	if file.StatusFlags()&linux.O_NONBLOCK != 0 {
   541  		return false, ktime.Time{}, false
   542  	}
   543  	// Sockets support read/write timeouts.
   544  	if s, ok := file.Impl().(socket.SocketVFS2); ok {
   545  		dl := s.RecvTimeout()
   546  		if dl < 0 {
   547  			return false, ktime.Time{}, false
   548  		}
   549  		if dl > 0 {
   550  			return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true
   551  		}
   552  	}
   553  	return true, ktime.Time{}, false
   554  }
   555  
   556  // Lseek implements Linux syscall lseek(2).
   557  func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   558  	fd := args[0].Int()
   559  	offset := args[1].Int64()
   560  	whence := args[2].Int()
   561  
   562  	file := t.GetFileVFS2(fd)
   563  	if file == nil {
   564  		return 0, nil, linuxerr.EBADF
   565  	}
   566  	defer file.DecRef(t)
   567  
   568  	newoff, err := file.Seek(t, offset, whence)
   569  	return uintptr(newoff), nil, err
   570  }
   571  
   572  // Readahead implements readahead(2).
   573  func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   574  	fd := args[0].Int()
   575  	offset := args[1].Int64()
   576  	size := args[2].SizeT()
   577  
   578  	file := t.GetFileVFS2(fd)
   579  	if file == nil {
   580  		return 0, nil, linuxerr.EBADF
   581  	}
   582  	defer file.DecRef(t)
   583  
   584  	// Check that the file is readable.
   585  	if !file.IsReadable() {
   586  		return 0, nil, linuxerr.EBADF
   587  	}
   588  
   589  	// Check that the size is valid.
   590  	if int(size) < 0 {
   591  		return 0, nil, linuxerr.EINVAL
   592  	}
   593  
   594  	// Check that the offset is legitimate and does not overflow.
   595  	if offset < 0 || offset+int64(size) < 0 {
   596  		return 0, nil, linuxerr.EINVAL
   597  	}
   598  
   599  	// Return EINVAL; if the underlying file type does not support readahead,
   600  	// then Linux will return EINVAL to indicate as much. In the future, we
   601  	// may extend this function to actually support readahead hints.
   602  	return 0, nil, linuxerr.EINVAL
   603  }