github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/vfs2/splice.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs2
    16  
    17  import (
    18  	"io"
    19  
    20  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    21  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    22  	"github.com/SagerNet/gvisor/pkg/log"
    23  	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/pipe"
    27  	slinux "github.com/SagerNet/gvisor/pkg/sentry/syscalls/linux"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    29  	"github.com/SagerNet/gvisor/pkg/syserror"
    30  	"github.com/SagerNet/gvisor/pkg/usermem"
    31  	"github.com/SagerNet/gvisor/pkg/waiter"
    32  )
    33  
// Splice implements Linux syscall splice(2).
//
// The syscall arguments are, in order: the input FD, a pointer to the input
// offset (may be null), the output FD, a pointer to the output offset (may be
// null), the byte count, and the SPLICE_F_* flags.
//
// At least one of the two file descriptions must be a pipe, and an offset
// pointer may only be supplied for the non-pipe end. On success the number of
// bytes moved is returned and any supplied offsets are updated in place.
func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	inFD := args[0].Int()
	inOffsetPtr := args[1].Pointer()
	outFD := args[2].Int()
	outOffsetPtr := args[3].Pointer()
	count := int64(args[4].SizeT())
	flags := args[5].Int()

	// A zero-length request succeeds immediately, before any other
	// validation is performed.
	if count == 0 {
		return 0, nil, nil
	}
	// Oversized requests are silently clamped rather than rejected.
	if count > int64(kernel.MAX_RW_COUNT) {
		count = int64(kernel.MAX_RW_COUNT)
	}
	// count was converted from a size_t argument; a negative value here is
	// treated as an invalid length.
	if count < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Check for invalid flags.
	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Get file descriptions. Each successful lookup is paired with a
	// deferred DecRef.
	inFile := t.GetFileVFS2(inFD)
	if inFile == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer inFile.DecRef(t)
	outFile := t.GetFileVFS2(outFD)
	if outFile == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer outFile.DecRef(t)

	// Check that both files support the required directionality.
	if !inFile.IsReadable() || !outFile.IsWritable() {
		return 0, nil, linuxerr.EBADF
	}

	// The operation is non-blocking if anything is non-blocking.
	//
	// N.B. This is a rather simplistic heuristic that avoids some
	// poor edge case behavior since the exact semantics here are
	// underspecified and vary between versions of Linux itself.
	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)

	// At least one file description must represent a pipe.
	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
	if !inIsPipe && !outIsPipe {
		return 0, nil, linuxerr.EINVAL
	}

	// Copy in offsets. An offset of -1 means "no explicit offset was
	// supplied" for that end.
	inOffset := int64(-1)
	if inOffsetPtr != 0 {
		// Offsets are rejected for a pipe end.
		if inIsPipe {
			return 0, nil, linuxerr.ESPIPE
		}
		if inFile.Options().DenyPRead {
			return 0, nil, linuxerr.EINVAL
		}
		if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil {
			return 0, nil, err
		}
		if inOffset < 0 {
			return 0, nil, linuxerr.EINVAL
		}
	}
	outOffset := int64(-1)
	if outOffsetPtr != 0 {
		// Offsets are rejected for a pipe end.
		if outIsPipe {
			return 0, nil, linuxerr.ESPIPE
		}
		if outFile.Options().DenyPWrite {
			return 0, nil, linuxerr.EINVAL
		}
		if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil {
			return 0, nil, err
		}
		if outOffset < 0 {
			return 0, nil, linuxerr.EINVAL
		}
	}

	// Move data.
	var (
		n   int64
		err error
	)
	dw := dualWaiter{
		inFile:  inFile,
		outFile: outFile,
	}
	defer dw.destroy()
	for {
		// If both input and output are pipes, delegate to the pipe
		// implementation. Otherwise, exactly one end is a pipe, which
		// we ensure is consistently ordered after the non-pipe FD's
		// locks by passing the pipe FD as usermem.IO to the non-pipe
		// end.
		switch {
		case inIsPipe && outIsPipe:
			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
		case inIsPipe:
			n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count)
			// Only an explicitly supplied offset is advanced here.
			if outOffset != -1 {
				outOffset += n
			}
		case outIsPipe:
			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count)
			// Only an explicitly supplied offset is advanced here.
			if inOffset != -1 {
				inOffset += n
			}
		default:
			panic("at least one end of splice must be a pipe")
		}

		// Stop unless nothing was moved, the attempt would have blocked,
		// and the operation is allowed to block.
		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
			break
		}
		// Wait for readiness and retry; a failed wait ends the loop with
		// that error.
		if err = dw.waitForBoth(t); err != nil {
			break
		}
	}

	// Copy updated offsets out. A failure here is returned in place of the
	// byte count.
	if inOffsetPtr != 0 {
		if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil {
			return 0, nil, err
		}
	}
	if outOffsetPtr != 0 {
		if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil {
			return 0, nil, err
		}
	}

	// We can only pass a single file to HandleIOErrorVFS2, so pick outFile
	// arbitrarily. This is used only for debugging purposes.
	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile)
}
   178  
   179  // Tee implements Linux syscall tee(2).
   180  func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   181  	inFD := args[0].Int()
   182  	outFD := args[1].Int()
   183  	count := int64(args[2].SizeT())
   184  	flags := args[3].Int()
   185  
   186  	if count == 0 {
   187  		return 0, nil, nil
   188  	}
   189  	if count > int64(kernel.MAX_RW_COUNT) {
   190  		count = int64(kernel.MAX_RW_COUNT)
   191  	}
   192  	if count < 0 {
   193  		return 0, nil, linuxerr.EINVAL
   194  	}
   195  
   196  	// Check for invalid flags.
   197  	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
   198  		return 0, nil, linuxerr.EINVAL
   199  	}
   200  
   201  	// Get file descriptions.
   202  	inFile := t.GetFileVFS2(inFD)
   203  	if inFile == nil {
   204  		return 0, nil, linuxerr.EBADF
   205  	}
   206  	defer inFile.DecRef(t)
   207  	outFile := t.GetFileVFS2(outFD)
   208  	if outFile == nil {
   209  		return 0, nil, linuxerr.EBADF
   210  	}
   211  	defer outFile.DecRef(t)
   212  
   213  	// Check that both files support the required directionality.
   214  	if !inFile.IsReadable() || !outFile.IsWritable() {
   215  		return 0, nil, linuxerr.EBADF
   216  	}
   217  
   218  	// The operation is non-blocking if anything is non-blocking.
   219  	//
   220  	// N.B. This is a rather simplistic heuristic that avoids some
   221  	// poor edge case behavior since the exact semantics here are
   222  	// underspecified and vary between versions of Linux itself.
   223  	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
   224  
   225  	// Both file descriptions must represent pipes.
   226  	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
   227  	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
   228  	if !inIsPipe || !outIsPipe {
   229  		return 0, nil, linuxerr.EINVAL
   230  	}
   231  
   232  	// Copy data.
   233  	var (
   234  		n   int64
   235  		err error
   236  	)
   237  	dw := dualWaiter{
   238  		inFile:  inFile,
   239  		outFile: outFile,
   240  	}
   241  	defer dw.destroy()
   242  	for {
   243  		n, err = pipe.Tee(t, outPipeFD, inPipeFD, count)
   244  		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
   245  			break
   246  		}
   247  		if err = dw.waitForBoth(t); err != nil {
   248  			break
   249  		}
   250  	}
   251  
   252  	if n != 0 {
   253  		// If a partial write is completed, the error is dropped. Log it here.
   254  		if err != nil && err != io.EOF && err != syserror.ErrWouldBlock {
   255  			log.Debugf("tee completed a partial write with error: %v", err)
   256  			err = nil
   257  		}
   258  	}
   259  
   260  	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
   261  	// This is used only for debugging purposes.
   262  	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile)
   263  }
   264  
   265  // Sendfile implements linux system call sendfile(2).
   266  func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   267  	outFD := args[0].Int()
   268  	inFD := args[1].Int()
   269  	offsetAddr := args[2].Pointer()
   270  	count := int64(args[3].SizeT())
   271  
   272  	inFile := t.GetFileVFS2(inFD)
   273  	if inFile == nil {
   274  		return 0, nil, linuxerr.EBADF
   275  	}
   276  	defer inFile.DecRef(t)
   277  	if !inFile.IsReadable() {
   278  		return 0, nil, linuxerr.EBADF
   279  	}
   280  
   281  	outFile := t.GetFileVFS2(outFD)
   282  	if outFile == nil {
   283  		return 0, nil, linuxerr.EBADF
   284  	}
   285  	defer outFile.DecRef(t)
   286  	if !outFile.IsWritable() {
   287  		return 0, nil, linuxerr.EBADF
   288  	}
   289  
   290  	// Verify that the outFile Append flag is not set.
   291  	if outFile.StatusFlags()&linux.O_APPEND != 0 {
   292  		return 0, nil, linuxerr.EINVAL
   293  	}
   294  
   295  	// Verify that inFile is a regular file or block device. This is a
   296  	// requirement; the same check appears in Linux
   297  	// (fs/splice.c:splice_direct_to_actor).
   298  	if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil {
   299  		return 0, nil, err
   300  	} else if stat.Mask&linux.STATX_TYPE == 0 ||
   301  		(stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) {
   302  		return 0, nil, linuxerr.EINVAL
   303  	}
   304  
   305  	// Copy offset if it exists.
   306  	offset := int64(-1)
   307  	if offsetAddr != 0 {
   308  		if inFile.Options().DenyPRead {
   309  			return 0, nil, linuxerr.ESPIPE
   310  		}
   311  		var offsetP primitive.Int64
   312  		if _, err := offsetP.CopyIn(t, offsetAddr); err != nil {
   313  			return 0, nil, err
   314  		}
   315  		offset = int64(offsetP)
   316  
   317  		if offset < 0 {
   318  			return 0, nil, linuxerr.EINVAL
   319  		}
   320  		if offset+count < 0 {
   321  			return 0, nil, linuxerr.EINVAL
   322  		}
   323  	}
   324  
   325  	// Validate count. This must come after offset checks.
   326  	if count < 0 {
   327  		return 0, nil, linuxerr.EINVAL
   328  	}
   329  	if count == 0 {
   330  		return 0, nil, nil
   331  	}
   332  	if count > int64(kernel.MAX_RW_COUNT) {
   333  		count = int64(kernel.MAX_RW_COUNT)
   334  	}
   335  
   336  	// Copy data.
   337  	var (
   338  		total int64
   339  		err   error
   340  	)
   341  	dw := dualWaiter{
   342  		inFile:  inFile,
   343  		outFile: outFile,
   344  	}
   345  	defer dw.destroy()
   346  	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
   347  	// Reading from input file should never block, since it is regular or
   348  	// block device. We only need to check if writing to the output file
   349  	// can block.
   350  	nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0
   351  	if outIsPipe {
   352  		for {
   353  			var n int64
   354  			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total)
   355  			if offset != -1 {
   356  				offset += n
   357  			}
   358  			total += n
   359  			if total == count {
   360  				break
   361  			}
   362  			if err == nil && t.Interrupted() {
   363  				err = syserror.ErrInterrupted
   364  				break
   365  			}
   366  			if err == syserror.ErrWouldBlock && !nonBlock {
   367  				err = dw.waitForBoth(t)
   368  			}
   369  			if err != nil {
   370  				break
   371  			}
   372  		}
   373  	} else {
   374  		// Read inFile to buffer, then write the contents to outFile.
   375  		buf := make([]byte, count)
   376  		for {
   377  			var readN int64
   378  			if offset != -1 {
   379  				readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{})
   380  				offset += readN
   381  			} else {
   382  				readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
   383  			}
   384  
   385  			// Write all of the bytes that we read. This may need
   386  			// multiple write calls to complete.
   387  			wbuf := buf[:readN]
   388  			for len(wbuf) > 0 {
   389  				var writeN int64
   390  				writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{})
   391  				wbuf = wbuf[writeN:]
   392  				if err == syserror.ErrWouldBlock && !nonBlock {
   393  					err = dw.waitForOut(t)
   394  				}
   395  				if err != nil {
   396  					// We didn't complete the write. Only report the bytes that were actually
   397  					// written, and rewind offsets as needed.
   398  					notWritten := int64(len(wbuf))
   399  					readN -= notWritten
   400  					if offset == -1 {
   401  						// We modified the offset of the input file itself during the read
   402  						// operation. Rewind it.
   403  						if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil {
   404  							// Log the error but don't return it, since the write has already
   405  							// completed successfully.
   406  							log.Warningf("failed to roll back input file offset: %v", seekErr)
   407  						}
   408  					} else {
   409  						// The sendfile call was provided an offset parameter that should be
   410  						// adjusted to reflect the number of bytes sent. Rewind it.
   411  						offset -= notWritten
   412  					}
   413  					break
   414  				}
   415  			}
   416  
   417  			total += readN
   418  			buf = buf[readN:]
   419  			if total == count {
   420  				break
   421  			}
   422  			if err == nil && t.Interrupted() {
   423  				err = syserror.ErrInterrupted
   424  				break
   425  			}
   426  			if err == syserror.ErrWouldBlock && !nonBlock {
   427  				err = dw.waitForBoth(t)
   428  			}
   429  			if err != nil {
   430  				break
   431  			}
   432  		}
   433  	}
   434  
   435  	if offsetAddr != 0 {
   436  		// Copy out the new offset.
   437  		offsetP := primitive.Uint64(offset)
   438  		if _, err := offsetP.CopyOut(t, offsetAddr); err != nil {
   439  			return 0, nil, err
   440  		}
   441  	}
   442  
   443  	if total != 0 {
   444  		if err != nil && err != io.EOF && err != syserror.ErrWouldBlock {
   445  			// If a partial write is completed, the error is dropped. Log it here.
   446  			log.Debugf("sendfile completed a partial write with error: %v", err)
   447  			err = nil
   448  		}
   449  	}
   450  
   451  	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
   452  	// This is used only for debugging purposes.
   453  	return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, syserror.ERESTARTSYS, "sendfile", inFile)
   454  }
   455  
// dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
// thread-safe, and does not take a reference on the vfs.FileDescriptions.
//
// Waiter entries are registered lazily by the wait* methods; a nil channel
// means the corresponding entry has not been registered yet.
//
// Users must call destroy() when finished.
type dualWaiter struct {
	// inFile is the file waited on for read readiness.
	inFile *vfs.FileDescription
	// outFile is the file waited on for write readiness.
	outFile *vfs.FileDescription

	// inW and inCh are the waiter entry and notification channel registered
	// on inFile by waitForBoth.
	inW  waiter.Entry
	inCh chan struct{}
	// outW and outCh are the waiter entry and notification channel
	// registered on outFile by waitForOut.
	outW  waiter.Entry
	outCh chan struct{}
}
   469  
   470  // waitForBoth waits for both dw.inFile and dw.outFile to be ready.
   471  func (dw *dualWaiter) waitForBoth(t *kernel.Task) error {
   472  	if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
   473  		if dw.inCh == nil {
   474  			dw.inW, dw.inCh = waiter.NewChannelEntry(nil)
   475  			dw.inFile.EventRegister(&dw.inW, eventMaskRead)
   476  			// We might be ready now. Try again before blocking.
   477  			return nil
   478  		}
   479  		if err := t.Block(dw.inCh); err != nil {
   480  			return err
   481  		}
   482  	}
   483  	return dw.waitForOut(t)
   484  }
   485  
   486  // waitForOut waits for dw.outfile to be read.
   487  func (dw *dualWaiter) waitForOut(t *kernel.Task) error {
   488  	// Don't bother checking readiness of the outFile, because it's not a
   489  	// guarantee that it won't return EWOULDBLOCK. Both pipes and eventfds
   490  	// can be "ready" but will reject writes of certain sizes with
   491  	// EWOULDBLOCK. See b/172075629, b/170743336.
   492  	if dw.outCh == nil {
   493  		dw.outW, dw.outCh = waiter.NewChannelEntry(nil)
   494  		dw.outFile.EventRegister(&dw.outW, eventMaskWrite)
   495  		// We might be ready to write now. Try again before blocking.
   496  		return nil
   497  	}
   498  	return t.Block(dw.outCh)
   499  }
   500  
   501  // destroy cleans up resources help by dw. No more calls to wait* can occur
   502  // after destroy is called.
   503  func (dw *dualWaiter) destroy() {
   504  	if dw.inCh != nil {
   505  		dw.inFile.EventUnregister(&dw.inW)
   506  		dw.inCh = nil
   507  	}
   508  	if dw.outCh != nil {
   509  		dw.outFile.EventUnregister(&dw.outW)
   510  		dw.outCh = nil
   511  	}
   512  	dw.inFile = nil
   513  	dw.outFile = nil
   514  }