github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/syscalls/linux/sys_splice.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"io"
    19  
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/marshal/primitive"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/pipe"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    30  )
    31  
    32  // Splice implements Linux syscall splice(2).
    33  func Splice(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    34  	inFD := args[0].Int()
    35  	inOffsetPtr := args[1].Pointer()
    36  	outFD := args[2].Int()
    37  	outOffsetPtr := args[3].Pointer()
    38  	count := int64(args[4].SizeT())
    39  	flags := args[5].Int()
    40  
    41  	if count == 0 {
    42  		return 0, nil, nil
    43  	}
    44  	if count > int64(kernel.MAX_RW_COUNT) {
    45  		count = int64(kernel.MAX_RW_COUNT)
    46  	}
    47  	if count < 0 {
    48  		return 0, nil, linuxerr.EINVAL
    49  	}
    50  
    51  	// Check for invalid flags.
    52  	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
    53  		return 0, nil, linuxerr.EINVAL
    54  	}
    55  
    56  	// Get file descriptions.
    57  	inFile := t.GetFile(inFD)
    58  	if inFile == nil {
    59  		return 0, nil, linuxerr.EBADF
    60  	}
    61  	defer inFile.DecRef(t)
    62  	outFile := t.GetFile(outFD)
    63  	if outFile == nil {
    64  		return 0, nil, linuxerr.EBADF
    65  	}
    66  	defer outFile.DecRef(t)
    67  
    68  	// Check that both files support the required directionality.
    69  	if !inFile.IsReadable() || !outFile.IsWritable() {
    70  		return 0, nil, linuxerr.EBADF
    71  	}
    72  	if outFile.Options().DenySpliceIn {
    73  		return 0, nil, linuxerr.EINVAL
    74  	}
    75  
    76  	// The operation is non-blocking if anything is non-blocking.
    77  	//
    78  	// N.B. This is a rather simplistic heuristic that avoids some
    79  	// poor edge case behavior since the exact semantics here are
    80  	// underspecified and vary between versions of Linux itself.
    81  	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
    82  
    83  	// At least one file description must represent a pipe.
    84  	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
    85  	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
    86  	if !inIsPipe && !outIsPipe {
    87  		return 0, nil, linuxerr.EINVAL
    88  	}
    89  
    90  	// Copy in offsets.
    91  	inOffset := int64(-1)
    92  	if inOffsetPtr != 0 {
    93  		if inIsPipe {
    94  			return 0, nil, linuxerr.ESPIPE
    95  		}
    96  		if inFile.Options().DenyPRead {
    97  			return 0, nil, linuxerr.EINVAL
    98  		}
    99  		if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil {
   100  			return 0, nil, err
   101  		}
   102  		if inOffset < 0 {
   103  			return 0, nil, linuxerr.EINVAL
   104  		}
   105  	}
   106  	outOffset := int64(-1)
   107  	if outOffsetPtr != 0 {
   108  		if outIsPipe {
   109  			return 0, nil, linuxerr.ESPIPE
   110  		}
   111  		if outFile.Options().DenyPWrite {
   112  			return 0, nil, linuxerr.EINVAL
   113  		}
   114  		if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil {
   115  			return 0, nil, err
   116  		}
   117  		if outOffset < 0 {
   118  			return 0, nil, linuxerr.EINVAL
   119  		}
   120  	}
   121  
   122  	// Move data.
   123  	var (
   124  		n   int64
   125  		err error
   126  	)
   127  	dw := dualWaiter{
   128  		inFile:  inFile,
   129  		outFile: outFile,
   130  	}
   131  	defer dw.destroy()
   132  	for {
   133  		// If both input and output are pipes, delegate to the pipe
   134  		// implementation. Otherwise, exactly one end is a pipe, which
   135  		// we ensure is consistently ordered after the non-pipe FD's
   136  		// locks by passing the pipe FD as usermem.IO to the non-pipe
   137  		// end.
   138  		switch {
   139  		case inIsPipe && outIsPipe:
   140  			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
   141  		case inIsPipe:
   142  			n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count)
   143  			if outOffset != -1 {
   144  				outOffset += n
   145  			}
   146  		case outIsPipe:
   147  			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count)
   148  			if inOffset != -1 {
   149  				inOffset += n
   150  			}
   151  		default:
   152  			panic("at least one end of splice must be a pipe")
   153  		}
   154  
   155  		if n != 0 || !linuxerr.Equals(linuxerr.ErrWouldBlock, err) || nonBlock {
   156  			break
   157  		}
   158  		if err = dw.waitForBoth(t); err != nil {
   159  			break
   160  		}
   161  	}
   162  
   163  	// Copy updated offsets out.
   164  	if inOffsetPtr != 0 {
   165  		if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil {
   166  			return 0, nil, err
   167  		}
   168  	}
   169  	if outOffsetPtr != 0 {
   170  		if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil {
   171  			return 0, nil, err
   172  		}
   173  	}
   174  
   175  	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
   176  	// This is used only for debugging purposes.
   177  	return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "splice", outFile)
   178  }
   179  
   180  // Tee implements Linux syscall tee(2).
   181  func Tee(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   182  	inFD := args[0].Int()
   183  	outFD := args[1].Int()
   184  	count := int64(args[2].SizeT())
   185  	flags := args[3].Int()
   186  
   187  	if count == 0 {
   188  		return 0, nil, nil
   189  	}
   190  	if count > int64(kernel.MAX_RW_COUNT) {
   191  		count = int64(kernel.MAX_RW_COUNT)
   192  	}
   193  	if count < 0 {
   194  		return 0, nil, linuxerr.EINVAL
   195  	}
   196  
   197  	// Check for invalid flags.
   198  	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
   199  		return 0, nil, linuxerr.EINVAL
   200  	}
   201  
   202  	// Get file descriptions.
   203  	inFile := t.GetFile(inFD)
   204  	if inFile == nil {
   205  		return 0, nil, linuxerr.EBADF
   206  	}
   207  	defer inFile.DecRef(t)
   208  	outFile := t.GetFile(outFD)
   209  	if outFile == nil {
   210  		return 0, nil, linuxerr.EBADF
   211  	}
   212  	defer outFile.DecRef(t)
   213  
   214  	// Check that both files support the required directionality.
   215  	if !inFile.IsReadable() || !outFile.IsWritable() {
   216  		return 0, nil, linuxerr.EBADF
   217  	}
   218  	if outFile.Options().DenySpliceIn {
   219  		return 0, nil, linuxerr.EINVAL
   220  	}
   221  
   222  	// The operation is non-blocking if anything is non-blocking.
   223  	//
   224  	// N.B. This is a rather simplistic heuristic that avoids some
   225  	// poor edge case behavior since the exact semantics here are
   226  	// underspecified and vary between versions of Linux itself.
   227  	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
   228  
   229  	// Both file descriptions must represent pipes.
   230  	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
   231  	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
   232  	if !inIsPipe || !outIsPipe {
   233  		return 0, nil, linuxerr.EINVAL
   234  	}
   235  
   236  	// Copy data.
   237  	var (
   238  		n   int64
   239  		err error
   240  	)
   241  	dw := dualWaiter{
   242  		inFile:  inFile,
   243  		outFile: outFile,
   244  	}
   245  	defer dw.destroy()
   246  	for {
   247  		n, err = pipe.Tee(t, outPipeFD, inPipeFD, count)
   248  		if n != 0 || !linuxerr.Equals(linuxerr.ErrWouldBlock, err) || nonBlock {
   249  			break
   250  		}
   251  		if err = dw.waitForBoth(t); err != nil {
   252  			break
   253  		}
   254  	}
   255  
   256  	if n != 0 {
   257  		// If a partial write is completed, the error is dropped. Log it here.
   258  		if err != nil && err != io.EOF && !linuxerr.Equals(linuxerr.ErrWouldBlock, err) {
   259  			log.Debugf("tee completed a partial write with error: %v", err)
   260  			err = nil
   261  		}
   262  	}
   263  
   264  	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
   265  	// This is used only for debugging purposes.
   266  	return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "tee", inFile)
   267  }
   268  
   269  // Sendfile implements linux system call sendfile(2).
   270  func Sendfile(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   271  	outFD := args[0].Int()
   272  	inFD := args[1].Int()
   273  	offsetAddr := args[2].Pointer()
   274  	count := int64(args[3].SizeT())
   275  
   276  	inFile := t.GetFile(inFD)
   277  	if inFile == nil {
   278  		return 0, nil, linuxerr.EBADF
   279  	}
   280  	defer inFile.DecRef(t)
   281  	if !inFile.IsReadable() {
   282  		return 0, nil, linuxerr.EBADF
   283  	}
   284  
   285  	outFile := t.GetFile(outFD)
   286  	if outFile == nil {
   287  		return 0, nil, linuxerr.EBADF
   288  	}
   289  	defer outFile.DecRef(t)
   290  	if !outFile.IsWritable() {
   291  		return 0, nil, linuxerr.EBADF
   292  	}
   293  	if outFile.Options().DenySpliceIn {
   294  		return 0, nil, linuxerr.EINVAL
   295  	}
   296  
   297  	// Verify that the outFile Append flag is not set.
   298  	if outFile.StatusFlags()&linux.O_APPEND != 0 {
   299  		return 0, nil, linuxerr.EINVAL
   300  	}
   301  
   302  	// Verify that inFile is a regular file or block device. This is a
   303  	// requirement; the same check appears in Linux
   304  	// (fs/splice.c:splice_direct_to_actor).
   305  	if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil {
   306  		return 0, nil, err
   307  	} else if stat.Mask&linux.STATX_TYPE == 0 ||
   308  		(stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) {
   309  		return 0, nil, linuxerr.EINVAL
   310  	}
   311  
   312  	// Copy offset if it exists.
   313  	offset := int64(-1)
   314  	if offsetAddr != 0 {
   315  		if inFile.Options().DenyPRead {
   316  			return 0, nil, linuxerr.ESPIPE
   317  		}
   318  		var offsetP primitive.Int64
   319  		if _, err := offsetP.CopyIn(t, offsetAddr); err != nil {
   320  			return 0, nil, err
   321  		}
   322  		offset = int64(offsetP)
   323  
   324  		if offset < 0 {
   325  			return 0, nil, linuxerr.EINVAL
   326  		}
   327  		if offset+count < 0 {
   328  			return 0, nil, linuxerr.EINVAL
   329  		}
   330  	}
   331  
   332  	// Validate count. This must come after offset checks.
   333  	if count < 0 {
   334  		return 0, nil, linuxerr.EINVAL
   335  	}
   336  	if count == 0 {
   337  		return 0, nil, nil
   338  	}
   339  	if count > int64(kernel.MAX_RW_COUNT) {
   340  		count = int64(kernel.MAX_RW_COUNT)
   341  	}
   342  
   343  	// Copy data.
   344  	var (
   345  		total int64
   346  		err   error
   347  	)
   348  	dw := dualWaiter{
   349  		inFile:  inFile,
   350  		outFile: outFile,
   351  	}
   352  	defer dw.destroy()
   353  	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
   354  	// Reading from input file should never block, since it is regular or
   355  	// block device. We only need to check if writing to the output file
   356  	// can block.
   357  	nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0
   358  	if outIsPipe {
   359  		for {
   360  			var n int64
   361  			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total)
   362  			if offset != -1 {
   363  				offset += n
   364  			}
   365  			total += n
   366  			if total == count {
   367  				break
   368  			}
   369  			if err == nil && t.Interrupted() {
   370  				err = linuxerr.ErrInterrupted
   371  				break
   372  			}
   373  			if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock {
   374  				err = dw.waitForBoth(t)
   375  			}
   376  			if err != nil {
   377  				break
   378  			}
   379  		}
   380  	} else {
   381  		// Read inFile to buffer, then write the contents to outFile.
   382  		//
   383  		// The buffer size has to be limited to avoid large memory
   384  		// allocations and long delays. In Linux, the buffer size is
   385  		// limited by a size of an internl pipe. Here, we repeat this
   386  		// behavior.
   387  		bufSize := count
   388  		if bufSize > pipe.MaximumPipeSize {
   389  			bufSize = pipe.MaximumPipeSize
   390  		}
   391  		buf := make([]byte, bufSize)
   392  		for {
   393  			if int64(len(buf)) > count-total {
   394  				buf = buf[:count-total]
   395  			}
   396  			var readN int64
   397  			if offset != -1 {
   398  				readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{})
   399  				offset += readN
   400  			} else {
   401  				readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
   402  			}
   403  
   404  			// Write all of the bytes that we read. This may need
   405  			// multiple write calls to complete.
   406  			wbuf := buf[:readN]
   407  			for len(wbuf) > 0 {
   408  				var writeN int64
   409  				writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{})
   410  				wbuf = wbuf[writeN:]
   411  				if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock {
   412  					err = dw.waitForOut(t)
   413  				}
   414  				if err != nil {
   415  					// We didn't complete the write. Only report the bytes that were actually
   416  					// written, and rewind offsets as needed.
   417  					notWritten := int64(len(wbuf))
   418  					readN -= notWritten
   419  					if offset == -1 {
   420  						// We modified the offset of the input file itself during the read
   421  						// operation. Rewind it.
   422  						if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil {
   423  							// Log the error but don't return it, since the write has already
   424  							// completed successfully.
   425  							log.Warningf("failed to roll back input file offset: %v", seekErr)
   426  						}
   427  					} else {
   428  						// The sendfile call was provided an offset parameter that should be
   429  						// adjusted to reflect the number of bytes sent. Rewind it.
   430  						offset -= notWritten
   431  					}
   432  					break
   433  				}
   434  			}
   435  
   436  			total += readN
   437  			if total == count {
   438  				break
   439  			}
   440  			if err == nil && t.Interrupted() {
   441  				err = linuxerr.ErrInterrupted
   442  				break
   443  			}
   444  			if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock {
   445  				err = dw.waitForBoth(t)
   446  			}
   447  			if err != nil {
   448  				break
   449  			}
   450  		}
   451  	}
   452  
   453  	if offsetAddr != 0 {
   454  		// Copy out the new offset.
   455  		offsetP := primitive.Uint64(offset)
   456  		if _, err := offsetP.CopyOut(t, offsetAddr); err != nil {
   457  			return 0, nil, err
   458  		}
   459  	}
   460  
   461  	if total != 0 {
   462  		if err != nil && err != io.EOF && !linuxerr.Equals(linuxerr.ErrWouldBlock, err) {
   463  			// If a partial write is completed, the error is dropped. Log it here.
   464  			log.Debugf("sendfile completed a partial write with error: %v", err)
   465  			err = nil
   466  		}
   467  	}
   468  
   469  	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
   470  	// This is used only for debugging purposes.
   471  	return uintptr(total), nil, HandleIOError(t, total != 0, err, linuxerr.ERESTARTSYS, "sendfile", inFile)
   472  }
   473  
   474  // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
   475  // thread-safe, and does not take a reference on the vfs.FileDescriptions.
   476  //
   477  // Users must call destroy() when finished.
   478  type dualWaiter struct {
   479  	inFile  *vfs.FileDescription
   480  	outFile *vfs.FileDescription
   481  
   482  	inW   waiter.Entry
   483  	inCh  chan struct{}
   484  	outW  waiter.Entry
   485  	outCh chan struct{}
   486  }
   487  
   488  // waitForBoth waits for both dw.inFile and dw.outFile to be ready.
   489  func (dw *dualWaiter) waitForBoth(t *kernel.Task) error {
   490  	if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
   491  		if dw.inCh == nil {
   492  			dw.inW, dw.inCh = waiter.NewChannelEntry(eventMaskRead)
   493  			if err := dw.inFile.EventRegister(&dw.inW); err != nil {
   494  				return err
   495  			}
   496  			// We might be ready now. Try again before blocking.
   497  			return nil
   498  		}
   499  		if err := t.Block(dw.inCh); err != nil {
   500  			return err
   501  		}
   502  	}
   503  	return dw.waitForOut(t)
   504  }
   505  
   506  // waitForOut waits for dw.outfile to be read.
   507  func (dw *dualWaiter) waitForOut(t *kernel.Task) error {
   508  	// Don't bother checking readiness of the outFile, because it's not a
   509  	// guarantee that it won't return EWOULDBLOCK. Both pipes and eventfds
   510  	// can be "ready" but will reject writes of certain sizes with
   511  	// EWOULDBLOCK. See b/172075629, b/170743336.
   512  	if dw.outCh == nil {
   513  		dw.outW, dw.outCh = waiter.NewChannelEntry(eventMaskWrite)
   514  		if err := dw.outFile.EventRegister(&dw.outW); err != nil {
   515  			return err
   516  		}
   517  		// We might be ready to write now. Try again before blocking.
   518  		return nil
   519  	}
   520  	return t.Block(dw.outCh)
   521  }
   522  
   523  // destroy cleans up resources help by dw. No more calls to wait* can occur
   524  // after destroy is called.
   525  func (dw *dualWaiter) destroy() {
   526  	if dw.inCh != nil {
   527  		dw.inFile.EventUnregister(&dw.inW)
   528  		dw.inCh = nil
   529  	}
   530  	if dw.outCh != nil {
   531  		dw.outFile.EventUnregister(&dw.outW)
   532  		dw.outCh = nil
   533  	}
   534  	dw.inFile = nil
   535  	dw.outFile = nil
   536  }