github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/pipe/vfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package pipe
    16  
    17  import (
    18  	"github.com/metacubex/gvisor/pkg/abi/linux"
    19  	"github.com/metacubex/gvisor/pkg/context"
    20  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    21  	"github.com/metacubex/gvisor/pkg/hostarch"
    22  	"github.com/metacubex/gvisor/pkg/log"
    23  	"github.com/metacubex/gvisor/pkg/safemem"
    24  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    25  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    26  	"github.com/metacubex/gvisor/pkg/usermem"
    27  	"github.com/metacubex/gvisor/pkg/waiter"
    28  )
    29  
    30  // This file contains types enabling the pipe package to be used with the vfs
    31  // package.
    32  
// VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should
// not be copied.
//
// File descriptions for a VFSPipe are created through Open or
// ReaderWriterPair; every such description holds a pointer to the pipe field
// of the VFSPipe it was created from.
//
// +stateify savable
type VFSPipe struct {
	// pipe is the underlying pipe. It is stored by value and initialized by
	// NewVFSPipe via initPipe.
	pipe Pipe
}
    41  
    42  // NewVFSPipe returns an initialized VFSPipe.
    43  func NewVFSPipe(isNamed bool, sizeBytes int64) *VFSPipe {
    44  	var vp VFSPipe
    45  	initPipe(&vp.pipe, isNamed, sizeBytes)
    46  	return &vp
    47  }
    48  
    49  // ReaderWriterPair returns read-only and write-only FDs for vp.
    50  //
    51  // Preconditions: statusFlags should not contain an open access mode.
    52  func (vp *VFSPipe) ReaderWriterPair(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription, error) {
    53  	// Connected pipes share the same locks.
    54  	locks := &vfs.FileLocks{}
    55  	r, err := vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks)
    56  	if err != nil {
    57  		return nil, nil, err
    58  	}
    59  	vp.pipe.rOpen()
    60  	w, err := vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
    61  	if err != nil {
    62  		r.DecRef(ctx)
    63  		return nil, nil, err
    64  	}
    65  	vp.pipe.wOpen()
    66  	return r, w, nil
    67  }
    68  
// Allocate has the same parameter list as VFSPipeFD.Allocate and, like it,
// unconditionally fails with ESPIPE. None of the arguments are inspected.
func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
	return linuxerr.ESPIPE
}
    73  
    74  // Open opens the pipe represented by vp.
    75  func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
    76  	readable := vfs.MayReadFileWithOpenFlags(statusFlags)
    77  	writable := vfs.MayWriteFileWithOpenFlags(statusFlags)
    78  	if !readable && !writable {
    79  		return nil, linuxerr.EINVAL
    80  	}
    81  
    82  	fd, err := vp.newFD(mnt, vfsd, statusFlags, locks)
    83  	if err != nil {
    84  		return nil, err
    85  	}
    86  
    87  	// Named pipes have special blocking semantics during open:
    88  	//
    89  	// "Normally, opening the FIFO blocks until the other end is opened also. A
    90  	// process can open a FIFO in nonblocking mode. In this case, opening for
    91  	// read-only will succeed even if no-one has opened on the write side yet,
    92  	// opening for write-only will fail with ENXIO (no such device or address)
    93  	// unless the other end has already been opened. Under Linux, opening a
    94  	// FIFO for read and write will succeed both in blocking and nonblocking
    95  	// mode. POSIX leaves this behavior undefined. This can be used to open a
    96  	// FIFO for writing while there are no readers available." - fifo(7)
    97  	switch {
    98  	case readable && writable:
    99  		vp.pipe.rOpen()
   100  		vp.pipe.wOpen()
   101  		// Pipes opened for read-write always succeed without blocking.
   102  
   103  	case readable:
   104  		tWriters := vp.pipe.totalWriters.Load()
   105  		vp.pipe.rOpen()
   106  		// If this pipe is being opened as blocking and there's no
   107  		// writer, we have to wait for a writer to open the other end.
   108  		for vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() &&
   109  			tWriters == vp.pipe.totalWriters.Load() {
   110  			if !ctx.BlockOn((*waitWriters)(&vp.pipe), waiter.EventInternal) {
   111  				fd.DecRef(ctx)
   112  				return nil, linuxerr.EINTR
   113  			}
   114  		}
   115  
   116  	case writable:
   117  		tReaders := vp.pipe.totalReaders.Load()
   118  		vp.pipe.wOpen()
   119  		for vp.pipe.isNamed && !vp.pipe.HasReaders() &&
   120  			tReaders == vp.pipe.totalReaders.Load() {
   121  			// Non-blocking, write-only opens fail with ENXIO when the read
   122  			// side isn't open yet.
   123  			if statusFlags&linux.O_NONBLOCK != 0 {
   124  				fd.DecRef(ctx)
   125  				return nil, linuxerr.ENXIO
   126  			}
   127  			if !ctx.BlockOn((*waitReaders)(&vp.pipe), waiter.EventInternal) {
   128  				fd.DecRef(ctx)
   129  				return nil, linuxerr.EINTR
   130  			}
   131  		}
   132  
   133  	default:
   134  		panic("invalid pipe flags: must be readable, writable, or both")
   135  	}
   136  
   137  	return fd, nil
   138  }
   139  
   140  // Preconditions: vp.mu must be held.
   141  func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
   142  	fd := &VFSPipeFD{
   143  		pipe: &vp.pipe,
   144  	}
   145  	fd.LockFD.Init(locks)
   146  	if err := fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
   147  		DenyPRead:         true,
   148  		DenyPWrite:        true,
   149  		UseDentryMetadata: true,
   150  	}); err != nil {
   151  		return nil, err
   152  	}
   153  
   154  	return &fd.vfsfd, nil
   155  }
   156  
// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
// other FileDescriptions for splice(2) and tee(2).
//
// +stateify savable
type VFSPipeFD struct {
	// vfsfd is the vfs.FileDescription backed by this implementation; it is
	// initialized in VFSPipe.newFD and is what callers receive.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD

	// pipe points at the Pipe of the VFSPipe this FD was created from.
	pipe *Pipe

	// lastAddr is the last hostarch.Addr at which a call to a
	// VFSPipeFD.(usermem.IO) method ended. lastAddr is protected by pipe.mu.
	//
	// SpliceToNonPipe and SpliceFromNonPipe reset it to 0 before each
	// transfer; the usermem.IO methods reject calls whose start address does
	// not equal lastAddr.
	lastAddr hostarch.Addr
}
   174  
   175  // Release implements vfs.FileDescriptionImpl.Release.
   176  func (fd *VFSPipeFD) Release(context.Context) {
   177  	var event waiter.EventMask
   178  	if fd.vfsfd.IsReadable() {
   179  		fd.pipe.rClose()
   180  		event |= waiter.WritableEvents
   181  		if !fd.pipe.HasReaders() {
   182  			event |= waiter.EventErr
   183  		}
   184  	}
   185  	if fd.vfsfd.IsWritable() {
   186  		fd.pipe.wClose()
   187  		event |= waiter.ReadableEvents | waiter.EventHUp
   188  	}
   189  	if event == 0 {
   190  		panic("invalid pipe flags: must be readable, writable, or both")
   191  	}
   192  
   193  	fd.pipe.queue.Notify(event)
   194  }
   195  
   196  // Readiness implements waiter.Waitable.Readiness.
   197  func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
   198  	switch {
   199  	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
   200  		return fd.pipe.rwReadiness()
   201  	case fd.vfsfd.IsReadable():
   202  		return fd.pipe.rReadiness()
   203  	case fd.vfsfd.IsWritable():
   204  		return fd.pipe.wReadiness()
   205  	default:
   206  		panic("pipe FD is neither readable nor writable")
   207  	}
   208  }
   209  
// Allocate implements vfs.FileDescriptionImpl.Allocate.
//
// It unconditionally fails with ESPIPE; ctx, mode, offset and length are not
// inspected.
func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
	return linuxerr.ESPIPE
}
   214  
   215  // EventRegister implements waiter.Waitable.EventRegister.
   216  func (fd *VFSPipeFD) EventRegister(e *waiter.Entry) error {
   217  	fd.pipe.EventRegister(e)
   218  
   219  	// Notify synchronously.
   220  	e.NotifyEvent(fd.Readiness(^waiter.EventMask(0)))
   221  	return nil
   222  }
   223  
// EventUnregister implements waiter.Waitable.EventUnregister by forwarding e
// to the underlying pipe's EventUnregister.
func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) {
	fd.pipe.EventUnregister(e)
}
   228  
// Epollable implements FileDescriptionImpl.Epollable. It always reports true.
func (fd *VFSPipeFD) Epollable() bool {
	return true
}
   233  
// Read implements vfs.FileDescriptionImpl.Read by delegating to the
// underlying pipe's Read. The vfs.ReadOptions argument is ignored.
func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
	return fd.pipe.Read(ctx, dst)
}
   238  
// Write implements vfs.FileDescriptionImpl.Write by delegating to the
// underlying pipe's Write. The vfs.WriteOptions argument is ignored.
func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
	return fd.pipe.Write(ctx, src)
}
   243  
// Ioctl implements vfs.FileDescriptionImpl.Ioctl by forwarding all arguments
// to the underlying pipe's Ioctl.
func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
	return fd.pipe.Ioctl(ctx, uio, sysno, args)
}
   248  
   249  // PipeSize implements fcntl(F_GETPIPE_SZ).
   250  func (fd *VFSPipeFD) PipeSize() int64 {
   251  	// Inline Pipe.FifoSize() since we don't have a fs.File.
   252  	fd.pipe.mu.Lock()
   253  	defer fd.pipe.mu.Unlock()
   254  	return fd.pipe.max
   255  }
   256  
// SetPipeSize implements fcntl(F_SETPIPE_SZ) by delegating to the underlying
// pipe's SetFifoSize and returning its results unchanged.
func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
	return fd.pipe.SetFifoSize(size)
}
   261  
   262  // SpliceToNonPipe performs a splice operation from fd to a non-pipe file.
   263  func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
   264  	fd.pipe.mu.Lock()
   265  
   266  	// Cap the sequence at number of bytes actually available.
   267  	if count > fd.pipe.size {
   268  		count = fd.pipe.size
   269  	}
   270  	src := usermem.IOSequence{
   271  		IO:    fd,
   272  		Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}),
   273  	}
   274  
   275  	var (
   276  		n   int64
   277  		err error
   278  	)
   279  	fd.lastAddr = 0
   280  	if off == -1 {
   281  		n, err = out.Write(ctx, src, vfs.WriteOptions{})
   282  	} else {
   283  		n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
   284  	}
   285  	// Implementations of out.[P]Write() that ignore written data (e.g.
   286  	// /dev/null) may skip calling src.CopyIn[To](), so:
   287  	//
   288  	// - We must call Pipe.consumeLocked() here rather than in fd.CopyIn[To]().
   289  	//
   290  	// - We must check if Pipe.peekLocked() would have returned ErrWouldBlock.
   291  	fd.pipe.consumeLocked(n)
   292  	if n == 0 && err == nil && fd.pipe.size == 0 && fd.pipe.HasWriters() {
   293  		err = linuxerr.ErrWouldBlock
   294  	}
   295  
   296  	fd.pipe.mu.Unlock()
   297  
   298  	if n > 0 {
   299  		fd.pipe.queue.Notify(waiter.WritableEvents)
   300  	}
   301  	return n, err
   302  }
   303  
   304  // SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
   305  func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
   306  	dst := usermem.IOSequence{
   307  		IO:    fd,
   308  		Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}),
   309  	}
   310  
   311  	var (
   312  		n   int64
   313  		err error
   314  	)
   315  	fd.pipe.mu.Lock()
   316  	fd.lastAddr = 0
   317  	if off == -1 {
   318  		n, err = in.Read(ctx, dst, vfs.ReadOptions{})
   319  	} else {
   320  		n, err = in.PRead(ctx, dst, off, vfs.ReadOptions{})
   321  	}
   322  	fd.pipe.mu.Unlock()
   323  
   324  	if n > 0 {
   325  		fd.pipe.queue.Notify(waiter.ReadableEvents)
   326  	}
   327  	return n, err
   328  }
   329  
   330  // CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
   331  // responsibility to call fd.pipe.Notify(waiter.WritableEvents) after the read
   332  // is completed.
   333  //
   334  // Preconditions: fd.pipe.mu must be locked.
   335  func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
   336  	if addr != fd.lastAddr {
   337  		log.Traceback("Non-sequential VFSPipeFD.CopyIn: lastAddr=%#x addr=%#x", fd.lastAddr, addr)
   338  		return 0, linuxerr.EINVAL
   339  	}
   340  	n, err := fd.pipe.peekLocked(int64(addr), int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) {
   341  		return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs)
   342  	})
   343  	fd.lastAddr = addr + hostarch.Addr(n)
   344  	return int(n), err
   345  }
   346  
   347  // CopyOut implements usermem.IO.CopyOut. Note that it is the caller's
   348  // responsibility to call fd.pipe.queue.Notify(waiter.ReadableEvents) after the
   349  // write is completed.
   350  //
   351  // Preconditions: fd.pipe.mu must be locked.
   352  func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) {
   353  	if addr != fd.lastAddr {
   354  		log.Traceback("Non-sequential VFSPipeFD.CopyOut: lastAddr=%#x addr=%#x", fd.lastAddr, addr)
   355  		return 0, linuxerr.EINVAL
   356  	}
   357  	n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) {
   358  		return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
   359  	})
   360  	fd.lastAddr = addr + hostarch.Addr(n)
   361  	return int(n), err
   362  }
   363  
   364  // ZeroOut implements usermem.IO.ZeroOut.
   365  //
   366  // Preconditions: fd.pipe.mu must be locked.
   367  func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
   368  	if addr != fd.lastAddr {
   369  		log.Traceback("Non-sequential VFSPipeFD.ZeroOut: lastAddr=%#x addr=%#x", fd.lastAddr, addr)
   370  		return 0, linuxerr.EINVAL
   371  	}
   372  	n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) {
   373  		return safemem.ZeroSeq(dsts)
   374  	})
   375  	fd.lastAddr = addr + hostarch.Addr(n)
   376  	return n, err
   377  }
   378  
   379  // CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
   380  // responsibility to call fd.pipe.consumeLocked() and
   381  // fd.pipe.queue.Notify(waiter.WritableEvents) after the read is completed.
   382  //
   383  // Preconditions: fd.pipe.mu must be locked.
   384  func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
   385  	total := int64(0)
   386  	for !ars.IsEmpty() {
   387  		ar := ars.Head()
   388  		if ar.Start != fd.lastAddr {
   389  			log.Traceback("Non-sequential VFSPipeFD.CopyInTo: lastAddr=%#x addr=%#x", fd.lastAddr, ar.Start)
   390  			return total, linuxerr.EINVAL
   391  		}
   392  		n, err := fd.pipe.peekLocked(int64(ar.Start), int64(ar.Length()), func(srcs safemem.BlockSeq) (uint64, error) {
   393  			return dst.WriteFromBlocks(srcs)
   394  		})
   395  		fd.lastAddr = ar.Start + hostarch.Addr(n)
   396  		total += n
   397  		if err != nil {
   398  			return total, err
   399  		}
   400  		ars = ars.Tail()
   401  	}
   402  	return total, nil
   403  }
   404  
   405  // CopyOutFrom implements usermem.IO.CopyOutFrom. Note that it is the caller's
   406  // responsibility to call fd.pipe.queue.Notify(waiter.ReadableEvents) after the
   407  // write is completed.
   408  //
   409  // Preconditions: fd.pipe.mu must be locked.
   410  func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
   411  	total := int64(0)
   412  	for !ars.IsEmpty() {
   413  		ar := ars.Head()
   414  		if ar.Start != fd.lastAddr {
   415  			log.Traceback("Non-sequential VFSPipeFD.CopyOutFrom: lastAddr=%#x addr=%#x", fd.lastAddr, ar.Start)
   416  			return total, linuxerr.EINVAL
   417  		}
   418  		n, err := fd.pipe.writeLocked(int64(ar.Length()), func(dsts safemem.BlockSeq) (uint64, error) {
   419  			return src.ReadToBlocks(dsts)
   420  		})
   421  		fd.lastAddr = ar.Start + hostarch.Addr(n)
   422  		total += n
   423  		if err != nil {
   424  			return total, err
   425  		}
   426  		ars = ars.Tail()
   427  	}
   428  	return total, nil
   429  }
   430  
// SwapUint32 implements usermem.IO.SwapUint32.
//
// It is not supported on pipes and always panics.
func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
	// How did a pipe get passed as the virtual address space to futex(2)?
	panic("VFSPipeFD.SwapUint32 called unexpectedly")
}
   436  
// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
//
// It is not supported on pipes and always panics.
func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
	panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly")
}
   441  
// LoadUint32 implements usermem.IO.LoadUint32.
//
// It is not supported on pipes and always panics.
func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) {
	panic("VFSPipeFD.LoadUint32 called unexpectedly")
}
   446  
// Splice reads up to count bytes from src and writes them to dst. It returns
// the number of bytes moved.
//
// The moved bytes are removed from src. EINVAL is returned if dst and src
// refer to the same pipe.
//
// Preconditions: count > 0.
func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
	return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */)
}
   454  
// Tee reads up to count bytes from src and writes them to dst, without
// removing the read bytes from src. It returns the number of bytes copied.
//
// EINVAL is returned if dst and src refer to the same pipe.
//
// Preconditions: count > 0.
func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
	return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */)
}
   462  
   463  // Preconditions: count > 0.
   464  func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) {
   465  	if dst.pipe == src.pipe {
   466  		return 0, linuxerr.EINVAL
   467  	}
   468  
   469  	firstLocked, secondLocked := lockTwoPipes(dst.pipe, src.pipe)
   470  	n, err := dst.pipe.writeLocked(count, func(dsts safemem.BlockSeq) (uint64, error) {
   471  		n, err := src.pipe.peekLocked(0, int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) {
   472  			return safemem.CopySeq(dsts, srcs)
   473  		})
   474  		if n > 0 && removeFromSrc {
   475  			src.pipe.consumeLocked(n)
   476  		}
   477  		return uint64(n), err
   478  	})
   479  	secondLocked.mu.NestedUnlock(pipeLockPipe)
   480  	firstLocked.mu.Unlock()
   481  
   482  	if n > 0 {
   483  		dst.pipe.queue.Notify(waiter.ReadableEvents)
   484  		if removeFromSrc {
   485  			src.pipe.queue.Notify(waiter.WritableEvents)
   486  		}
   487  	}
   488  	return n, err
   489  }