github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/fuse/dev.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package fuse
    16  
    17  import (
    18  	"golang.org/x/sys/unix"
    19  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    20  	"github.com/SagerNet/gvisor/pkg/context"
    21  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    22  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    23  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    25  	"github.com/SagerNet/gvisor/pkg/sync"
    26  	"github.com/SagerNet/gvisor/pkg/syserror"
    27  	"github.com/SagerNet/gvisor/pkg/usermem"
    28  	"github.com/SagerNet/gvisor/pkg/waiter"
    29  )
    30  
// fuseDevMinor is the device minor number for the FUSE character device.
// The value 229 is the one Linux assigns to FUSE_MINOR.
// NOTE(review): the device registration that consumes this constant is not
// visible in this file section.
const fuseDevMinor = 229
    32  
// fuseDevice implements vfs.Device for /dev/fuse.
//
// The type carries no state of its own: each Open call builds a fresh,
// independent DeviceFD.
//
// +stateify savable
type fuseDevice struct{}
    37  
    38  // Open implements vfs.Device.Open.
    39  func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
    40  	if !kernel.FUSEEnabled {
    41  		return nil, syserror.ENOENT
    42  	}
    43  
    44  	var fd DeviceFD
    45  	if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
    46  		UseDentryMetadata: true,
    47  	}); err != nil {
    48  		return nil, err
    49  	}
    50  	return &fd.vfsfd, nil
    51  }
    52  
// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
//
// The FUSE server reads requests from the FD (Read) and writes responses back
// to it (Write). All request/response bookkeeping below is guarded by mu.
//
// +stateify savable
type DeviceFD struct {
	// vfsfd is the VFS file description backing this FD; it is initialized
	// in fuseDevice.Open and handed back to the opener.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	// nextOpID is used to create new requests.
	nextOpID linux.FUSEOpID

	// queue is the list of requests that need to be processed by the FUSE server.
	queue requestList

	// numActiveRequests is the number of requests made by the Sentry that have
	// yet to be responded to.
	numActiveRequests uint64

	// completions is used to map a request to its response. A Writer will use this
	// to notify the caller of a completed response.
	completions map[linux.FUSEOpID]*futureResponse

	// writeCursor counts the bytes of the response currently being written
	// by the server that have already been copied in (header bytes first,
	// then payload). It is reset to 0 once that response has been delivered.
	writeCursor uint32

	// writeBuf is the memory buffer used to copy in the FUSE out header from
	// userspace.
	writeBuf []byte

	// writeCursorFR is the futureResponse currently being filled in from the
	// server. It is nil until a complete response header has been received.
	writeCursorFR *futureResponse

	// mu protects all the queues, maps, buffers and cursors and nextOpID.
	mu sync.Mutex `state:"nosave"`

	// waitQueue is used to notify interested parties when the device becomes
	// readable or writable.
	waitQueue waiter.Queue

	// fullQueueCh is a channel used to synchronize the readers with the writers.
	// Writers (inbound requests to the filesystem) block if there are too many
	// unprocessed in-flight requests.
	//
	// Only the channel's capacity is saved and restored; see saveFullQueueCh
	// and loadFullQueueCh.
	fullQueueCh chan struct{} `state:".(int)"`

	// fs is the FUSE filesystem that this FD is being used for. A reference is
	// held on fs. It is nil until a filesystem is attached, and again after
	// Release.
	fs *filesystem
}
   101  
   102  func (fd *DeviceFD) saveFullQueueCh() int {
   103  	return cap(fd.fullQueueCh)
   104  }
   105  
   106  func (fd *DeviceFD) loadFullQueueCh(capacity int) {
   107  	fd.fullQueueCh = make(chan struct{}, capacity)
   108  }
   109  
   110  // Release implements vfs.FileDescriptionImpl.Release.
   111  func (fd *DeviceFD) Release(ctx context.Context) {
   112  	if fd.fs != nil {
   113  		fd.fs.conn.mu.Lock()
   114  		fd.fs.conn.connected = false
   115  		fd.fs.conn.mu.Unlock()
   116  
   117  		fd.fs.VFSFilesystem().DecRef(ctx)
   118  		fd.fs = nil
   119  	}
   120  }
   121  
   122  // PRead implements vfs.FileDescriptionImpl.PRead.
   123  func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   124  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
   125  	if fd.fs == nil {
   126  		return 0, linuxerr.EPERM
   127  	}
   128  
   129  	return 0, syserror.ENOSYS
   130  }
   131  
   132  // Read implements vfs.FileDescriptionImpl.Read.
   133  func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   134  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
   135  	if fd.fs == nil {
   136  		return 0, linuxerr.EPERM
   137  	}
   138  
   139  	// We require that any Read done on this filesystem have a sane minimum
   140  	// read buffer. It must have the capacity for the fixed parts of any request
   141  	// header (Linux uses the request header and the FUSEWriteIn header for this
   142  	// calculation) + the negotiated MaxWrite room for the data.
   143  	minBuffSize := linux.FUSE_MIN_READ_BUFFER
   144  	inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
   145  	writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes())
   146  	negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite
   147  	if minBuffSize < negotiatedMinBuffSize {
   148  		minBuffSize = negotiatedMinBuffSize
   149  	}
   150  
   151  	// If the read buffer is too small, error out.
   152  	if dst.NumBytes() < int64(minBuffSize) {
   153  		return 0, linuxerr.EINVAL
   154  	}
   155  
   156  	fd.mu.Lock()
   157  	defer fd.mu.Unlock()
   158  	return fd.readLocked(ctx, dst, opts)
   159  }
   160  
   161  // readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
   162  //
   163  // Preconditions: dst is large enough for any reasonable request.
   164  func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   165  	var req *Request
   166  
   167  	// Find the first valid request.
   168  	// For the normal case this loop only execute once.
   169  	for !fd.queue.Empty() {
   170  		req = fd.queue.Front()
   171  
   172  		if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() {
   173  			break
   174  		}
   175  
   176  		// The request is too large. Cannot process it. All requests must be smaller than the
   177  		// negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
   178  		// handshake.
   179  		errno := -int32(unix.EIO)
   180  		if req.hdr.Opcode == linux.FUSE_SETXATTR {
   181  			errno = -int32(unix.E2BIG)
   182  		}
   183  
   184  		// Return the error to the calling task.
   185  		if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
   186  			return 0, err
   187  		}
   188  
   189  		// We're done with this request.
   190  		fd.queue.Remove(req)
   191  		req = nil
   192  	}
   193  
   194  	if req == nil {
   195  		return 0, syserror.ErrWouldBlock
   196  	}
   197  
   198  	// We already checked the size: dst must be able to fit the whole request.
   199  	// Now we write the marshalled header, the payload,
   200  	// and the potential additional payload
   201  	// to the user memory IOSequence.
   202  
   203  	n, err := dst.CopyOut(ctx, req.data)
   204  	if err != nil {
   205  		return 0, err
   206  	}
   207  	if n != len(req.data) {
   208  		return 0, syserror.EIO
   209  	}
   210  
   211  	if req.hdr.Opcode == linux.FUSE_WRITE {
   212  		written, err := dst.DropFirst(n).CopyOut(ctx, req.payload)
   213  		if err != nil {
   214  			return 0, err
   215  		}
   216  		if written != len(req.payload) {
   217  			return 0, syserror.EIO
   218  		}
   219  		n += int(written)
   220  	}
   221  
   222  	// Fully done with this req, remove it from the queue.
   223  	fd.queue.Remove(req)
   224  
   225  	// Remove noReply ones from map of requests expecting a reply.
   226  	if req.noReply {
   227  		fd.numActiveRequests -= 1
   228  		delete(fd.completions, req.hdr.Unique)
   229  	}
   230  
   231  	return int64(n), nil
   232  }
   233  
   234  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   235  func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   236  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
   237  	if fd.fs == nil {
   238  		return 0, linuxerr.EPERM
   239  	}
   240  
   241  	return 0, syserror.ENOSYS
   242  }
   243  
   244  // Write implements vfs.FileDescriptionImpl.Write.
   245  func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   246  	fd.mu.Lock()
   247  	defer fd.mu.Unlock()
   248  	return fd.writeLocked(ctx, src, opts)
   249  }
   250  
   251  // writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
   252  func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   253  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
   254  	if fd.fs == nil {
   255  		return 0, linuxerr.EPERM
   256  	}
   257  
   258  	// Return ENODEV if the filesystem is umounted.
   259  	if fd.fs.umounted {
   260  		return 0, linuxerr.ENODEV
   261  	}
   262  
   263  	var cn, n int64
   264  	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
   265  
   266  	for src.NumBytes() > 0 {
   267  		if fd.writeCursorFR != nil {
   268  			// Already have common header, and we're now copying the payload.
   269  			wantBytes := fd.writeCursorFR.hdr.Len
   270  
   271  			// Note that the FR data doesn't have the header. Copy it over if its necessary.
   272  			if fd.writeCursorFR.data == nil {
   273  				fd.writeCursorFR.data = make([]byte, wantBytes)
   274  			}
   275  
   276  			bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes])
   277  			if err != nil {
   278  				return 0, err
   279  			}
   280  			src = src.DropFirst(bytesCopied)
   281  
   282  			cn = int64(bytesCopied)
   283  			n += cn
   284  			fd.writeCursor += uint32(cn)
   285  			if fd.writeCursor == wantBytes {
   286  				// Done reading this full response. Clean up and unblock the
   287  				// initiator.
   288  				break
   289  			}
   290  
   291  			// Check if we have more data in src.
   292  			continue
   293  		}
   294  
   295  		// Assert that the header isn't read into the writeBuf yet.
   296  		if fd.writeCursor >= hdrLen {
   297  			return 0, linuxerr.EINVAL
   298  		}
   299  
   300  		// We don't have the full common response header yet.
   301  		wantBytes := hdrLen - fd.writeCursor
   302  		bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:wantBytes])
   303  		if err != nil {
   304  			return 0, err
   305  		}
   306  		src = src.DropFirst(bytesCopied)
   307  
   308  		cn = int64(bytesCopied)
   309  		n += cn
   310  		fd.writeCursor += uint32(cn)
   311  		if fd.writeCursor == hdrLen {
   312  			// Have full header in the writeBuf. Use it to fetch the actual futureResponse
   313  			// from the device's completions map.
   314  			var hdr linux.FUSEHeaderOut
   315  			hdr.UnmarshalBytes(fd.writeBuf)
   316  
   317  			// We have the header now and so the writeBuf has served its purpose.
   318  			// We could reset it manually here but instead of doing that, at the
   319  			// end of the write, the writeCursor will be set to 0 thereby allowing
   320  			// the next request to overwrite whats in the buffer,
   321  
   322  			fut, ok := fd.completions[hdr.Unique]
   323  			if !ok {
   324  				// Server sent us a response for a request we never sent,
   325  				// or for which we already received a reply (e.g. aborted), an unlikely event.
   326  				return 0, linuxerr.EINVAL
   327  			}
   328  
   329  			delete(fd.completions, hdr.Unique)
   330  
   331  			// Copy over the header into the future response. The rest of the payload
   332  			// will be copied over to the FR's data in the next iteration.
   333  			fut.hdr = &hdr
   334  			fd.writeCursorFR = fut
   335  
   336  			// Next iteration will now try read the complete request, if src has
   337  			// any data remaining. Otherwise we're done.
   338  		}
   339  	}
   340  
   341  	if fd.writeCursorFR != nil {
   342  		if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil {
   343  			return 0, err
   344  		}
   345  
   346  		// Ready the device for the next request.
   347  		fd.writeCursorFR = nil
   348  		fd.writeCursor = 0
   349  	}
   350  
   351  	return n, nil
   352  }
   353  
   354  // Readiness implements vfs.FileDescriptionImpl.Readiness.
   355  func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
   356  	fd.mu.Lock()
   357  	defer fd.mu.Unlock()
   358  	return fd.readinessLocked(mask)
   359  }
   360  
   361  // readinessLocked implements checking the readiness of the fuse device while
   362  // locked with DeviceFD.mu.
   363  func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
   364  	var ready waiter.EventMask
   365  
   366  	if fd.fs == nil || fd.fs.umounted {
   367  		ready |= waiter.EventErr
   368  		return ready & mask
   369  	}
   370  
   371  	// FD is always writable.
   372  	ready |= waiter.WritableEvents
   373  	if !fd.queue.Empty() {
   374  		// Have reqs available, FD is readable.
   375  		ready |= waiter.ReadableEvents
   376  	}
   377  
   378  	return ready & mask
   379  }
   380  
   381  // EventRegister implements waiter.Waitable.EventRegister.
   382  func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
   383  	fd.waitQueue.EventRegister(e, mask)
   384  }
   385  
   386  // EventUnregister implements waiter.Waitable.EventUnregister.
   387  func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
   388  	fd.waitQueue.EventUnregister(e)
   389  }
   390  
   391  // Seek implements vfs.FileDescriptionImpl.Seek.
   392  func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   393  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
   394  	if fd.fs == nil {
   395  		return 0, linuxerr.EPERM
   396  	}
   397  
   398  	return 0, syserror.ENOSYS
   399  }
   400  
   401  // sendResponse sends a response to the waiting task (if any).
   402  //
   403  // Preconditions: fd.mu must be held.
   404  func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
   405  	// Signal the task waiting on a response if any.
   406  	defer close(fut.ch)
   407  
   408  	// Signal that the queue is no longer full.
   409  	select {
   410  	case fd.fullQueueCh <- struct{}{}:
   411  	default:
   412  	}
   413  	fd.numActiveRequests--
   414  
   415  	if fut.async {
   416  		return fd.asyncCallBack(ctx, fut.getResponse())
   417  	}
   418  
   419  	return nil
   420  }
   421  
   422  // sendError sends an error response to the waiting task (if any) by calling sendResponse().
   423  //
   424  // Preconditions: fd.mu must be held.
   425  func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
   426  	// Return the error to the calling task.
   427  	outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
   428  	respHdr := linux.FUSEHeaderOut{
   429  		Len:    outHdrLen,
   430  		Error:  errno,
   431  		Unique: unique,
   432  	}
   433  
   434  	fut, ok := fd.completions[respHdr.Unique]
   435  	if !ok {
   436  		// A response for a request we never sent,
   437  		// or for which we already received a reply (e.g. aborted).
   438  		return linuxerr.EINVAL
   439  	}
   440  	delete(fd.completions, respHdr.Unique)
   441  
   442  	fut.hdr = &respHdr
   443  	return fd.sendResponse(ctx, fut)
   444  }
   445  
   446  // asyncCallBack executes pre-defined callback function for async requests.
   447  // Currently used by: FUSE_INIT.
   448  func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
   449  	switch r.opcode {
   450  	case linux.FUSE_INIT:
   451  		creds := auth.CredentialsFromContext(ctx)
   452  		rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
   453  		return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
   454  		// TODO(github.com/SagerNet/issue/3247): support async read: correctly process the response.
   455  	}
   456  
   457  	return nil
   458  }