github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/fuse/dev.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package fuse
    16  
    17  import (
    18  	"golang.org/x/sys/unix"
    19  	"github.com/metacubex/gvisor/pkg/abi/linux"
    20  	"github.com/metacubex/gvisor/pkg/context"
    21  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    22  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    23  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    24  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    25  	"github.com/metacubex/gvisor/pkg/sync"
    26  	"github.com/metacubex/gvisor/pkg/usermem"
    27  	"github.com/metacubex/gvisor/pkg/waiter"
    28  )
    29  
// fuseDevMinor is the minor device number used for /dev/fuse.
const fuseDevMinor = 229

// fuseHeaderOutSize is the size in bytes of the FUSE out header, i.e. the
// fixed header that DeviceFD.Write copies in from the FUSE server before
// looking at any payload. It is expected to equal linux.SizeOfFUSEHeaderOut
// (not SizeOfFUSEHeaderIn); it is spelled as an untyped constant here because
// it is used as the length of the DeviceFD.writeBuf array.
const fuseHeaderOutSize = 16
    34  
// fuseDevice implements vfs.Device for /dev/fuse.
//
// It carries no state of its own: every Open call produces an independent
// DeviceFD.
//
// +stateify savable
type fuseDevice struct{}
    39  
    40  // Open implements vfs.Device.Open.
    41  func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
    42  	var fd DeviceFD
    43  	if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
    44  		UseDentryMetadata: true,
    45  	}); err != nil {
    46  		return nil, err
    47  	}
    48  	return &fd.vfsfd, nil
    49  }
    50  
// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
//
// The FUSE server reads pending requests from the FD (Read) and writes
// replies back to it (Write). All request bookkeeping is guarded by mu.
//
// +stateify savable
type DeviceFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	// waitQueue is used to notify interested parties when the device becomes
	// readable or writable.
	waitQueue waiter.Queue

	// fullQueueCh is a channel used to synchronize the readers with the writers.
	// Writers (inbound requests to the filesystem) block if there are too many
	// unprocessed in-flight requests. sendResponse performs a non-blocking send
	// on it each time a request completes.
	fullQueueCh chan struct{} `state:".(int)"`

	// mu protects all the queues, maps, buffers and cursors and nextOpID.
	mu sync.Mutex `state:"nosave"`

	// nextOpID is used to create new requests.
	// +checklocks:mu
	nextOpID linux.FUSEOpID

	// queue is the list of requests that need to be processed by the FUSE server.
	// Read hands out requests from the front of this list.
	// +checklocks:mu
	queue requestList

	// numActiveRequests is the number of requests made by the Sentry that have
	// yet to be responded to.
	// +checklocks:mu
	numActiveRequests uint64

	// completions is used to map a request to its response. A Writer will use this
	// to notify the caller of a completed response. Entries are keyed by the
	// request's unique ID and are removed once a reply (or error) is delivered.
	// +checklocks:mu
	completions map[linux.FUSEOpID]*futureResponse

	// writeBuf is the memory buffer used to copy in the FUSE out header from
	// userspace.
	// +checklocks:mu
	writeBuf [fuseHeaderOutSize]byte

	// conn is the FUSE connection that this FD is being used for. It is nil
	// when no connection is attached (see connected and Release).
	// +checklocks:mu
	conn *connection
}
    99  
   100  // Release implements vfs.FileDescriptionImpl.Release.
   101  func (fd *DeviceFD) Release(ctx context.Context) {
   102  	fd.mu.Lock()
   103  	defer fd.mu.Unlock()
   104  	if fd.conn != nil {
   105  		fd.conn.mu.Lock()
   106  		fd.conn.connected = false
   107  		fd.conn.mu.Unlock()
   108  
   109  		fd.conn.Abort(ctx) // +checklocksforce: fd.conn.fd.mu=fd.mu
   110  		fd.waitQueue.Notify(waiter.ReadableEvents)
   111  		fd.conn = nil
   112  	}
   113  }
   114  
   115  // connected returns true if fd.conn is set and the connection has not been
   116  // aborted.
   117  // +checklocks:fd.mu
   118  func (fd *DeviceFD) connected() bool {
   119  	if fd.conn != nil {
   120  		fd.conn.mu.Lock()
   121  		defer fd.conn.mu.Unlock()
   122  		return fd.conn.connected
   123  	}
   124  	return false
   125  }
   126  
   127  // PRead implements vfs.FileDescriptionImpl.PRead.
   128  func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   129  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is
   130  	// mounted. If there is an active connection we know there is at least one
   131  	// filesystem mounted.
   132  	fd.mu.Lock()
   133  	defer fd.mu.Unlock()
   134  	if !fd.connected() {
   135  		return 0, linuxerr.EPERM
   136  	}
   137  
   138  	return 0, linuxerr.ENOSYS
   139  }
   140  
   141  // Read implements vfs.FileDescriptionImpl.Read.
   142  func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   143  	fd.mu.Lock()
   144  	defer fd.mu.Unlock()
   145  	if !fd.connected() {
   146  		return 0, linuxerr.EPERM
   147  	}
   148  	// We require that any Read done on this filesystem have a sane minimum
   149  	// read buffer. It must have the capacity for the fixed parts of any request
   150  	// header (Linux uses the request header and the FUSEWriteIn header for this
   151  	// calculation) + the negotiated MaxWrite room for the data.
   152  	minBuffSize := linux.FUSE_MIN_READ_BUFFER
   153  	fd.conn.mu.Lock()
   154  	negotiatedMinBuffSize := linux.SizeOfFUSEHeaderIn + linux.SizeOfFUSEHeaderOut + fd.conn.maxWrite
   155  	fd.conn.mu.Unlock()
   156  	if minBuffSize < negotiatedMinBuffSize {
   157  		minBuffSize = negotiatedMinBuffSize
   158  	}
   159  
   160  	// If the read buffer is too small, error out.
   161  	if dst.NumBytes() < int64(minBuffSize) {
   162  		return 0, linuxerr.EINVAL
   163  	}
   164  	// Find the first valid request. For the normal case this loop only executes
   165  	// once.
   166  	var req *Request
   167  	for req = fd.queue.Front(); !fd.queue.Empty(); req = fd.queue.Front() {
   168  		if int64(req.hdr.Len) <= dst.NumBytes() {
   169  			break
   170  		}
   171  		// The request is too large so we cannot process it. All requests must be
   172  		// smaller than the negotiated size as specified by Connection.MaxWrite set
   173  		// as part of the FUSE_INIT handshake.
   174  		errno := -int32(unix.EIO)
   175  		if req.hdr.Opcode == linux.FUSE_SETXATTR {
   176  			errno = -int32(unix.E2BIG)
   177  		}
   178  
   179  		if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
   180  			return 0, err
   181  		}
   182  		fd.queue.Remove(req)
   183  		req = nil
   184  	}
   185  	if req == nil {
   186  		return 0, linuxerr.ErrWouldBlock
   187  	}
   188  
   189  	// We already checked the size: dst must be able to fit the whole request.
   190  	n, err := dst.CopyOut(ctx, req.data)
   191  	if err != nil {
   192  		return 0, err
   193  	}
   194  	if n != len(req.data) {
   195  		return 0, linuxerr.EIO
   196  	}
   197  	fd.queue.Remove(req)
   198  	// Remove noReply ones from the map of requests expecting a reply.
   199  	if req.noReply {
   200  		fd.numActiveRequests--
   201  		delete(fd.completions, req.hdr.Unique)
   202  	}
   203  	return int64(n), nil
   204  }
   205  
   206  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   207  func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   208  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is
   209  	// mounted. If there is an active connection we know there is at least one
   210  	// filesystem mounted.
   211  	fd.mu.Lock()
   212  	defer fd.mu.Unlock()
   213  	if !fd.connected() {
   214  		return 0, linuxerr.EPERM
   215  	}
   216  
   217  	return 0, linuxerr.ENOSYS
   218  }
   219  
   220  // Write implements vfs.FileDescriptionImpl.Write.
   221  func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   222  	fd.mu.Lock()
   223  	defer fd.mu.Unlock()
   224  	if !fd.connected() {
   225  		return 0, linuxerr.EPERM
   226  	}
   227  
   228  	n, err := src.CopyIn(ctx, fd.writeBuf[:])
   229  	if err != nil {
   230  		return 0, err
   231  	}
   232  	var hdr linux.FUSEHeaderOut
   233  	hdr.UnmarshalBytes(fd.writeBuf[:])
   234  
   235  	fut, ok := fd.completions[hdr.Unique]
   236  	if !ok {
   237  		// Server sent us a response for a request we never sent, or for which we
   238  		// already received a reply (e.g. aborted), an unlikely event.
   239  		return 0, linuxerr.EINVAL
   240  	}
   241  	delete(fd.completions, hdr.Unique)
   242  
   243  	// Copy over the header into the future response. The rest of the payload
   244  	// will be copied over to the FR's data in the next iteration.
   245  	fut.hdr = &hdr
   246  	fut.data = make([]byte, fut.hdr.Len)
   247  	copy(fut.data, fd.writeBuf[:])
   248  	if fut.hdr.Len > uint32(len(fd.writeBuf)) {
   249  		src = src.DropFirst(len(fd.writeBuf))
   250  		n2, err := src.CopyIn(ctx, fut.data[len(fd.writeBuf):])
   251  		if err != nil {
   252  			return 0, err
   253  		}
   254  		n += n2
   255  	}
   256  	if err := fd.sendResponse(ctx, fut); err != nil {
   257  		return 0, err
   258  	}
   259  	return int64(n), nil
   260  }
   261  
   262  // Readiness implements vfs.FileDescriptionImpl.Readiness.
   263  func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
   264  	fd.mu.Lock()
   265  	defer fd.mu.Unlock()
   266  	var ready waiter.EventMask
   267  
   268  	if !fd.connected() {
   269  		ready |= waiter.EventErr
   270  		return ready & mask
   271  	}
   272  
   273  	// FD is always writable.
   274  	ready |= waiter.WritableEvents
   275  	if !fd.queue.Empty() {
   276  		// Have reqs available, FD is readable.
   277  		ready |= waiter.ReadableEvents
   278  	}
   279  
   280  	return ready & mask
   281  }
   282  
   283  // EventRegister implements waiter.Waitable.EventRegister.
   284  func (fd *DeviceFD) EventRegister(e *waiter.Entry) error {
   285  	fd.mu.Lock()
   286  	defer fd.mu.Unlock()
   287  	fd.waitQueue.EventRegister(e)
   288  	return nil
   289  }
   290  
   291  // EventUnregister implements waiter.Waitable.EventUnregister.
   292  func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
   293  	fd.mu.Lock()
   294  	defer fd.mu.Unlock()
   295  	fd.waitQueue.EventUnregister(e)
   296  }
   297  
   298  // Epollable implements FileDescriptionImpl.Epollable.
   299  func (fd *DeviceFD) Epollable() bool {
   300  	return true
   301  }
   302  
   303  // Seek implements vfs.FileDescriptionImpl.Seek.
   304  func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   305  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is
   306  	// mounted. If there is an active connection we know there is at least one
   307  	// filesystem mounted.
   308  	fd.mu.Lock()
   309  	defer fd.mu.Unlock()
   310  	if !fd.connected() {
   311  		return 0, linuxerr.EPERM
   312  	}
   313  
   314  	return 0, linuxerr.ENOSYS
   315  }
   316  
   317  // sendResponse sends a response to the waiting task (if any).
   318  //
   319  // +checklocks:fd.mu
   320  func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
   321  	// Signal the task waiting on a response if any.
   322  	defer close(fut.ch)
   323  
   324  	// Signal that the queue is no longer full.
   325  	select {
   326  	case fd.fullQueueCh <- struct{}{}:
   327  	default:
   328  	}
   329  	fd.numActiveRequests--
   330  
   331  	if fut.async {
   332  		return fd.asyncCallBack(ctx, fut.getResponse())
   333  	}
   334  
   335  	return nil
   336  }
   337  
   338  // sendError sends an error response to the waiting task (if any) by calling sendResponse().
   339  //
   340  // +checklocks:fd.mu
   341  func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
   342  	// Return the error to the calling task.
   343  	respHdr := linux.FUSEHeaderOut{
   344  		Len:    linux.SizeOfFUSEHeaderOut,
   345  		Error:  errno,
   346  		Unique: unique,
   347  	}
   348  
   349  	fut, ok := fd.completions[respHdr.Unique]
   350  	if !ok {
   351  		// A response for a request we never sent,
   352  		// or for which we already received a reply (e.g. aborted).
   353  		return linuxerr.EINVAL
   354  	}
   355  	delete(fd.completions, respHdr.Unique)
   356  
   357  	fut.hdr = &respHdr
   358  	return fd.sendResponse(ctx, fut)
   359  }
   360  
   361  // asyncCallBack executes pre-defined callback function for async requests.
   362  // Currently used by: FUSE_INIT.
   363  // +checklocks:fd.mu
   364  func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
   365  	switch r.opcode {
   366  	case linux.FUSE_INIT:
   367  		creds := auth.CredentialsFromContext(ctx)
   368  		rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
   369  		return fd.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
   370  		// TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
   371  	}
   372  
   373  	return nil
   374  }