github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/fuse/dev.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package fuse
    16  
    17  import (
    18  	"golang.org/x/sys/unix"
    19  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    28  )
    29  
// fuseDevMinor is the device minor number associated with /dev/fuse.
// NOTE(review): 229 appears to mirror Linux's FUSE_MINOR — confirm against
// the code that registers fuseDevice.
const fuseDevMinor = 229
    31  
    32  // This is equivalent to linux.SizeOfFUSEHeaderOut
    33  const fuseHeaderOutSize = 16
    34  
// fuseDevice implements vfs.Device for /dev/fuse.
//
// The type carries no state of its own: every Open call builds a fresh
// DeviceFD.
//
// +stateify savable
type fuseDevice struct{}
    39  
    40  // Open implements vfs.Device.Open.
    41  func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
    42  	var fd DeviceFD
    43  	if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
    44  		UseDentryMetadata: true,
    45  	}); err != nil {
    46  		return nil, err
    47  	}
    48  	return &fd.vfsfd, nil
    49  }
    50  
// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
//
// It is the endpoint the FUSE server uses to talk to the Sentry: Read hands
// queued requests to the server and Write accepts the server's replies and
// completes the matching pending request.
//
// +stateify savable
type DeviceFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	// waitQueue is used to notify interested parties when the device becomes
	// readable or writable.
	waitQueue waiter.Queue

	// fullQueueCh is a channel used to synchronize the readers with the writers.
	// Writers (inbound requests to the filesystem) block if there are too many
	// unprocessed in-flight requests. sendResponse performs a non-blocking send
	// on it to signal that the queue is no longer full.
	fullQueueCh chan struct{} `state:".(int)"`

	// mu protects all the queues, maps, buffers and cursors and nextOpID.
	mu sync.Mutex `state:"nosave"`

	// nextOpID is used to create new requests.
	// +checklocks:mu
	nextOpID linux.FUSEOpID

	// queue is the list of requests that need to be processed by the FUSE server.
	// Read pops requests from the front of this list.
	// +checklocks:mu
	queue requestList

	// numActiveRequests is the number of requests made by the Sentry that have
	// yet to be responded to.
	// +checklocks:mu
	numActiveRequests uint64

	// completions is used to map a request to its response. A Writer will use this
	// to notify the caller of a completed response. Entries are keyed by the
	// request's unique ID and are deleted once a reply (or an error) has been
	// matched to them.
	// +checklocks:mu
	completions map[linux.FUSEOpID]*futureResponse

	// writeBuf is the memory buffer used to copy in the FUSE out header from
	// userspace. Write unmarshals the reply header from it.
	// +checklocks:mu
	writeBuf [fuseHeaderOutSize]byte

	// conn is the FUSE connection that this FD is being used for. It is nil
	// once Release has torn the connection down.
	// +checklocks:mu
	conn *connection
}
    99  
   100  // Release implements vfs.FileDescriptionImpl.Release.
   101  func (fd *DeviceFD) Release(ctx context.Context) {
   102  	fd.mu.Lock()
   103  	defer fd.mu.Unlock()
   104  	if fd.conn != nil {
   105  		fd.conn.mu.Lock()
   106  		fd.conn.connected = false
   107  		fd.conn.mu.Unlock()
   108  
   109  		fd.conn.Abort(ctx) // +checklocksforce: fd.conn.fd.mu=fd.mu
   110  		fd.waitQueue.Notify(waiter.ReadableEvents)
   111  		fd.conn = nil
   112  	}
   113  }
   114  
   115  // connected returns true if fd.conn is set and the connection has not been
   116  // aborted.
   117  // +checklocks:fd.mu
   118  func (fd *DeviceFD) connected() bool {
   119  	if fd.conn != nil {
   120  		fd.conn.mu.Lock()
   121  		defer fd.conn.mu.Unlock()
   122  		return fd.conn.connected
   123  	}
   124  	return false
   125  }
   126  
   127  // PRead implements vfs.FileDescriptionImpl.PRead.
   128  func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   129  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is
   130  	// mounted. If there is an active connection we know there is at least one
   131  	// filesystem mounted.
   132  	fd.mu.Lock()
   133  	defer fd.mu.Unlock()
   134  	if !fd.connected() {
   135  		return 0, linuxerr.EPERM
   136  	}
   137  
   138  	return 0, linuxerr.ENOSYS
   139  }
   140  
   141  // Read implements vfs.FileDescriptionImpl.Read.
   142  func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   143  	fd.mu.Lock()
   144  	defer fd.mu.Unlock()
   145  	if !fd.connected() {
   146  		return 0, linuxerr.EPERM
   147  	}
   148  	// We require that any Read done on this filesystem have a sane minimum
   149  	// read buffer. It must have the capacity for the fixed parts of any request
   150  	// header (Linux uses the request header and the FUSEWriteIn header for this
   151  	// calculation) + the negotiated MaxWrite room for the data.
   152  	minBuffSize := linux.FUSE_MIN_READ_BUFFER
   153  	fd.conn.mu.Lock()
   154  	negotiatedMinBuffSize := linux.SizeOfFUSEHeaderIn + linux.SizeOfFUSEHeaderOut + fd.conn.maxWrite
   155  	fd.conn.mu.Unlock()
   156  	if minBuffSize < negotiatedMinBuffSize {
   157  		minBuffSize = negotiatedMinBuffSize
   158  	}
   159  
   160  	// If the read buffer is too small, error out.
   161  	if dst.NumBytes() < int64(minBuffSize) {
   162  		return 0, linuxerr.EINVAL
   163  	}
   164  	// Find the first valid request. For the normal case this loop only executes
   165  	// once.
   166  	var req *Request
   167  	for req = fd.queue.Front(); !fd.queue.Empty(); req = fd.queue.Front() {
   168  		if int64(req.hdr.Len) <= dst.NumBytes() {
   169  			break
   170  		}
   171  		// The request is too large so we cannot process it. All requests must be
   172  		// smaller than the negotiated size as specified by Connection.MaxWrite set
   173  		// as part of the FUSE_INIT handshake.
   174  		errno := -int32(unix.EIO)
   175  		if req.hdr.Opcode == linux.FUSE_SETXATTR {
   176  			errno = -int32(unix.E2BIG)
   177  		}
   178  
   179  		if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
   180  			return 0, err
   181  		}
   182  		fd.queue.Remove(req)
   183  		req = nil
   184  	}
   185  	if req == nil {
   186  		return 0, linuxerr.ErrWouldBlock
   187  	}
   188  
   189  	// We already checked the size: dst must be able to fit the whole request.
   190  	n, err := dst.CopyOut(ctx, req.data)
   191  	if err != nil {
   192  		return 0, err
   193  	}
   194  	if n != len(req.data) {
   195  		return 0, linuxerr.EIO
   196  	}
   197  	fd.queue.Remove(req)
   198  	// Remove noReply ones from the map of requests expecting a reply.
   199  	if req.noReply {
   200  		fd.numActiveRequests--
   201  		delete(fd.completions, req.hdr.Unique)
   202  	}
   203  	return int64(n), nil
   204  }
   205  
   206  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   207  func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   208  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is
   209  	// mounted. If there is an active connection we know there is at least one
   210  	// filesystem mounted.
   211  	fd.mu.Lock()
   212  	defer fd.mu.Unlock()
   213  	if !fd.connected() {
   214  		return 0, linuxerr.EPERM
   215  	}
   216  
   217  	return 0, linuxerr.ENOSYS
   218  }
   219  
   220  // Write implements vfs.FileDescriptionImpl.Write.
   221  func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   222  	fd.mu.Lock()
   223  	defer fd.mu.Unlock()
   224  	if !fd.connected() {
   225  		return 0, linuxerr.EPERM
   226  	}
   227  
   228  	if _, err := src.CopyIn(ctx, fd.writeBuf[:]); err != nil {
   229  		return 0, err
   230  	}
   231  	var hdr linux.FUSEHeaderOut
   232  	hdr.UnmarshalBytes(fd.writeBuf[:])
   233  
   234  	fut, ok := fd.completions[hdr.Unique]
   235  	if !ok {
   236  		// Server sent us a response for a request we never sent, or for which we
   237  		// already received a reply (e.g. aborted), an unlikely event.
   238  		return 0, linuxerr.EINVAL
   239  	}
   240  	delete(fd.completions, hdr.Unique)
   241  
   242  	// Copy over the header into the future response. The rest of the payload
   243  	// will be copied over to the FR's data in the next iteration.
   244  	fut.hdr = &hdr
   245  	fut.data = make([]byte, fut.hdr.Len)
   246  	n, err := src.CopyIn(ctx, fut.data)
   247  	if err != nil {
   248  		return 0, err
   249  	}
   250  	if err := fd.sendResponse(ctx, fut); err != nil {
   251  		return 0, err
   252  	}
   253  	return int64(n), nil
   254  }
   255  
   256  // Readiness implements vfs.FileDescriptionImpl.Readiness.
   257  func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
   258  	fd.mu.Lock()
   259  	defer fd.mu.Unlock()
   260  	var ready waiter.EventMask
   261  
   262  	if !fd.connected() {
   263  		ready |= waiter.EventErr
   264  		return ready & mask
   265  	}
   266  
   267  	// FD is always writable.
   268  	ready |= waiter.WritableEvents
   269  	if !fd.queue.Empty() {
   270  		// Have reqs available, FD is readable.
   271  		ready |= waiter.ReadableEvents
   272  	}
   273  
   274  	return ready & mask
   275  }
   276  
   277  // EventRegister implements waiter.Waitable.EventRegister.
   278  func (fd *DeviceFD) EventRegister(e *waiter.Entry) error {
   279  	fd.mu.Lock()
   280  	defer fd.mu.Unlock()
   281  	fd.waitQueue.EventRegister(e)
   282  	return nil
   283  }
   284  
   285  // EventUnregister implements waiter.Waitable.EventUnregister.
   286  func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
   287  	fd.mu.Lock()
   288  	defer fd.mu.Unlock()
   289  	fd.waitQueue.EventUnregister(e)
   290  }
   291  
// Epollable implements FileDescriptionImpl.Epollable.
//
// It unconditionally returns true.
func (fd *DeviceFD) Epollable() bool {
	return true
}
   296  
   297  // Seek implements vfs.FileDescriptionImpl.Seek.
   298  func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   299  	// Operations on /dev/fuse don't make sense until a FUSE filesystem is
   300  	// mounted. If there is an active connection we know there is at least one
   301  	// filesystem mounted.
   302  	fd.mu.Lock()
   303  	defer fd.mu.Unlock()
   304  	if !fd.connected() {
   305  		return 0, linuxerr.EPERM
   306  	}
   307  
   308  	return 0, linuxerr.ENOSYS
   309  }
   310  
   311  // sendResponse sends a response to the waiting task (if any).
   312  //
   313  // +checklocks:fd.mu
   314  func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
   315  	// Signal the task waiting on a response if any.
   316  	defer close(fut.ch)
   317  
   318  	// Signal that the queue is no longer full.
   319  	select {
   320  	case fd.fullQueueCh <- struct{}{}:
   321  	default:
   322  	}
   323  	fd.numActiveRequests--
   324  
   325  	if fut.async {
   326  		return fd.asyncCallBack(ctx, fut.getResponse())
   327  	}
   328  
   329  	return nil
   330  }
   331  
   332  // sendError sends an error response to the waiting task (if any) by calling sendResponse().
   333  //
   334  // +checklocks:fd.mu
   335  func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
   336  	// Return the error to the calling task.
   337  	respHdr := linux.FUSEHeaderOut{
   338  		Len:    linux.SizeOfFUSEHeaderOut,
   339  		Error:  errno,
   340  		Unique: unique,
   341  	}
   342  
   343  	fut, ok := fd.completions[respHdr.Unique]
   344  	if !ok {
   345  		// A response for a request we never sent,
   346  		// or for which we already received a reply (e.g. aborted).
   347  		return linuxerr.EINVAL
   348  	}
   349  	delete(fd.completions, respHdr.Unique)
   350  
   351  	fut.hdr = &respHdr
   352  	return fd.sendResponse(ctx, fut)
   353  }
   354  
   355  // asyncCallBack executes pre-defined callback function for async requests.
   356  // Currently used by: FUSE_INIT.
   357  // +checklocks:fd.mu
   358  func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
   359  	switch r.opcode {
   360  	case linux.FUSE_INIT:
   361  		creds := auth.CredentialsFromContext(ctx)
   362  		rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
   363  		return fd.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
   364  		// TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
   365  	}
   366  
   367  	return nil
   368  }