github.com/scaleoutsean/fusego@v0.0.0-20220224074057-4a6429e46bb8/connection.go (about)

     1  // Copyright 2015 Google Inc. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package fuse
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"io"
    21  	"log"
    22  	"os"
    23  	"path"
    24  	"runtime"
    25  	"sync"
    26  	"syscall"
    27  
    28  	"github.com/scaleoutsean/fusego/fuseops"
    29  	"github.com/scaleoutsean/fusego/internal/buffer"
    30  	"github.com/scaleoutsean/fusego/internal/freelist"
    31  	"github.com/scaleoutsean/fusego/internal/fusekernel"
    32  )
    33  
    34  type contextKeyType uint64
    35  
    36  var contextKey interface{} = contextKeyType(0)
    37  
    38  // Ask the Linux kernel for larger read requests.
    39  //
    40  // As of 2015-03-26, the behavior in the kernel is:
    41  //
    42  //  *  (http://goo.gl/bQ1f1i, http://goo.gl/HwBrR6) Set the local variable
    43  //     ra_pages to be init_response->max_readahead divided by the page size.
    44  //
    45  //  *  (http://goo.gl/gcIsSh, http://goo.gl/LKV2vA) Set
    46  //     backing_dev_info::ra_pages to the min of that value and what was sent
    47  //     in the request's max_readahead field.
    48  //
    49  //  *  (http://goo.gl/u2SqzH) Use backing_dev_info::ra_pages when deciding
    50  //     how much to read ahead.
    51  //
    52  //  *  (http://goo.gl/JnhbdL) Don't read ahead at all if that field is zero.
    53  //
    54  // Reading a page at a time is a drag. Ask for a larger size.
    55  const maxReadahead = 1 << 20
    56  
    57  // Connection represents a connection to the fuse kernel process. It is used to
    58  // receive and reply to requests from the kernel.
    59  type Connection struct {
    60  	cfg         MountConfig
    61  	debugLogger *log.Logger
    62  	errorLogger *log.Logger
    63  
    64  	// The device through which we're talking to the kernel, and the protocol
    65  	// version that we're using to talk to it.
    66  	dev      *os.File
    67  	protocol fusekernel.Protocol
    68  
    69  	mu sync.Mutex
    70  
    71  	// A map from fuse "unique" request ID (*not* the op ID for logging used
    72  	// above) to a function that cancel's its associated context.
    73  	//
    74  	// GUARDED_BY(mu)
    75  	cancelFuncs map[uint64]func()
    76  
    77  	// Freelists, serviced by freelists.go.
    78  	inMessages  freelist.Freelist // GUARDED_BY(mu)
    79  	outMessages freelist.Freelist // GUARDED_BY(mu)
    80  }
    81  
// State that is maintained for each in-flight op. This is stuffed into the
// context that the user uses to reply to the op.
//
// ReadOp constructs this value with a positional composite literal, so the
// order of the fields must not change.
type opState struct {
	inMsg  *buffer.InMessage  // the request message read from the kernel
	outMsg *buffer.OutMessage // the message handed to Reply for the response
	op     interface{}        // the op that was converted from inMsg
}
    89  
    90  // Create a connection wrapping the supplied file descriptor connected to the
    91  // kernel. You must eventually call c.close().
    92  //
    93  // The loggers may be nil.
    94  func newConnection(
    95  	cfg MountConfig,
    96  	debugLogger *log.Logger,
    97  	errorLogger *log.Logger,
    98  	dev *os.File) (*Connection, error) {
    99  	c := &Connection{
   100  		cfg:         cfg,
   101  		debugLogger: debugLogger,
   102  		errorLogger: errorLogger,
   103  		dev:         dev,
   104  		cancelFuncs: make(map[uint64]func()),
   105  	}
   106  
   107  	// Initialize.
   108  	if err := c.Init(); err != nil {
   109  		c.close()
   110  		return nil, fmt.Errorf("Init: %v", err)
   111  	}
   112  
   113  	return c, nil
   114  }
   115  
   116  // Init performs the work necessary to cause the mount process to complete.
   117  func (c *Connection) Init() error {
   118  	// Read the init op.
   119  	ctx, op, err := c.ReadOp()
   120  	if err != nil {
   121  		return fmt.Errorf("Reading init op: %v", err)
   122  	}
   123  
   124  	initOp, ok := op.(*initOp)
   125  	if !ok {
   126  		c.Reply(ctx, syscall.EPROTO)
   127  		return fmt.Errorf("Expected *initOp, got %T", op)
   128  	}
   129  
   130  	// Make sure the protocol version spoken by the kernel is new enough.
   131  	min := fusekernel.Protocol{
   132  		fusekernel.ProtoVersionMinMajor,
   133  		fusekernel.ProtoVersionMinMinor,
   134  	}
   135  
   136  	if initOp.Kernel.LT(min) {
   137  		c.Reply(ctx, syscall.EPROTO)
   138  		return fmt.Errorf("Version too old: %v", initOp.Kernel)
   139  	}
   140  
   141  	// Downgrade our protocol if necessary.
   142  	c.protocol = fusekernel.Protocol{
   143  		fusekernel.ProtoVersionMaxMajor,
   144  		fusekernel.ProtoVersionMaxMinor,
   145  	}
   146  
   147  	if initOp.Kernel.LT(c.protocol) {
   148  		c.protocol = initOp.Kernel
   149  	}
   150  
   151  	cacheSymlinks := initOp.Flags&fusekernel.InitCacheSymlinks > 0
   152  	noOpenSupport := initOp.Flags&fusekernel.InitNoOpenSupport > 0
   153  	noOpendirSupport := initOp.Flags&fusekernel.InitNoOpendirSupport > 0
   154  
   155  	// Respond to the init op.
   156  	initOp.Library = c.protocol
   157  	initOp.MaxReadahead = maxReadahead
   158  	initOp.MaxWrite = buffer.MaxWriteSize
   159  
   160  	initOp.Flags = 0
   161  
   162  	// Tell the kernel not to use pitifully small 4 KiB writes.
   163  	initOp.Flags |= fusekernel.InitBigWrites
   164  	// kernel 4.20 increases the max from 32 -> 256
   165  	initOp.Flags |= fusekernel.InitMaxPages
   166  	initOp.MaxPages = 256
   167  
   168  	// Enable writeback caching if the user hasn't asked us not to.
   169  	if !c.cfg.DisableWritebackCaching {
   170  		initOp.Flags |= fusekernel.InitWritebackCache
   171  	}
   172  
   173  	// Enable caching symlink targets in the kernel page cache if the user opted
   174  	// into it (might require fixing the size field of inode attributes first):
   175  	if c.cfg.EnableSymlinkCaching && cacheSymlinks {
   176  		initOp.Flags |= fusekernel.InitCacheSymlinks
   177  	}
   178  
   179  	// Tell the kernel to treat returning -ENOSYS on OpenFile as not needing
   180  	// OpenFile calls at all (Linux >= 3.16):
   181  	if c.cfg.EnableNoOpenSupport && noOpenSupport {
   182  		initOp.Flags |= fusekernel.InitNoOpenSupport
   183  	}
   184  
   185  	// Tell the kernel to treat returning -ENOSYS on OpenDir as not needing
   186  	// OpenDir calls at all (Linux >= 5.1):
   187  	if c.cfg.EnableNoOpendirSupport && noOpendirSupport {
   188  		initOp.Flags |= fusekernel.InitNoOpendirSupport
   189  	}
   190  
   191  	c.Reply(ctx, nil)
   192  	return nil
   193  }
   194  
   195  // Log information for an operation with the given ID. calldepth is the depth
   196  // to use when recovering file:line information with runtime.Caller.
   197  func (c *Connection) debugLog(
   198  	fuseID uint64,
   199  	calldepth int,
   200  	format string,
   201  	v ...interface{}) {
   202  	if c.debugLogger == nil {
   203  		return
   204  	}
   205  
   206  	// Get file:line info.
   207  	var file string
   208  	var line int
   209  	var ok bool
   210  
   211  	_, file, line, ok = runtime.Caller(calldepth)
   212  	if !ok {
   213  		file = "???"
   214  	}
   215  
   216  	fileLine := fmt.Sprintf("%v:%v", path.Base(file), line)
   217  
   218  	// Format the actual message to be printed.
   219  	msg := fmt.Sprintf(
   220  		"Op 0x%08x %24s] %v",
   221  		fuseID,
   222  		fileLine,
   223  		fmt.Sprintf(format, v...))
   224  
   225  	// Print it.
   226  	c.debugLogger.Println(msg)
   227  }
   228  
   229  // LOCKS_EXCLUDED(c.mu)
   230  func (c *Connection) recordCancelFunc(
   231  	fuseID uint64,
   232  	f func()) {
   233  	c.mu.Lock()
   234  	defer c.mu.Unlock()
   235  
   236  	if _, ok := c.cancelFuncs[fuseID]; ok {
   237  		panic(fmt.Sprintf("Already have cancel func for request %v", fuseID))
   238  	}
   239  
   240  	c.cancelFuncs[fuseID] = f
   241  }
   242  
   243  // Set up state for an op that is about to be returned to the user, given its
   244  // underlying fuse opcode and request ID.
   245  //
   246  // Return a context that should be used for the op.
   247  //
   248  // LOCKS_EXCLUDED(c.mu)
   249  func (c *Connection) beginOp(
   250  	opCode uint32,
   251  	fuseID uint64) context.Context {
   252  	// Start with the parent context.
   253  	ctx := c.cfg.OpContext
   254  
   255  	// Set up a cancellation function.
   256  	//
   257  	// Special case: On Darwin, osxfuse aggressively reuses "unique" request IDs.
   258  	// This matters for Forget requests, which have no reply associated and
   259  	// therefore have IDs that are immediately eligible for reuse. For these, we
   260  	// should not record any state keyed on their ID.
   261  	//
   262  	// Cf. https://github.com/osxfuse/osxfuse/issues/208
   263  	if opCode != fusekernel.OpForget {
   264  		var cancel func()
   265  		ctx, cancel = context.WithCancel(ctx)
   266  		c.recordCancelFunc(fuseID, cancel)
   267  	}
   268  
   269  	return ctx
   270  }
   271  
   272  // Clean up all state associated with an op to which the user has responded,
   273  // given its underlying fuse opcode and request ID. This must be called before
   274  // a response is sent to the kernel, to avoid a race where the request's ID
   275  // might be reused by osxfuse.
   276  //
   277  // LOCKS_EXCLUDED(c.mu)
   278  func (c *Connection) finishOp(
   279  	opCode uint32,
   280  	fuseID uint64) {
   281  	c.mu.Lock()
   282  	defer c.mu.Unlock()
   283  
   284  	// Even though the op is finished, context.WithCancel requires us to arrange
   285  	// for the cancellation function to be invoked. We also must remove it from
   286  	// our map.
   287  	//
   288  	// Special case: we don't do this for Forget requests. See the note in
   289  	// beginOp above.
   290  	if opCode != fusekernel.OpForget {
   291  		cancel, ok := c.cancelFuncs[fuseID]
   292  		if !ok {
   293  			panic(fmt.Sprintf("Unknown request ID in finishOp: %v", fuseID))
   294  		}
   295  
   296  		cancel()
   297  		delete(c.cancelFuncs, fuseID)
   298  	}
   299  }
   300  
   301  // LOCKS_EXCLUDED(c.mu)
   302  func (c *Connection) handleInterrupt(fuseID uint64) {
   303  	c.mu.Lock()
   304  	defer c.mu.Unlock()
   305  
   306  	// NOTE(jacobsa): fuse.txt in the Linux kernel documentation
   307  	// (https://goo.gl/H55Dnr) defines the kernel <-> userspace protocol for
   308  	// interrupts.
   309  	//
   310  	// In particular, my reading of it is that an interrupt request cannot be
   311  	// delivered to userspace before the original request. The part about the
   312  	// race and EAGAIN appears to be aimed at userspace programs that
   313  	// concurrently process requests (cf. http://goo.gl/BES2rs).
   314  	//
   315  	// So in this method if we can't find the ID to be interrupted, it means that
   316  	// the request has already been replied to.
   317  	//
   318  	// Cf. https://github.com/osxfuse/osxfuse/issues/208
   319  	// Cf. http://comments.gmane.org/gmane.comp.file-systems.fuse.devel/14675
   320  	cancel, ok := c.cancelFuncs[fuseID]
   321  	if !ok {
   322  		return
   323  	}
   324  
   325  	cancel()
   326  }
   327  
   328  // Read the next message from the kernel. The message must later be destroyed
   329  // using destroyInMessage.
   330  func (c *Connection) readMessage() (*buffer.InMessage, error) {
   331  	// Allocate a message.
   332  	m := c.getInMessage()
   333  
   334  	// Loop past transient errors.
   335  	for {
   336  		// Attempt a reaed.
   337  		err := m.Init(c.dev)
   338  
   339  		// Special cases:
   340  		//
   341  		//  *  ENODEV means fuse has hung up.
   342  		//
   343  		//  *  EINTR means we should try again. (This seems to happen often on
   344  		//     OS X, cf. http://golang.org/issue/11180)
   345  		//
   346  		if pe, ok := err.(*os.PathError); ok {
   347  			switch pe.Err {
   348  			case syscall.ENODEV:
   349  				err = io.EOF
   350  
   351  			case syscall.EINTR:
   352  				err = nil
   353  				continue
   354  			}
   355  		}
   356  
   357  		if err != nil {
   358  			c.putInMessage(m)
   359  			return nil, err
   360  		}
   361  
   362  		return m, nil
   363  	}
   364  }
   365  
   366  // Write the supplied message to the kernel.
   367  func (c *Connection) writeMessage(msg []byte) error {
   368  	// Avoid the retry loop in os.File.Write.
   369  	n, err := syscall.Write(int(c.dev.Fd()), msg)
   370  	if err != nil {
   371  		return err
   372  	}
   373  
   374  	if n != len(msg) {
   375  		return fmt.Errorf("Wrote %d bytes; expected %d", n, len(msg))
   376  	}
   377  
   378  	return nil
   379  }
   380  
   381  // ReadOp consumes the next op from the kernel process, returning the op and a
   382  // context that should be used for work related to the op. It returns io.EOF if
   383  // the kernel has closed the connection.
   384  //
   385  // If err != nil, the user is responsible for later calling c.Reply with the
   386  // returned context.
   387  //
   388  // This function delivers ops in exactly the order they are received from
   389  // /dev/fuse. It must not be called multiple times concurrently.
   390  //
   391  // LOCKS_EXCLUDED(c.mu)
   392  func (c *Connection) ReadOp() (_ context.Context, op interface{}, _ error) {
   393  	// Keep going until we find a request we know how to convert.
   394  	for {
   395  		// Read the next message from the kernel.
   396  		inMsg, err := c.readMessage()
   397  		if err != nil {
   398  			return nil, nil, err
   399  		}
   400  
   401  		// Convert the message to an op.
   402  		outMsg := c.getOutMessage()
   403  		op, err = convertInMessage(inMsg, outMsg, c.protocol)
   404  		if err != nil {
   405  			c.putOutMessage(outMsg)
   406  			return nil, nil, fmt.Errorf("convertInMessage: %v", err)
   407  		}
   408  
   409  		// Choose an ID for this operation for the purposes of logging, and log it.
   410  		if c.debugLogger != nil {
   411  			c.debugLog(inMsg.Header().Unique, 1, "<- %s", describeRequest(op))
   412  		}
   413  
   414  		// Special case: handle interrupt requests inline.
   415  		if interruptOp, ok := op.(*interruptOp); ok {
   416  			c.handleInterrupt(interruptOp.FuseID)
   417  			continue
   418  		}
   419  
   420  		// Set up a context that remembers information about this op.
   421  		ctx := c.beginOp(inMsg.Header().Opcode, inMsg.Header().Unique)
   422  		ctx = context.WithValue(ctx, contextKey, opState{inMsg, outMsg, op})
   423  
   424  		// Return the op to the user.
   425  		return ctx, op, nil
   426  	}
   427  }
   428  
   429  // Skip errors that happen as a matter of course, since they spook users.
   430  func (c *Connection) shouldLogError(
   431  	op interface{},
   432  	err error) bool {
   433  	// We don't log non-errors.
   434  	if err == nil {
   435  		return false
   436  	}
   437  
   438  	// We can't log if there's nothing to log to.
   439  	if c.errorLogger == nil {
   440  		return false
   441  	}
   442  
   443  	switch op.(type) {
   444  	case *fuseops.LookUpInodeOp:
   445  		// It is totally normal for the kernel to ask to look up an inode by name
   446  		// and find the name doesn't exist. For example, this happens when linking
   447  		// a new file.
   448  		if err == syscall.ENOENT {
   449  			return false
   450  		}
   451  
   452  	case *fuseops.GetXattrOp:
   453  		if err == syscall.ENODATA || err == syscall.ERANGE {
   454  			return false
   455  		}
   456  	case *unknownOp:
   457  		// Don't bother the user with methods we intentionally don't support.
   458  		if err == syscall.ENOSYS {
   459  			return false
   460  		}
   461  	}
   462  
   463  	return true
   464  }
   465  
   466  // Reply replies to an op previously read using ReadOp, with the supplied error
   467  // (or nil if successful). The context must be the context returned by ReadOp.
   468  //
   469  // LOCKS_EXCLUDED(c.mu)
   470  func (c *Connection) Reply(ctx context.Context, opErr error) {
   471  	// Extract the state we stuffed in earlier.
   472  	var key interface{} = contextKey
   473  	foo := ctx.Value(key)
   474  	state, ok := foo.(opState)
   475  	if !ok {
   476  		panic(fmt.Sprintf("Reply called with invalid context: %#v", ctx))
   477  	}
   478  
   479  	op := state.op
   480  	inMsg := state.inMsg
   481  	outMsg := state.outMsg
   482  	fuseID := inMsg.Header().Unique
   483  
   484  	// Make sure we destroy the messages when we're done.
   485  	defer c.putInMessage(inMsg)
   486  	defer c.putOutMessage(outMsg)
   487  
   488  	// Clean up state for this op.
   489  	c.finishOp(inMsg.Header().Opcode, inMsg.Header().Unique)
   490  
   491  	// Debug logging
   492  	if c.debugLogger != nil {
   493  		if opErr == nil {
   494  			c.debugLog(fuseID, 1, "-> OK (%s)", describeResponse(op))
   495  		} else {
   496  			c.debugLog(fuseID, 1, "-> Error: %q", opErr.Error())
   497  		}
   498  	}
   499  
   500  	// Error logging
   501  	if c.shouldLogError(op, opErr) {
   502  		c.errorLogger.Printf("%T error: %v", op, opErr)
   503  	}
   504  
   505  	// Send the reply to the kernel, if one is required.
   506  	noResponse := c.kernelResponse(outMsg, inMsg.Header().Unique, op, opErr)
   507  
   508  	if !noResponse {
   509  		err := c.writeMessage(outMsg.Bytes())
   510  		if err != nil && c.errorLogger != nil {
   511  			c.errorLogger.Printf("writeMessage: %v %v", err, outMsg.Bytes())
   512  		}
   513  	}
   514  }
   515  
// Close the connection by closing the underlying device file, returning
// whatever error that close reports. Must not be called until operations that
// were read from the connection have been responded to.
func (c *Connection) close() error {
	// Posix doesn't say that close can be called concurrently with read or
	// write, but luckily we exclude the possibility of a race by requiring the
	// user to respond to all ops first.
	return c.dev.Close()
}