github.com/StarfishStorage/goofys@v0.23.2-0.20200415030923-535558486b34/internal/goofys.go (about)

     1  // Copyright 2015 - 2017 Ka-Hing Cheung
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  // Copyright 2019-2020
// modified by StarfishStorage for persistent hash-based inode
    16  
    17  package internal
    18  
    19  import (
    20  	. "github.com/kahing/goofys/api/common"
    21  
    22  	"context"
    23  	"fmt"
    24  	"math/rand"
    25  	"net/url"
    26  	"runtime/debug"
    27  	"strings"
    28  	"sync"
    29  	"sync/atomic"
    30  	"syscall"
    31  	"time"
    32  
    33  	"github.com/aws/aws-sdk-go/aws/awserr"
    34  
    35  	"github.com/jacobsa/fuse"
    36  	"github.com/jacobsa/fuse/fuseops"
    37  	"github.com/jacobsa/fuse/fuseutil"
    38  
    39  	"github.com/sirupsen/logrus"
    40  )
    41  
// goofys is a Filey System written in Go. All the backend data is
// stored on S3 as is. It's a Filey System instead of a File System
// because it makes minimal effort at being POSIX
// compliant. In particular, things that are difficult to support on S3
// or would translate into more than one round-trip either fail
// (rename non-empty dir) or are faked (no per-file permission). goofys
// does not have an on-disk data cache, and its consistency model is
// close-to-open.
    50  
// Goofys is the file system object whose methods implement the FUSE
// operations in this file. It embeds fuseutil.NotImplementedFileSystem.
type Goofys struct {
	fuseutil.NotImplementedFileSystem
	// bucket is the bucket name. newGoofys strips any ":prefix" suffix from
	// the value it was given before storing it here.
	bucket string

	// flags is the option set handed to NewGoofys / newGoofys.
	flags *FlagStorage

	// umask is initialised to 0122 by newGoofys.
	umask uint32

	// gcs records whether the backend's delegate is a *GCS3. rootAttrs is
	// initialised by newGoofys with Size 4096 and the construction time as
	// Mtime.
	gcs       bool
	rootAttrs InodeAttributes

	// bufferPool is created by newGoofys through BufferPool{}.Init().
	bufferPool *BufferPool

	// A lock protecting the state of the file system struct itself (distinct
	// from per-inode locks).
	// NOTE(review): the original comment referred to "notes on lock ordering
	// above", but no such notes are visible in this file. Most code paths
	// visible here acquire a parent inode's mu before fs.mu.
	mu sync.RWMutex

	// The next inode ID to hand out by allocateInodeId. We assume that this
	// will never overflow, since even if we were handing out inode IDs at
	// 4 GHz, it would still take over a century to do so.
	// NOTE(review): insertInode now derives IDs with makeInodeID instead of
	// calling allocateInodeId, so this counter appears to be unused for new
	// inodes; confirm before relying on it.
	//
	// GUARDED_BY(mu)
	nextInodeID fuseops.InodeID

	// The collection of live inodes, keyed by inode ID. No ID less than
	// fuseops.RootInodeID is ever used.
	//
	// INVARIANT: For all keys k, fuseops.RootInodeID <= k < nextInodeID
	// INVARIANT: For all keys k, inodes[k].ID() == k
	// INVARIANT: inodes[fuseops.RootInodeID] is missing or of type inode.DirInode
	// INVARIANT: For all v, if IsDirName(v.Name()) then v is inode.DirInode
	// NOTE(review): the first invariant was written for sequentially
	// allocated IDs; it presumably no longer holds for the hash-based IDs
	// assigned in insertInode. Verify against makeInodeID.
	//
	// GUARDED_BY(mu)
	inodes map[fuseops.InodeID]*Inode

	// nextHandleID is the next handle ID to hand out; it is shared by
	// directory and file handles and starts at 1. dirHandles maps the IDs
	// issued by OpenDir to their directory handles.
	nextHandleID fuseops.HandleID
	dirHandles   map[fuseops.HandleID]*DirHandle

	// fileHandles maps the IDs issued by OpenFile and CreateFile to their
	// file handles.
	fileHandles map[fuseops.HandleID]*FileHandle

	// replicators and restorers are Tickets initialised by newGoofys with
	// totals of 16 and 20 respectively; their users are outside this part
	// of the file.
	replicators *Ticket
	restorers   *Ticket

	// forgotCnt counts inodes dropped by ForgetInode; SigUsr1 logs it.
	forgotCnt uint32
}
    96  
// Package-level loggers, each obtained from GetLogger under its own name:
// s3Log ("s3"), log ("main") and fuseLog ("fuse").
var s3Log = GetLogger("s3")
var log = GetLogger("main")
var fuseLog = GetLogger("fuse")
   100  
   101  func NewBackend(bucket string, flags *FlagStorage) (cloud StorageBackend, err error) {
   102  	if flags.Backend == nil {
   103  		flags.Backend = (&S3Config{}).Init()
   104  	}
   105  
   106  	if config, ok := flags.Backend.(*AZBlobConfig); ok {
   107  		cloud, err = NewAZBlob(bucket, config)
   108  	} else if config, ok := flags.Backend.(*ADLv1Config); ok {
   109  		cloud, err = NewADLv1(bucket, flags, config)
   110  	} else if config, ok := flags.Backend.(*ADLv2Config); ok {
   111  		cloud, err = NewADLv2(bucket, flags, config)
   112  	} else if config, ok := flags.Backend.(*S3Config); ok {
   113  		if strings.HasSuffix(flags.Endpoint, "/storage.googleapis.com") {
   114  			cloud, err = NewGCS3(bucket, flags, config)
   115  		} else {
   116  			cloud, err = NewS3(bucket, flags, config)
   117  		}
   118  	} else {
   119  		err = fmt.Errorf("Unknown backend config: %T", flags.Backend)
   120  	}
   121  
   122  	return
   123  }
   124  
   125  type BucketSpec struct {
   126  	Scheme string
   127  	Bucket string
   128  	Prefix string
   129  }
   130  
   131  func ParseBucketSpec(bucket string) (spec BucketSpec, err error) {
   132  	if strings.Index(bucket, "://") != -1 {
   133  		var u *url.URL
   134  		u, err = url.Parse(bucket)
   135  		if err != nil {
   136  			return
   137  		}
   138  
   139  		spec.Scheme = u.Scheme
   140  		spec.Bucket = u.Host
   141  		if u.User != nil {
   142  			// wasb url can be wasb://container@storage-end-point
   143  			// we want to return the entire thing as bucket
   144  			spec.Bucket = u.User.String() + "@" + u.Host
   145  		}
   146  		spec.Prefix = u.Path
   147  	} else {
   148  		spec.Scheme = "s3"
   149  
   150  		colon := strings.Index(bucket, ":")
   151  		if colon != -1 {
   152  			spec.Prefix = bucket[colon+1:]
   153  			spec.Bucket = bucket[0:colon]
   154  		} else {
   155  			spec.Bucket = bucket
   156  		}
   157  	}
   158  
   159  	spec.Prefix = strings.Trim(spec.Prefix, "/")
   160  	if spec.Prefix != "" {
   161  		spec.Prefix += "/"
   162  	}
   163  	return
   164  }
   165  
   166  func NewGoofys(ctx context.Context, bucket string, flags *FlagStorage) *Goofys {
   167  	return newGoofys(ctx, bucket, flags, NewBackend)
   168  }
   169  
// newGoofys builds a Goofys instance for bucket, creating the storage backend
// through the supplied newBackend factory.
//
// bucket may be written as "name:prefix"; everything after the first ':'
// becomes the mount prefix of the root directory. The function returns nil,
// after logging the error, when the backend cannot be created or when its
// Init call fails. ctx is accepted but not used in this function.
func newGoofys(ctx context.Context, bucket string, flags *FlagStorage,
	newBackend func(string, *FlagStorage) (StorageBackend, error)) *Goofys {
	// Set up the basic struct.
	fs := &Goofys{
		bucket: bucket,
		flags:  flags,
		umask:  0122,
	}

	// Split an optional ":prefix" suffix off the bucket name and normalise
	// the prefix to either "" or a string that ends in a single "/".
	// NOTE(review): the split happens at the first ':' of the argument, so
	// callers presumably pass "bucket" or "bucket:prefix" rather than a
	// "scheme://..." URL; confirm against the call sites.
	var prefix string
	colon := strings.Index(bucket, ":")
	if colon != -1 {
		prefix = bucket[colon+1:]
		prefix = strings.Trim(prefix, "/")
		if prefix != "" {
			prefix += "/"
		}

		fs.bucket = bucket[0:colon]
		bucket = fs.bucket
	}

	if flags.DebugS3 {
		s3Log.Level = logrus.DebugLevel
	}

	cloud, err := newBackend(bucket, flags)
	if err != nil {
		log.Errorf("Unable to setup backend: %v", err)
		return nil
	}
	// Remember whether the backend's delegate is the GCS flavour.
	_, fs.gcs = cloud.Delegate().(*GCS3)

	// Init receives a random 32-character object name below the prefix.
	// NOTE(review): presumably the backend uses it to probe access to the
	// bucket; confirm against the backend's Init implementation.
	randomObjectName := prefix + (RandStringBytesMaskImprSrc(32))
	err = cloud.Init(randomObjectName)
	if err != nil {
		log.Errorf("Unable to access '%v': %v", bucket, err)
		return nil
	}
	// Started in the background; nothing here waits for it or looks at its
	// result.
	go cloud.MultipartExpire(&MultipartExpireInput{})

	now := time.Now()
	fs.rootAttrs = InodeAttributes{
		Size:  4096,
		Mtime: now,
	}

	fs.bufferPool = BufferPool{}.Init()

	// Create the root directory inode by hand: it gets the fixed root ID
	// instead of going through insertInode, and it carries the backend and
	// mount prefix for the whole tree.
	fs.nextInodeID = fuseops.RootInodeID + 1
	fs.inodes = make(map[fuseops.InodeID]*Inode)
	root := NewInode(fs, nil, PString(""))
	root.Id = fuseops.RootInodeID
	root.ToDir()
	root.dir.cloud = cloud
	root.dir.mountPrefix = prefix
	root.Attributes.Mtime = fs.rootAttrs.Mtime

	fs.inodes[fuseops.RootInodeID] = root
	fs.addDotAndDotDot(root)

	// Handle IDs start at 1 and are shared by directory and file handles.
	fs.nextHandleID = 1
	fs.dirHandles = make(map[fuseops.HandleID]*DirHandle)

	fs.fileHandles = make(map[fuseops.HandleID]*FileHandle)

	fs.replicators = Ticket{Total: 16}.Init()
	fs.restorers = Ticket{Total: 20}.Init()

	return fs
}
   241  
   242  // from https://stackoverflow.com/questions/22892120/how-to-generate-a-random-string-of-a-fixed-length-in-golang
   243  func RandStringBytesMaskImprSrc(n int) string {
   244  	const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
   245  	const (
   246  		letterIdxBits = 6                    // 6 bits to represent a letter index
   247  		letterIdxMask = 1<<letterIdxBits - 1 // All 1-bits, as many as letterIdxBits
   248  		letterIdxMax  = 63 / letterIdxBits   // # of letter indices fitting in 63 bits
   249  	)
   250  	src := rand.NewSource(time.Now().UnixNano())
   251  	b := make([]byte, n)
   252  	// A src.Int63() generates 63 random bits, enough for letterIdxMax characters!
   253  	for i, cache, remain := n-1, src.Int63(), letterIdxMax; i >= 0; {
   254  		if remain == 0 {
   255  			cache, remain = src.Int63(), letterIdxMax
   256  		}
   257  		if idx := int(cache & letterIdxMask); idx < len(letterBytes) {
   258  			b[i] = letterBytes[idx]
   259  			i--
   260  		}
   261  		cache >>= letterIdxBits
   262  		remain--
   263  	}
   264  
   265  	return string(b)
   266  }
   267  
   268  func (fs *Goofys) SigUsr1() {
   269  	fs.mu.RLock()
   270  
   271  	log.Infof("forgot %v inodes", fs.forgotCnt)
   272  	log.Infof("%v inodes", len(fs.inodes))
   273  	fs.mu.RUnlock()
   274  	debug.FreeOSMemory()
   275  }
   276  
   277  // Find the given inode. Panic if it doesn't exist.
   278  //
   279  // RLOCKS_REQUIRED(fs.mu)
   280  func (fs *Goofys) getInodeOrDie(id fuseops.InodeID) (inode *Inode) {
   281  	inode = fs.inodes[id]
   282  	if inode == nil {
   283  		panic(fmt.Sprintf("Unknown inode: %v", id))
   284  	}
   285  
   286  	return
   287  }
   288  
// Mount describes a storage backend to be attached at a path inside the
// goofys tree.
type Mount struct {
	// name is the mount point relative to goofys's root mount. cloud and
	// prefix are copied onto the mount point's directory inode by
	// Goofys.mount, which also sets mounted so that a repeated call with
	// the same Mount returns immediately.
	name    string
	cloud   StorageBackend
	prefix  string
	mounted bool
}
   296  
// mount attaches the backend described by b at the path b.name below mp.
//
// Intermediate directories of the path are created as needed. The final
// path component either becomes a new directory inode bound to b.cloud and
// b.prefix, or, if a child of that name already exists, that child is
// rebound to them. The function panics when the existing child is not a
// directory. It is a no-op when b.mounted is already set, and sets b.mounted
// on completion.
func (fs *Goofys) mount(mp *Inode, b *Mount) {
	if b.mounted {
		return
	}

	name := strings.Trim(b.name, "/")

	// create path for the mount. AttrTime is set to TIME_MAX so
	// they will never expire and be removed. But DirTime is not
	// so we will still consult the underlying cloud for listing
	// (which will then be merged with the cached result)

	// Walk every path component except the last one, descending into (or
	// creating) the matching child directory. Each step holds the current
	// directory's lock, and additionally fs.mu while inserting a new inode.
	for {
		idx := strings.Index(name, "/")
		if idx == -1 {
			break
		}
		dirName := name[0:idx]
		name = name[idx+1:]

		mp.mu.Lock()
		dirInode := mp.findChildUnlocked(dirName)
		if dirInode == nil {
			fs.mu.Lock()

			dirInode = NewInode(fs, mp, &dirName)
			dirInode.ToDir()
			dirInode.AttrTime = TIME_MAX

			fs.insertInode(mp, dirInode)
			fs.mu.Unlock()
		}
		mp.mu.Unlock()
		mp = dirInode
	}

	// mp is now the directory that will contain the mount point and name is
	// the last path component. mp stays locked until the function returns.
	mp.mu.Lock()
	defer mp.mu.Unlock()

	prev := mp.findChildUnlocked(name)
	if prev == nil {
		mountInode := NewInode(fs, mp, &name)
		mountInode.ToDir()
		mountInode.dir.cloud = b.cloud
		mountInode.dir.mountPrefix = b.prefix
		mountInode.AttrTime = TIME_MAX

		fs.mu.Lock()
		defer fs.mu.Unlock()

		fs.insertInode(mp, mountInode)
		prev = mountInode
	} else {
		if !prev.isDir() {
			panic(fmt.Sprintf("inode %v is not a directory", *prev.FullName()))
		}

		// This inode might have some cached data from a parent mount.
		// Clear this cache by resetting the DirTime.
		// Note: resetDirTimeRec should be called without holding the lock.
		// NOTE(review): mp.mu (the parent's lock) is held at this point;
		// presumably "the lock" above means prev.mu, which is only taken
		// after the call. Confirm against resetDirTimeRec.
		prev.resetDirTimeRec()
		prev.mu.Lock()
		defer prev.mu.Unlock()
		prev.dir.cloud = b.cloud
		prev.dir.mountPrefix = b.prefix
		prev.AttrTime = TIME_MAX

	}
	fuseLog.Infof("mounted /%v", *prev.FullName())
	b.mounted = true
}
   368  
   369  func (fs *Goofys) MountAll(mounts []*Mount) {
   370  	fs.mu.RLock()
   371  	root := fs.getInodeOrDie(fuseops.RootInodeID)
   372  	fs.mu.RUnlock()
   373  
   374  	for _, m := range mounts {
   375  		fs.mount(root, m)
   376  	}
   377  }
   378  
   379  func (fs *Goofys) Mount(mount *Mount) {
   380  	fs.mu.RLock()
   381  	root := fs.getInodeOrDie(fuseops.RootInodeID)
   382  	fs.mu.RUnlock()
   383  	fs.mount(root, mount)
   384  }
   385  
   386  func (fs *Goofys) Unmount(mountPoint string) {
   387  	fs.mu.RLock()
   388  	mp := fs.getInodeOrDie(fuseops.RootInodeID)
   389  	fs.mu.RUnlock()
   390  
   391  	fuseLog.Infof("Attempting to unmount %v", mountPoint)
   392  	path := strings.Split(strings.Trim(mountPoint, "/"), "/")
   393  	for _, localName := range path {
   394  		dirInode := mp.findChild(localName)
   395  		if dirInode == nil || !dirInode.isDir() {
   396  			fuseLog.Errorf("Failed to find directory:%v while unmounting %v. "+
   397  				"Ignoring the unmount operation.", localName, mountPoint)
   398  			return
   399  		}
   400  		mp = dirInode
   401  	}
   402  	mp.ResetForUnmount()
   403  	return
   404  }
   405  
   406  func (fs *Goofys) StatFS(
   407  	ctx context.Context,
   408  	op *fuseops.StatFSOp) (err error) {
   409  
   410  	const BLOCK_SIZE = 4096
   411  	const TOTAL_SPACE = 1 * 1024 * 1024 * 1024 * 1024 * 1024 // 1PB
   412  	const TOTAL_BLOCKS = TOTAL_SPACE / BLOCK_SIZE
   413  	const INODES = 1 * 1000 * 1000 * 1000 // 1 billion
   414  	op.BlockSize = BLOCK_SIZE
   415  	op.Blocks = TOTAL_BLOCKS
   416  	op.BlocksFree = TOTAL_BLOCKS
   417  	op.BlocksAvailable = TOTAL_BLOCKS
   418  	op.IoSize = 1 * 1024 * 1024 // 1MB
   419  	op.Inodes = INODES
   420  	op.InodesFree = INODES
   421  	return
   422  }
   423  
   424  func (fs *Goofys) GetInodeAttributes(
   425  	ctx context.Context,
   426  	op *fuseops.GetInodeAttributesOp) (err error) {
   427  
   428  	fs.mu.RLock()
   429  	inode := fs.getInodeOrDie(op.Inode)
   430  	fs.mu.RUnlock()
   431  
   432  	attr, err := inode.GetAttributes()
   433  	if err == nil {
   434  		op.Attributes = *attr
   435  		op.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
   436  	}
   437  
   438  	return
   439  }
   440  
   441  func (fs *Goofys) GetXattr(ctx context.Context,
   442  	op *fuseops.GetXattrOp) (err error) {
   443  	fs.mu.RLock()
   444  	inode := fs.getInodeOrDie(op.Inode)
   445  	fs.mu.RUnlock()
   446  
   447  	value, err := inode.GetXattr(op.Name)
   448  	if err != nil {
   449  		return
   450  	}
   451  
   452  	op.BytesRead = len(value)
   453  
   454  	if len(op.Dst) != 0 {
   455  		if len(op.Dst) < op.BytesRead {
   456  			return syscall.ERANGE
   457  		}
   458  
   459  		copy(op.Dst, value)
   460  	}
   461  	return
   462  }
   463  
   464  func (fs *Goofys) ListXattr(ctx context.Context,
   465  	op *fuseops.ListXattrOp) (err error) {
   466  	fs.mu.RLock()
   467  	inode := fs.getInodeOrDie(op.Inode)
   468  	fs.mu.RUnlock()
   469  
   470  	xattrs, err := inode.ListXattr()
   471  
   472  	ncopied := 0
   473  
   474  	for _, name := range xattrs {
   475  		buf := op.Dst[ncopied:]
   476  		nlen := len(name) + 1
   477  
   478  		if nlen <= len(buf) {
   479  			copy(buf, name)
   480  			ncopied += nlen
   481  			buf[nlen-1] = '\x00'
   482  		}
   483  
   484  		op.BytesRead += nlen
   485  	}
   486  
   487  	if len(op.Dst) != 0 && ncopied < op.BytesRead {
   488  		err = syscall.ERANGE
   489  	}
   490  
   491  	return
   492  }
   493  
   494  func (fs *Goofys) RemoveXattr(ctx context.Context,
   495  	op *fuseops.RemoveXattrOp) (err error) {
   496  	fs.mu.RLock()
   497  	inode := fs.getInodeOrDie(op.Inode)
   498  	fs.mu.RUnlock()
   499  
   500  	err = inode.RemoveXattr(op.Name)
   501  
   502  	return
   503  }
   504  
   505  func (fs *Goofys) SetXattr(ctx context.Context,
   506  	op *fuseops.SetXattrOp) (err error) {
   507  	fs.mu.RLock()
   508  	inode := fs.getInodeOrDie(op.Inode)
   509  	fs.mu.RUnlock()
   510  
   511  	err = inode.SetXattr(op.Name, op.Value, op.Flags)
   512  	return
   513  }
   514  
   515  func mapHttpError(status int) error {
   516  	switch status {
   517  	case 400:
   518  		return fuse.EINVAL
   519  	case 401:
   520  		return syscall.EACCES
   521  	case 403:
   522  		return syscall.EACCES
   523  	case 404:
   524  		return fuse.ENOENT
   525  	case 405:
   526  		return syscall.ENOTSUP
   527  	case 429:
   528  		return syscall.EAGAIN
   529  	case 500:
   530  		return syscall.EAGAIN
   531  	default:
   532  		return nil
   533  	}
   534  }
   535  
   536  func mapAwsError(err error) error {
   537  	if err == nil {
   538  		return nil
   539  	}
   540  
   541  	if awsErr, ok := err.(awserr.Error); ok {
   542  		switch awsErr.Code() {
   543  		case "BucketRegionError":
   544  			// don't need to log anything, we should detect region after
   545  			return err
   546  		case "NoSuchBucket":
   547  			return syscall.ENXIO
   548  		case "BucketAlreadyOwnedByYou":
   549  			return fuse.EEXIST
   550  		}
   551  
   552  		if reqErr, ok := err.(awserr.RequestFailure); ok {
   553  			// A service error occurred
   554  			err = mapHttpError(reqErr.StatusCode())
   555  			if err != nil {
   556  				return err
   557  			} else {
   558  				s3Log.Errorf("http=%v %v s3=%v request=%v\n",
   559  					reqErr.StatusCode(), reqErr.Message(),
   560  					awsErr.Code(), reqErr.RequestID())
   561  				return reqErr
   562  			}
   563  		} else {
   564  			// Generic AWS Error with Code, Message, and original error (if any)
   565  			s3Log.Errorf("code=%v msg=%v, err=%v\n", awsErr.Code(), awsErr.Message(), awsErr.OrigErr())
   566  			return awsErr
   567  		}
   568  	} else {
   569  		return err
   570  	}
   571  }
   572  
   573  // note that this is NOT the same as url.PathEscape in golang 1.8,
   574  // as this preserves / and url.PathEscape converts / to %2F
   575  func pathEscape(path string) string {
   576  	u := url.URL{Path: path}
   577  	return u.EscapedPath()
   578  }
   579  
   580  func (fs *Goofys) allocateInodeId() (id fuseops.InodeID) {
   581  	id = fs.nextInodeID
   582  	fs.nextInodeID++
   583  	return
   584  }
   585  
   586  func expired(cache time.Time, ttl time.Duration) bool {
   587  	now := time.Now()
   588  	if cache.After(now) {
   589  		return false
   590  	}
   591  	return !cache.Add(ttl).After(now)
   592  }
   593  
// LookUpInode resolves op.Name inside the directory op.Parent and fills
// op.Entry with the child's inode ID, its attributes and the cache
// expirations derived from StatCacheTTL and TypeCacheTTL.
//
// A child already attached to the parent is answered from cache, after
// taking a reference on it, unless its AttrTime has expired and it has no
// open file handles. In every other case parent.LookUp is consulted:
//   - fuse.ENOENT for a cached directory is ignored and the cached directory
//     is returned;
//   - any other error is returned, after the reference taken on a cached
//     inode has been dropped again;
//   - on success a previously unknown child is inserted, and a cached child
//     has its attributes and ETag refreshed.
func (fs *Goofys) LookUpInode(
	ctx context.Context,
	op *fuseops.LookUpInodeOp) (err error) {

	var inode *Inode
	var ok bool
	defer func() { fuseLog.Debugf("<-- LookUpInode %v %v %v", op.Parent, op.Name, err) }()

	fs.mu.RLock()
	parent := fs.getInodeOrDie(op.Parent)
	fs.mu.RUnlock()

	// Phase 1: look at the parent's cached children. ok ends up true only
	// if the cached child can be returned without asking the backend.
	parent.mu.Lock()
	inode = parent.findChildUnlocked(op.Name)
	if inode != nil {
		ok = true
		inode.Ref()

		if expired(inode.AttrTime, fs.flags.StatCacheTTL) {
			ok = false
			if atomic.LoadInt32(&inode.fileHandles) != 0 {
				// we have an open file handle, object
				// in S3 may not represent the true
				// state of the file anyway, so just
				// return what we know which is
				// potentially more accurate
				ok = true
			} else {
				inode.logFuse("lookup expired")
			}
		}
	} else {
		ok = false
	}
	parent.mu.Unlock()

	// Phase 2: the cache could not answer, so ask the parent to look the
	// name up and reconcile the result with whatever was cached.
	if !ok {
		var newInode *Inode

		newInode, err = parent.LookUp(op.Name)
		if err == fuse.ENOENT && inode != nil && inode.isDir() {
			// we may not be able to look up an implicit
			// dir if all the children are removed, so we
			// just pretend this dir is still around
			err = nil
		} else if err != nil {
			if inode != nil {
				// just kidding! pretend we didn't up the ref
				// fs.mu stays held until the function returns; the
				// stale inode is removed from both the global table and
				// the parent.
				fs.mu.Lock()
				defer fs.mu.Unlock()

				stale := inode.DeRef(1)
				if stale {
					delete(fs.inodes, inode.Id)
					parent.removeChild(inode)
				}
			}
			return err
		}

		if inode == nil {
			parent.mu.Lock()
			// check again if it's there, could have been
			// added by another lookup or readdir
			// NOTE(review): unlike the cached path above, an inode found
			// by this re-check is returned without a Ref() call, and
			// newInode is inserted without one as well; presumably a new
			// inode starts out with a reference. Confirm the reference
			// counting for the concurrent-insert case.
			inode = parent.findChildUnlocked(op.Name)
			if inode == nil {
				fs.mu.Lock()
				inode = newInode
				fs.insertInode(parent, inode)
				fs.mu.Unlock()
			}
			parent.mu.Unlock()
		} else {
			inode.mu.Lock()

			// newInode is nil here when the lookup error was ignored
			// above; in that case only AttrTime is refreshed.
			if newInode != nil {
				// if only size changed, kernel seems to
				// automatically drop cache
				if inode.Attributes != newInode.Attributes {
					inode.invalidateCache = true
				} else if inode.knownETag != nil &&
					newInode.knownETag != nil &&
					*inode.knownETag != *newInode.knownETag {
					// if this is a new file (ie:
					// inode.knownETag is nil),
					// then prefer to read our own
					// write then reading updated
					// data
					inode.invalidateCache = true
				}

				if newInode.Attributes.Mtime.IsZero() {
					// this can happen if it's an
					// implicit dir, use the last
					// known value
					newInode.Attributes.Mtime = inode.Attributes.Mtime
				}
				inode.Attributes = newInode.Attributes
				inode.knownETag = newInode.knownETag
			}
			inode.AttrTime = time.Now()

			inode.mu.Unlock()
		}
	}

	op.Entry.Child = inode.Id
	op.Entry.Attributes = inode.InflateAttributes()
	op.Entry.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
	op.Entry.EntryExpiration = time.Now().Add(fs.flags.TypeCacheTTL)

	return
}
   707  
   708  // LOCKS_REQUIRED(fs.mu)
   709  // LOCKS_REQUIRED(parent.mu)
   710  func (fs *Goofys) insertInode(parent *Inode, inode *Inode) {
   711  	addInode := false
   712  	if *inode.Name == "." {
   713  		inode.Id = parent.Id
   714  	} else if *inode.Name == ".." {
   715  		inode.Id = fuseops.InodeID(fuseops.RootInodeID)
   716  		if parent.Parent != nil {
   717  			inode.Id = parent.Parent.Id
   718  		}
   719  	} else {
   720  		if inode.Id != 0 {
   721  			panic(fmt.Sprintf("inode id is set: %v %v", *inode.Name, inode.Id))
   722  		}
   723  		// inode.Id = fs.allocateInodeId()
   724          inode.Id = fuseops.InodeID(makeInodeID(*inode.FullName()))
   725          fuseLog.Debugf("new inode %d", inode.Id)
   726  
   727  
   728  		addInode = true
   729  	}
   730  	parent.insertChildUnlocked(inode)
   731  	if addInode {
   732  		fs.inodes[inode.Id] = inode
   733  
   734  		// if we are inserting a new directory, also create
   735  		// the child . and ..
   736  		if inode.isDir() {
   737  			fs.addDotAndDotDot(inode)
   738  		}
   739  	}
   740  }
   741  
   742  func (fs *Goofys) addDotAndDotDot(dir *Inode) {
   743  	dot := NewInode(fs, dir, PString("."))
   744  	dot.ToDir()
   745  	dot.AttrTime = TIME_MAX
   746  	fs.insertInode(dir, dot)
   747  
   748  	dot = NewInode(fs, dir, PString(".."))
   749  	dot.ToDir()
   750  	dot.AttrTime = TIME_MAX
   751  	fs.insertInode(dir, dot)
   752  }
   753  
// ForgetInode drops op.N references from inode op.Inode. When DeRef reports
// the inode as stale, it is removed from fs.inodes and from its parent's
// children, and fs.forgotCnt is incremented.
func (fs *Goofys) ForgetInode(
	ctx context.Context,
	op *fuseops.ForgetInodeOp) (err error) {

	fs.mu.RLock()
	inode := fs.getInodeOrDie(op.Inode)
	fs.mu.RUnlock()

	// The parent's lock, if there is a parent, is held for the rest of the
	// function so that removeChildUnlocked below runs under it.
	if inode.Parent != nil {
		inode.Parent.mu.Lock()
		defer inode.Parent.mu.Unlock()
	}
	stale := inode.DeRef(op.N)

	if stale {
		// fs.mu is taken after the parent's lock and, being deferred
		// later, released before it.
		fs.mu.Lock()
		defer fs.mu.Unlock()

		delete(fs.inodes, op.Inode)
		fs.forgotCnt += 1

		if inode.Parent != nil {
			inode.Parent.removeChildUnlocked(inode)
		}
	}

	return
}
   782  
   783  func (fs *Goofys) OpenDir(
   784  	ctx context.Context,
   785  	op *fuseops.OpenDirOp) (err error) {
   786  	fs.mu.Lock()
   787  
   788  	handleID := fs.nextHandleID
   789  	fs.nextHandleID++
   790  
   791  	in := fs.getInodeOrDie(op.Inode)
   792  	fs.mu.Unlock()
   793  
   794  	// XXX/is this a dir?
   795  	dh := in.OpenDir()
   796  
   797  	fs.mu.Lock()
   798  	defer fs.mu.Unlock()
   799  
   800  	fs.dirHandles[handleID] = dh
   801  	op.Handle = handleID
   802  
   803  	return
   804  }
   805  
   806  func makeDirEntry(en *DirHandleEntry) fuseutil.Dirent {
   807  	return fuseutil.Dirent{
   808  		Name:   en.Name,
   809  		Type:   en.Type,
   810  		Inode:  en.Inode,
   811  		Offset: en.Offset,
   812  	}
   813  }
   814  
   815  func (fs *Goofys) ReadDir(
   816  	ctx context.Context,
   817  	op *fuseops.ReadDirOp) (err error) {
   818  
   819  	// Find the handle.
   820  	fs.mu.RLock()
   821  	dh := fs.dirHandles[op.Handle]
   822  	fs.mu.RUnlock()
   823  
   824  	if dh == nil {
   825  		panic(fmt.Sprintf("can't find dh=%v", op.Handle))
   826  	}
   827  
   828  	inode := dh.inode
   829  	inode.logFuse("ReadDir", op.Offset)
   830  
   831  	dh.mu.Lock()
   832  	defer dh.mu.Unlock()
   833  
   834  	for i := op.Offset; ; i++ {
   835  		e, err := dh.ReadDir(i)
   836  		if err != nil {
   837  			return err
   838  		}
   839  		if e == nil {
   840  			break
   841  		}
   842  
   843  		if e.Inode == 0 {
   844  			panic(fmt.Sprintf("unset inode %v", e.Name))
   845  		}
   846  
   847  		n := fuseutil.WriteDirent(op.Dst[op.BytesRead:], makeDirEntry(e))
   848  		if n == 0 {
   849  			break
   850  		}
   851  
   852  		dh.inode.logFuse("<-- ReadDir", e.Name, e.Offset)
   853  
   854  		op.BytesRead += n
   855  	}
   856  
   857  	return
   858  }
   859  
   860  func (fs *Goofys) ReleaseDirHandle(
   861  	ctx context.Context,
   862  	op *fuseops.ReleaseDirHandleOp) (err error) {
   863  
   864  	fs.mu.Lock()
   865  	defer fs.mu.Unlock()
   866  
   867  	dh := fs.dirHandles[op.Handle]
   868  	dh.CloseDir()
   869  
   870  	fuseLog.Debugln("ReleaseDirHandle", *dh.inode.FullName())
   871  
   872  	delete(fs.dirHandles, op.Handle)
   873  
   874  	return
   875  }
   876  
   877  func (fs *Goofys) OpenFile(
   878  	ctx context.Context,
   879  	op *fuseops.OpenFileOp) (err error) {
   880  	fs.mu.RLock()
   881  	in := fs.getInodeOrDie(op.Inode)
   882  	fs.mu.RUnlock()
   883  
   884  	fh, err := in.OpenFile(op.Metadata)
   885  	if err != nil {
   886  		return
   887  	}
   888  
   889  	fs.mu.Lock()
   890  
   891  	handleID := fs.nextHandleID
   892  	fs.nextHandleID++
   893  
   894  	fs.fileHandles[handleID] = fh
   895  	fs.mu.Unlock()
   896  
   897  	op.Handle = handleID
   898  
   899  	in.mu.Lock()
   900  	defer in.mu.Unlock()
   901  
   902  	// this flag appears to tell the kernel if this open should
   903  	// use the page cache or not. If it's false and this is a
   904  	// write, then a separate open (that had op.KeepPageCache =
   905  	// true) will not read from our write, which suggests that
   906  	// this also controls if subsequent operations populates the
   907  	// cache
   908  	//
   909  	// but if this is a read, and KeepPageCache = false, and next
   910  	// open sets KeepPageCache = false, then it can read from cache.
   911  	//
   912  	// see tests TestReadNewFileWithExternalChangesFuse and
   913  	// TestReadMyOwnWriteWithExternalChangesFuse
   914  	op.KeepPageCache = !in.invalidateCache
   915  	in.invalidateCache = false
   916  
   917  	return
   918  }
   919  
   920  func (fs *Goofys) ReadFile(
   921  	ctx context.Context,
   922  	op *fuseops.ReadFileOp) (err error) {
   923  
   924  	fs.mu.RLock()
   925  	fh := fs.fileHandles[op.Handle]
   926  	fs.mu.RUnlock()
   927  
   928  	op.BytesRead, err = fh.ReadFile(op.Offset, op.Dst)
   929  
   930  	return
   931  }
   932  
   933  func (fs *Goofys) SyncFile(
   934  	ctx context.Context,
   935  	op *fuseops.SyncFileOp) (err error) {
   936  
   937  	// intentionally ignored, so that write()/sync()/write() works
   938  	// see https://github.com/kahing/goofys/issues/154
   939  	return
   940  }
   941  
   942  func (fs *Goofys) FlushFile(
   943  	ctx context.Context,
   944  	op *fuseops.FlushFileOp) (err error) {
   945  
   946  	fs.mu.RLock()
   947  	fh := fs.fileHandles[op.Handle]
   948  	fs.mu.RUnlock()
   949  
   950  	// If the file handle has a tgid, then flush the file only if the
   951  	// incoming request's tgid matches the tgid in the file handle.
   952  	// This check helps us with scenarios like https://github.com/kahing/goofys/issues/273
   953  	// Also see goofys_test.go:TestClientForkExec.
   954  	if fh.Tgid != nil {
   955  		tgid, err := GetTgid(op.Metadata.Pid)
   956  		if err != nil {
   957  			fh.inode.logFuse("<-- FlushFile",
   958  				fmt.Sprintf("Failed to retrieve tgid from op.Metadata.Pid. FlushFileOp:%#v, err:%v",
   959  					op, err))
   960  			return fuse.EIO
   961  		}
   962  		if *fh.Tgid != *tgid {
   963  			fh.inode.logFuse("<-- FlushFile",
   964  				"Operation ignored",
   965  				fmt.Sprintf("fh.Pid:%v != tgid:%v, op:%#v", *fh.Tgid, *tgid, op))
   966  			return nil
   967  		}
   968  	}
   969  
   970  	err = fh.FlushFile()
   971  	if err != nil {
   972  		// if we returned success from creat() earlier
   973  		// linux may think this file exists even when it doesn't,
   974  		// until TypeCacheTTL is over
   975  		// TODO: figure out a way to make the kernel forget this inode
   976  		// see TestWriteAnonymousFuse
   977  		fs.mu.RLock()
   978  		inode := fs.getInodeOrDie(op.Inode)
   979  		fs.mu.RUnlock()
   980  
   981  		if inode.KnownSize == nil {
   982  			inode.AttrTime = time.Time{}
   983  		}
   984  
   985  	}
   986  	fh.inode.logFuse("<-- FlushFile", err, op.Handle, op.Inode)
   987  	return
   988  }
   989  
   990  func (fs *Goofys) ReleaseFileHandle(
   991  	ctx context.Context,
   992  	op *fuseops.ReleaseFileHandleOp) (err error) {
   993  	fs.mu.Lock()
   994  	defer fs.mu.Unlock()
   995  	fh := fs.fileHandles[op.Handle]
   996  	fh.Release()
   997  
   998  	fuseLog.Debugln("ReleaseFileHandle", *fh.inode.FullName(), op.Handle, fh.inode.Id)
   999  
  1000  	delete(fs.fileHandles, op.Handle)
  1001  
  1002  	// try to compact heap
  1003  	//fs.bufferPool.MaybeGC()
  1004  	return
  1005  }
  1006  
  1007  func (fs *Goofys) CreateFile(
  1008  	ctx context.Context,
  1009  	op *fuseops.CreateFileOp) (err error) {
  1010  
  1011  	fs.mu.RLock()
  1012  	parent := fs.getInodeOrDie(op.Parent)
  1013  	fs.mu.RUnlock()
  1014  
  1015  	inode, fh := parent.Create(op.Name, op.Metadata)
  1016  
  1017  	parent.mu.Lock()
  1018  
  1019  	fs.mu.Lock()
  1020  	defer fs.mu.Unlock()
  1021  	fs.insertInode(parent, inode)
  1022  
  1023  	parent.mu.Unlock()
  1024  
  1025  	op.Entry.Child = inode.Id
  1026  	op.Entry.Attributes = inode.InflateAttributes()
  1027  	op.Entry.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
  1028  	op.Entry.EntryExpiration = time.Now().Add(fs.flags.TypeCacheTTL)
  1029  
  1030  	// Allocate a handle.
  1031  	handleID := fs.nextHandleID
  1032  	fs.nextHandleID++
  1033  
  1034  	fs.fileHandles[handleID] = fh
  1035  
  1036  	op.Handle = handleID
  1037  
  1038  	inode.logFuse("<-- CreateFile")
  1039  
  1040  	return
  1041  }
  1042  
  1043  func (fs *Goofys) MkDir(
  1044  	ctx context.Context,
  1045  	op *fuseops.MkDirOp) (err error) {
  1046  
  1047  	fs.mu.RLock()
  1048  	parent := fs.getInodeOrDie(op.Parent)
  1049  	fs.mu.RUnlock()
  1050  
  1051  	// ignore op.Mode for now
  1052  	inode, err := parent.MkDir(op.Name)
  1053  	if err != nil {
  1054  		return err
  1055  	}
  1056  
  1057  	parent.mu.Lock()
  1058  
  1059  	fs.mu.Lock()
  1060  	defer fs.mu.Unlock()
  1061  	fs.insertInode(parent, inode)
  1062  
  1063  	parent.mu.Unlock()
  1064  
  1065  	op.Entry.Child = inode.Id
  1066  	op.Entry.Attributes = inode.InflateAttributes()
  1067  	op.Entry.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
  1068  	op.Entry.EntryExpiration = time.Now().Add(fs.flags.TypeCacheTTL)
  1069  
  1070  	return
  1071  }
  1072  
  1073  func (fs *Goofys) RmDir(
  1074  	ctx context.Context,
  1075  	op *fuseops.RmDirOp) (err error) {
  1076  
  1077  	fs.mu.RLock()
  1078  	parent := fs.getInodeOrDie(op.Parent)
  1079  	fs.mu.RUnlock()
  1080  
  1081  	err = parent.RmDir(op.Name)
  1082  	parent.logFuse("<-- RmDir", op.Name, err)
  1083  	return
  1084  }
  1085  
  1086  func (fs *Goofys) SetInodeAttributes(
  1087  	ctx context.Context,
  1088  	op *fuseops.SetInodeAttributesOp) (err error) {
  1089  
  1090  	fs.mu.RLock()
  1091  	inode := fs.getInodeOrDie(op.Inode)
  1092  	fs.mu.RUnlock()
  1093  
  1094  	attr, err := inode.GetAttributes()
  1095  	if err == nil {
  1096  		op.Attributes = *attr
  1097  		op.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
  1098  	}
  1099  	return
  1100  }
  1101  
  1102  func (fs *Goofys) WriteFile(
  1103  	ctx context.Context,
  1104  	op *fuseops.WriteFileOp) (err error) {
  1105  
  1106  	fs.mu.RLock()
  1107  
  1108  	fh, ok := fs.fileHandles[op.Handle]
  1109  	if !ok {
  1110  		panic(fmt.Sprintf("WriteFile: can't find handle %v", op.Handle))
  1111  	}
  1112  	fs.mu.RUnlock()
  1113  
  1114  	err = fh.WriteFile(op.Offset, op.Data)
  1115  
  1116  	return
  1117  }
  1118  
  1119  func (fs *Goofys) Unlink(
  1120  	ctx context.Context,
  1121  	op *fuseops.UnlinkOp) (err error) {
  1122  
  1123  	fs.mu.RLock()
  1124  	parent := fs.getInodeOrDie(op.Parent)
  1125  	fs.mu.RUnlock()
  1126  
  1127  	err = parent.Unlink(op.Name)
  1128  	return
  1129  }
  1130  
  1131  // rename("from", "to") causes the kernel to send lookup of "from" and
  1132  // "to" prior to sending rename to us
  1133  func (fs *Goofys) Rename(
  1134  	ctx context.Context,
  1135  	op *fuseops.RenameOp) (err error) {
  1136  
  1137  	fs.mu.RLock()
  1138  	parent := fs.getInodeOrDie(op.OldParent)
  1139  	newParent := fs.getInodeOrDie(op.NewParent)
  1140  	fs.mu.RUnlock()
  1141  
  1142  	// XXX don't hold the lock the entire time
  1143  	if op.OldParent == op.NewParent {
  1144  		parent.mu.Lock()
  1145  		defer parent.mu.Unlock()
  1146  	} else {
  1147  		// lock ordering to prevent deadlock
  1148  		if op.OldParent < op.NewParent {
  1149  			parent.mu.Lock()
  1150  			newParent.mu.Lock()
  1151  		} else {
  1152  			newParent.mu.Lock()
  1153  			parent.mu.Lock()
  1154  		}
  1155  		defer parent.mu.Unlock()
  1156  		defer newParent.mu.Unlock()
  1157  	}
  1158  
  1159  	err = parent.Rename(op.OldName, newParent, op.NewName)
  1160  	if err != nil {
  1161  		if err == fuse.ENOENT {
  1162  			// if the source doesn't exist, it could be
  1163  			// because this is a new file and we haven't
  1164  			// flushed it yet, pretend that's ok because
  1165  			// when we flush we will handle the rename
  1166  			inode := parent.findChildUnlocked(op.OldName)
  1167  			if inode != nil && atomic.LoadInt32(&inode.fileHandles) != 0 {
  1168  				err = nil
  1169  			}
  1170  		}
  1171  	}
  1172  	if err == nil {
  1173  		inode := parent.findChildUnlocked(op.OldName)
  1174  		if inode != nil {
  1175  			inode.mu.Lock()
  1176  			defer inode.mu.Unlock()
  1177  
  1178  			parent.removeChildUnlocked(inode)
  1179  
  1180  			newNode := newParent.findChildUnlocked(op.NewName)
  1181  			if newNode != nil {
  1182  				// this file's been overwritten, it's
  1183  				// been detached but we can't delete
  1184  				// it just yet, because the kernel
  1185  				// will still send forget ops to us
  1186  				newParent.removeChildUnlocked(newNode)
  1187  				newNode.Parent = nil
  1188  			}
  1189  
  1190  			inode.Name = &op.NewName
  1191  			inode.Parent = newParent
  1192  			newParent.insertChildUnlocked(inode)
  1193  		}
  1194  	}
  1195  	return
  1196  }