github.com/wph95/goofys@v0.24.1-0.20200907140828-7bc615e8492e/internal/goofys.go (about)

     1  // Copyright 2015 - 2017 Ka-Hing Cheung
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package internal
    16  
    17  import (
    18  	. "github.com/kahing/goofys/api/common"
    19  
    20  	"context"
    21  	"fmt"
    22  	"math/rand"
    23  	"net/url"
    24  	"runtime/debug"
    25  	"strings"
    26  	"sync"
    27  	"sync/atomic"
    28  	"syscall"
    29  	"time"
    30  
    31  	"github.com/aws/aws-sdk-go/aws/awserr"
    32  
    33  	"github.com/jacobsa/fuse"
    34  	"github.com/jacobsa/fuse/fuseops"
    35  	"github.com/jacobsa/fuse/fuseutil"
    36  
    37  	"github.com/sirupsen/logrus"
    38  	"net/http"
    39  )
    40  
// goofys is a Filey System written in Go. All the backend data is
// stored on S3 as is. It's a Filey System instead of a File System
// because it makes minimal effort at being POSIX
// compliant. In particular, things that are difficult to support on S3
// or would translate into more than one round-trip either fail
// (rename non-empty dir) or are faked (no per-file permission). goofys
// does not have an on-disk data cache, and its consistency model is
// close-to-open.
    49  
// Goofys is the file system object. It embeds
// fuseutil.NotImplementedFileSystem and holds the mount configuration, the
// table of live inodes and the tables of open directory and file handles.
type Goofys struct {
	fuseutil.NotImplementedFileSystem
	// Bucket name. newGoofys stores the part of the bucket argument that
	// precedes the first ':' (the rest is used as the mount prefix).
	bucket string

	// Flags the file system was created with.
	flags *FlagStorage

	// Set to 0122 by newGoofys.
	umask uint32

	// gcs is true when the backend's delegate is a *GCS3. rootAttrs holds
	// the attributes (size and mtime) newGoofys assigns to the root.
	gcs       bool
	rootAttrs InodeAttributes

	// Created by newGoofys via BufferPool{}.Init().
	bufferPool *BufferPool

	// A lock protecting the state of the file system struct itself (distinct
	// from per-inode locks). Make sure to see the notes on lock ordering above.
	// NOTE(review): the lock-ordering notes referred to here are not present
	// above this declaration -- confirm where they live.
	mu sync.RWMutex

	// The next inode ID to hand out. We assume that this will never overflow,
	// since even if we were handing out inode IDs at 4 GHz, it would still take
	// over a century to do so.
	//
	// GUARDED_BY(mu)
	nextInodeID fuseops.InodeID

	// The collection of live inodes, keyed by inode ID. No ID less than
	// fuseops.RootInodeID is ever used.
	//
	// INVARIANT: For all keys k, fuseops.RootInodeID <= k < nextInodeID
	// INVARIANT: For all keys k, inodes[k].ID() == k
	// INVARIANT: inodes[fuseops.RootInodeID] is missing or of type inode.DirInode
	// INVARIANT: For all v, if IsDirName(v.Name()) then v is inode.DirInode
	//
	// GUARDED_BY(mu)
	inodes map[fuseops.InodeID]*Inode

	// nextHandleID is the next handle ID to hand out; it is shared by
	// directory and file handles and starts at 1. dirHandles maps the IDs
	// of open directory handles to their state. Both are accessed while
	// holding mu in the methods of this type.
	nextHandleID fuseops.HandleID
	dirHandles   map[fuseops.HandleID]*DirHandle

	// Open file handles keyed by handle ID; accessed while holding mu.
	fileHandles map[fuseops.HandleID]*FileHandle

	// Tickets initialized by newGoofys with Total 16 and 20 respectively.
	// NOTE(review): presumably these bound concurrent replication and
	// restore work -- confirm against their users.
	replicators *Ticket
	restorers   *Ticket

	// Number of inodes dropped by ForgetInode; reported by SigUsr1.
	forgotCnt uint32
}
    95  
    96  var s3Log = GetLogger("s3")
    97  var log = GetLogger("main")
    98  var fuseLog = GetLogger("fuse")
    99  
   100  func NewBackend(bucket string, flags *FlagStorage) (cloud StorageBackend, err error) {
   101  	if flags.Backend == nil {
   102  		flags.Backend = (&S3Config{}).Init()
   103  	}
   104  
   105  	if config, ok := flags.Backend.(*AZBlobConfig); ok {
   106  		cloud, err = NewAZBlob(bucket, config)
   107  	} else if config, ok := flags.Backend.(*ADLv1Config); ok {
   108  		cloud, err = NewADLv1(bucket, flags, config)
   109  	} else if config, ok := flags.Backend.(*ADLv2Config); ok {
   110  		cloud, err = NewADLv2(bucket, flags, config)
   111  	} else if config, ok := flags.Backend.(*S3Config); ok {
   112  		if strings.HasSuffix(flags.Endpoint, "/storage.googleapis.com") {
   113  			cloud, err = NewGCS3(bucket, flags, config)
   114  		} else {
   115  			cloud, err = NewS3(bucket, flags, config)
   116  		}
   117  	} else {
   118  		err = fmt.Errorf("Unknown backend config: %T", flags.Backend)
   119  	}
   120  
   121  	return
   122  }
   123  
   124  type BucketSpec struct {
   125  	Scheme string
   126  	Bucket string
   127  	Prefix string
   128  }
   129  
   130  func ParseBucketSpec(bucket string) (spec BucketSpec, err error) {
   131  	if strings.Index(bucket, "://") != -1 {
   132  		var u *url.URL
   133  		u, err = url.Parse(bucket)
   134  		if err != nil {
   135  			return
   136  		}
   137  
   138  		spec.Scheme = u.Scheme
   139  		spec.Bucket = u.Host
   140  		if u.User != nil {
   141  			// wasb url can be wasb://container@storage-end-point
   142  			// we want to return the entire thing as bucket
   143  			spec.Bucket = u.User.String() + "@" + u.Host
   144  		}
   145  		spec.Prefix = u.Path
   146  	} else {
   147  		spec.Scheme = "s3"
   148  
   149  		colon := strings.Index(bucket, ":")
   150  		if colon != -1 {
   151  			spec.Prefix = bucket[colon+1:]
   152  			spec.Bucket = bucket[0:colon]
   153  		} else {
   154  			spec.Bucket = bucket
   155  		}
   156  	}
   157  
   158  	spec.Prefix = strings.Trim(spec.Prefix, "/")
   159  	if spec.Prefix != "" {
   160  		spec.Prefix += "/"
   161  	}
   162  	return
   163  }
   164  
   165  func NewGoofys(ctx context.Context, bucket string, flags *FlagStorage) *Goofys {
   166  	return newGoofys(ctx, bucket, flags, NewBackend)
   167  }
   168  
// newGoofys builds a Goofys for bucket, obtaining the storage backend from
// the supplied newBackend factory.
//
// bucket may be "name" or "name:prefix"; the prefix (normalized to have no
// leading slash and a single trailing one) becomes the root directory's
// mount prefix. The function returns nil, after logging the error, when
// the backend cannot be created or its Init call fails. ctx is not used in
// the body.
func newGoofys(ctx context.Context, bucket string, flags *FlagStorage,
	newBackend func(string, *FlagStorage) (StorageBackend, error)) *Goofys {
	// Set up the basic struct.
	fs := &Goofys{
		bucket: bucket,
		flags:  flags,
		umask:  0122,
	}

	// Split "bucket:prefix" at the first colon.
	var prefix string
	colon := strings.Index(bucket, ":")
	if colon != -1 {
		prefix = bucket[colon+1:]
		prefix = strings.Trim(prefix, "/")
		if prefix != "" {
			prefix += "/"
		}

		fs.bucket = bucket[0:colon]
		bucket = fs.bucket
	}

	if flags.DebugS3 {
		s3Log.Level = logrus.DebugLevel
	}

	cloud, err := newBackend(bucket, flags)
	if err != nil {
		log.Errorf("Unable to setup backend: %v", err)
		return nil
	}
	_, fs.gcs = cloud.Delegate().(*GCS3)

	// Init is handed a random 32-character key under the prefix.
	// NOTE(review): presumably used as a probe object to verify access --
	// confirm against the backend implementations.
	randomObjectName := prefix + (RandStringBytesMaskImprSrc(32))
	err = cloud.Init(randomObjectName)
	if err != nil {
		log.Errorf("Unable to access '%v': %v", bucket, err)
		return nil
	}
	// Runs in the background; nothing waits for it or observes its result.
	go cloud.MultipartExpire(&MultipartExpireInput{})

	now := time.Now()
	fs.rootAttrs = InodeAttributes{
		Size:  4096,
		Mtime: now,
	}

	fs.bufferPool = BufferPool{}.Init()

	// The root takes fuseops.RootInodeID, so allocation starts right after
	// it. The inode map must exist before addDotAndDotDot runs below.
	fs.nextInodeID = fuseops.RootInodeID + 1
	fs.inodes = make(map[fuseops.InodeID]*Inode)
	root := NewInode(fs, nil, PString(""))
	root.Id = fuseops.RootInodeID
	root.ToDir()
	root.dir.cloud = cloud
	root.dir.mountPrefix = prefix
	root.Attributes.Mtime = fs.rootAttrs.Mtime

	fs.inodes[fuseops.RootInodeID] = root
	fs.addDotAndDotDot(root)

	fs.nextHandleID = 1
	fs.dirHandles = make(map[fuseops.HandleID]*DirHandle)

	fs.fileHandles = make(map[fuseops.HandleID]*FileHandle)

	fs.replicators = Ticket{Total: 16}.Init()
	fs.restorers = Ticket{Total: 20}.Init()

	return fs
}
   240  
   241  // from https://stackoverflow.com/questions/22892120/how-to-generate-a-random-string-of-a-fixed-length-in-golang
   242  func RandStringBytesMaskImprSrc(n int) string {
   243  	const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
   244  	const (
   245  		letterIdxBits = 6                    // 6 bits to represent a letter index
   246  		letterIdxMask = 1<<letterIdxBits - 1 // All 1-bits, as many as letterIdxBits
   247  		letterIdxMax  = 63 / letterIdxBits   // # of letter indices fitting in 63 bits
   248  	)
   249  	src := rand.NewSource(time.Now().UnixNano())
   250  	b := make([]byte, n)
   251  	// A src.Int63() generates 63 random bits, enough for letterIdxMax characters!
   252  	for i, cache, remain := n-1, src.Int63(), letterIdxMax; i >= 0; {
   253  		if remain == 0 {
   254  			cache, remain = src.Int63(), letterIdxMax
   255  		}
   256  		if idx := int(cache & letterIdxMask); idx < len(letterBytes) {
   257  			b[i] = letterBytes[idx]
   258  			i--
   259  		}
   260  		cache >>= letterIdxBits
   261  		remain--
   262  	}
   263  
   264  	return string(b)
   265  }
   266  
   267  func (fs *Goofys) SigUsr1() {
   268  	fs.mu.RLock()
   269  
   270  	log.Infof("forgot %v inodes", fs.forgotCnt)
   271  	log.Infof("%v inodes", len(fs.inodes))
   272  	fs.mu.RUnlock()
   273  	debug.FreeOSMemory()
   274  }
   275  
   276  // Find the given inode. Panic if it doesn't exist.
   277  //
   278  // RLOCKS_REQUIRED(fs.mu)
   279  func (fs *Goofys) getInodeOrDie(id fuseops.InodeID) (inode *Inode) {
   280  	inode = fs.inodes[id]
   281  	if inode == nil {
   282  		panic(fmt.Sprintf("Unknown inode: %v", id))
   283  	}
   284  
   285  	return
   286  }
   287  
// Mount describes a backend to be attached at a path below the root of the
// file system. fs.mount copies cloud and prefix onto the directory inode
// at that path and sets mounted to true once done; a Mount whose mounted
// flag is already true is skipped.
type Mount struct {
	// Mount Point relative to goofys's root mount.
	name    string
	cloud   StorageBackend
	prefix  string
	mounted bool
}
   295  
// mount attaches the backend described by b below mp, creating any missing
// intermediate directory inodes named in b.name. It is a no-op when
// b.mounted is already true and sets b.mounted on completion. It panics if
// an inode already exists at the final path component and is not a
// directory.
//
// Locks: takes mp.mu (and fs.mu inside it) for each path component.
func (fs *Goofys) mount(mp *Inode, b *Mount) {
	if b.mounted {
		return
	}

	name := strings.Trim(b.name, "/")

	// create path for the mount. AttrTime is set to TIME_MAX so
	// they will never expire and be removed. But DirTime is not
	// so we will still consult the underlying cloud for listing
	// (which will then be merged with the cached result)

	// Walk every component except the last, descending into (or creating)
	// the corresponding directory inode.
	for {
		idx := strings.Index(name, "/")
		if idx == -1 {
			break
		}
		dirName := name[0:idx]
		name = name[idx+1:]

		mp.mu.Lock()
		dirInode := mp.findChildUnlocked(dirName)
		if dirInode == nil {
			fs.mu.Lock()

			dirInode = NewInode(fs, mp, &dirName)
			dirInode.ToDir()
			dirInode.AttrTime = TIME_MAX

			fs.insertInode(mp, dirInode)
			fs.mu.Unlock()
		}
		mp.mu.Unlock()
		mp = dirInode
	}

	// name is now the last path component; mp is its parent directory.
	mp.mu.Lock()
	defer mp.mu.Unlock()

	prev := mp.findChildUnlocked(name)
	if prev == nil {
		// Nothing there yet: create the mount point inode.
		mountInode := NewInode(fs, mp, &name)
		mountInode.ToDir()
		mountInode.dir.cloud = b.cloud
		mountInode.dir.mountPrefix = b.prefix
		mountInode.AttrTime = TIME_MAX

		fs.mu.Lock()
		defer fs.mu.Unlock()

		fs.insertInode(mp, mountInode)
		prev = mountInode
	} else {
		if !prev.isDir() {
			panic(fmt.Sprintf("inode %v is not a directory", *prev.FullName()))
		}

		// This inode might have some cached data from a parent mount.
		// Clear this cache by resetting the DirTime.
		// Note: resetDirTimeRec should be called without holding the lock.
		// NOTE(review): mp.mu is held at this point (deferred unlock above);
		// presumably "the lock" means prev.mu, which is taken only after the
		// call -- confirm.
		prev.resetDirTimeRec()
		prev.mu.Lock()
		defer prev.mu.Unlock()
		prev.dir.cloud = b.cloud
		prev.dir.mountPrefix = b.prefix
		prev.AttrTime = TIME_MAX

	}
	fuseLog.Infof("mounted /%v", *prev.FullName())
	b.mounted = true
}
   367  
   368  func (fs *Goofys) MountAll(mounts []*Mount) {
   369  	fs.mu.RLock()
   370  	root := fs.getInodeOrDie(fuseops.RootInodeID)
   371  	fs.mu.RUnlock()
   372  
   373  	for _, m := range mounts {
   374  		fs.mount(root, m)
   375  	}
   376  }
   377  
   378  func (fs *Goofys) Mount(mount *Mount) {
   379  	fs.mu.RLock()
   380  	root := fs.getInodeOrDie(fuseops.RootInodeID)
   381  	fs.mu.RUnlock()
   382  	fs.mount(root, mount)
   383  }
   384  
   385  func (fs *Goofys) Unmount(mountPoint string) {
   386  	fs.mu.RLock()
   387  	mp := fs.getInodeOrDie(fuseops.RootInodeID)
   388  	fs.mu.RUnlock()
   389  
   390  	fuseLog.Infof("Attempting to unmount %v", mountPoint)
   391  	path := strings.Split(strings.Trim(mountPoint, "/"), "/")
   392  	for _, localName := range path {
   393  		dirInode := mp.findChild(localName)
   394  		if dirInode == nil || !dirInode.isDir() {
   395  			fuseLog.Errorf("Failed to find directory:%v while unmounting %v. "+
   396  				"Ignoring the unmount operation.", localName, mountPoint)
   397  			return
   398  		}
   399  		mp = dirInode
   400  	}
   401  	mp.ResetForUnmount()
   402  	return
   403  }
   404  
   405  func (fs *Goofys) StatFS(
   406  	ctx context.Context,
   407  	op *fuseops.StatFSOp) (err error) {
   408  
   409  	const BLOCK_SIZE = 4096
   410  	const TOTAL_SPACE = 1 * 1024 * 1024 * 1024 * 1024 * 1024 // 1PB
   411  	const TOTAL_BLOCKS = TOTAL_SPACE / BLOCK_SIZE
   412  	const INODES = 1 * 1000 * 1000 * 1000 // 1 billion
   413  	op.BlockSize = BLOCK_SIZE
   414  	op.Blocks = TOTAL_BLOCKS
   415  	op.BlocksFree = TOTAL_BLOCKS
   416  	op.BlocksAvailable = TOTAL_BLOCKS
   417  	op.IoSize = 1 * 1024 * 1024 // 1MB
   418  	op.Inodes = INODES
   419  	op.InodesFree = INODES
   420  	return
   421  }
   422  
   423  func (fs *Goofys) GetInodeAttributes(
   424  	ctx context.Context,
   425  	op *fuseops.GetInodeAttributesOp) (err error) {
   426  
   427  	fs.mu.RLock()
   428  	inode := fs.getInodeOrDie(op.Inode)
   429  	fs.mu.RUnlock()
   430  
   431  	attr, err := inode.GetAttributes()
   432  	if err == nil {
   433  		op.Attributes = *attr
   434  		op.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
   435  	}
   436  
   437  	return
   438  }
   439  
   440  func (fs *Goofys) GetXattr(ctx context.Context,
   441  	op *fuseops.GetXattrOp) (err error) {
   442  	fs.mu.RLock()
   443  	inode := fs.getInodeOrDie(op.Inode)
   444  	fs.mu.RUnlock()
   445  
   446  	value, err := inode.GetXattr(op.Name)
   447  	if err != nil {
   448  		return
   449  	}
   450  
   451  	op.BytesRead = len(value)
   452  
   453  	if len(op.Dst) != 0 {
   454  		if len(op.Dst) < op.BytesRead {
   455  			return syscall.ERANGE
   456  		}
   457  
   458  		copy(op.Dst, value)
   459  	}
   460  	return
   461  }
   462  
   463  func (fs *Goofys) ListXattr(ctx context.Context,
   464  	op *fuseops.ListXattrOp) (err error) {
   465  	fs.mu.RLock()
   466  	inode := fs.getInodeOrDie(op.Inode)
   467  	fs.mu.RUnlock()
   468  
   469  	xattrs, err := inode.ListXattr()
   470  
   471  	ncopied := 0
   472  
   473  	for _, name := range xattrs {
   474  		buf := op.Dst[ncopied:]
   475  		nlen := len(name) + 1
   476  
   477  		if nlen <= len(buf) {
   478  			copy(buf, name)
   479  			ncopied += nlen
   480  			buf[nlen-1] = '\x00'
   481  		}
   482  
   483  		op.BytesRead += nlen
   484  	}
   485  
   486  	if len(op.Dst) != 0 && ncopied < op.BytesRead {
   487  		err = syscall.ERANGE
   488  	}
   489  
   490  	return
   491  }
   492  
   493  func (fs *Goofys) RemoveXattr(ctx context.Context,
   494  	op *fuseops.RemoveXattrOp) (err error) {
   495  	fs.mu.RLock()
   496  	inode := fs.getInodeOrDie(op.Inode)
   497  	fs.mu.RUnlock()
   498  
   499  	err = inode.RemoveXattr(op.Name)
   500  
   501  	return
   502  }
   503  
   504  func (fs *Goofys) SetXattr(ctx context.Context,
   505  	op *fuseops.SetXattrOp) (err error) {
   506  	fs.mu.RLock()
   507  	inode := fs.getInodeOrDie(op.Inode)
   508  	fs.mu.RUnlock()
   509  
   510  	err = inode.SetXattr(op.Name, op.Value, op.Flags)
   511  	return
   512  }
   513  
   514  func mapHttpError(status int) error {
   515  	switch status {
   516  	case 400:
   517  		return fuse.EINVAL
   518  	case 401:
   519  		return syscall.EACCES
   520  	case 403:
   521  		return syscall.EACCES
   522  	case 404:
   523  		return fuse.ENOENT
   524  	case 405:
   525  		return syscall.ENOTSUP
   526  	case http.StatusConflict:
   527  		return syscall.EINTR
   528  	case 429:
   529  		return syscall.EAGAIN
   530  	case 500:
   531  		return syscall.EAGAIN
   532  	default:
   533  		return nil
   534  	}
   535  }
   536  
   537  func mapAwsError(err error) error {
   538  	if err == nil {
   539  		return nil
   540  	}
   541  
   542  	if awsErr, ok := err.(awserr.Error); ok {
   543  		switch awsErr.Code() {
   544  		case "BucketRegionError":
   545  			// don't need to log anything, we should detect region after
   546  			return err
   547  		case "NoSuchBucket":
   548  			return syscall.ENXIO
   549  		case "BucketAlreadyOwnedByYou":
   550  			return fuse.EEXIST
   551  		}
   552  
   553  		if reqErr, ok := err.(awserr.RequestFailure); ok {
   554  			// A service error occurred
   555  			err = mapHttpError(reqErr.StatusCode())
   556  			if err != nil {
   557  				return err
   558  			} else {
   559  				s3Log.Errorf("http=%v %v s3=%v request=%v\n",
   560  					reqErr.StatusCode(), reqErr.Message(),
   561  					awsErr.Code(), reqErr.RequestID())
   562  				return reqErr
   563  			}
   564  		} else {
   565  			// Generic AWS Error with Code, Message, and original error (if any)
   566  			s3Log.Errorf("code=%v msg=%v, err=%v\n", awsErr.Code(), awsErr.Message(), awsErr.OrigErr())
   567  			return awsErr
   568  		}
   569  	} else {
   570  		return err
   571  	}
   572  }
   573  
   574  // note that this is NOT the same as url.PathEscape in golang 1.8,
   575  // as this preserves / and url.PathEscape converts / to %2F
   576  func pathEscape(path string) string {
   577  	u := url.URL{Path: path}
   578  	return u.EscapedPath()
   579  }
   580  
   581  func (fs *Goofys) allocateInodeId() (id fuseops.InodeID) {
   582  	id = fs.nextInodeID
   583  	fs.nextInodeID++
   584  	return
   585  }
   586  
   587  func expired(cache time.Time, ttl time.Duration) bool {
   588  	now := time.Now()
   589  	if cache.After(now) {
   590  		return false
   591  	}
   592  	return !cache.Add(ttl).After(now)
   593  }
   594  
// LookUpInode resolves op.Name inside directory op.Parent and fills
// op.Entry with the child's inode ID, attributes and expiration times.
//
// A child already known to the parent is used as is while its AttrTime has
// not expired under the StatCacheTTL flag, or while it has open file
// handles. Otherwise the parent's LookUp is consulted and the result is
// either inserted as a new inode or merged into the existing one. Errors
// from LookUp are returned, except ENOENT for an already-known directory.
func (fs *Goofys) LookUpInode(
	ctx context.Context,
	op *fuseops.LookUpInodeOp) (err error) {

	var inode *Inode
	var ok bool
	defer func() { fuseLog.Debugf("<-- LookUpInode %v %v %v", op.Parent, op.Name, err) }()

	fs.mu.RLock()
	parent := fs.getInodeOrDie(op.Parent)
	fs.mu.RUnlock()

	// First try the parent's in-memory children. ok ends up true only when
	// the cached inode can be returned without asking the parent's LookUp.
	parent.mu.Lock()
	inode = parent.findChildUnlocked(op.Name)
	if inode != nil {
		ok = true
		// Take a reference up front; it is dropped again below if the
		// lookup fails.
		inode.Ref()

		if expired(inode.AttrTime, fs.flags.StatCacheTTL) {
			ok = false
			if atomic.LoadInt32(&inode.fileHandles) != 0 {
				// we have an open file handle, object
				// in S3 may not represent the true
				// state of the file anyway, so just
				// return what we know which is
				// potentially more accurate
				ok = true
			} else {
				inode.logFuse("lookup expired")
			}
		}
	} else {
		ok = false
	}
	parent.mu.Unlock()

	if !ok {
		var newInode *Inode

		newInode, err = parent.LookUp(op.Name)
		if err == fuse.ENOENT && inode != nil && inode.isDir() {
			// we may not be able to look up an implicit
			// dir if all the children are removed, so we
			// just pretend this dir is still around
			err = nil
		} else if err != nil {
			if inode != nil {
				// just kidding! pretend we didn't up the ref
				fs.mu.Lock()
				defer fs.mu.Unlock()

				stale := inode.DeRef(1)
				if stale {
					delete(fs.inodes, inode.Id)
					parent.removeChild(inode)
				}
			}
			return err
		}

		if inode == nil {
			parent.mu.Lock()
			// check again if it's there, could have been
			// added by another lookup or readdir
			inode = parent.findChildUnlocked(op.Name)
			if inode == nil {
				fs.mu.Lock()
				inode = newInode
				fs.insertInode(parent, inode)
				fs.mu.Unlock()
			}
			parent.mu.Unlock()
		} else {
			// Merge the freshly looked-up state into the existing inode.
			inode.mu.Lock()

			// newInode is nil when the ENOENT above was suppressed for a
			// known directory.
			if newInode != nil {
				// if only size changed, kernel seems to
				// automatically drop cache
				if !inode.Attributes.Equal(newInode.Attributes) {
					inode.logFuse("invalidate cache because attributes changed", inode.Attributes, newInode.Attributes)
					inode.invalidateCache = true
				} else if inode.knownETag != nil &&
					newInode.knownETag != nil &&
					*inode.knownETag != *newInode.knownETag {
					// if this is a new file (ie:
					// inode.knownETag is nil),
					// then prefer to read our own
					// write then reading updated
					// data
					inode.logFuse("invalidate cache because etag changed", *inode.knownETag, *newInode.knownETag)
					inode.invalidateCache = true
				}

				if newInode.Attributes.Mtime.IsZero() {
					// this can happen if it's an
					// implicit dir, use the last
					// known value
					newInode.Attributes.Mtime = inode.Attributes.Mtime
				}
				inode.Attributes = newInode.Attributes
				inode.knownETag = newInode.knownETag
			}
			// Restart the attribute cache clock for this inode.
			inode.AttrTime = time.Now()

			inode.mu.Unlock()
		}
	}

	op.Entry.Child = inode.Id
	op.Entry.Attributes = inode.InflateAttributes()
	op.Entry.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
	op.Entry.EntryExpiration = time.Now().Add(fs.flags.TypeCacheTTL)

	return
}
   710  
   711  // LOCKS_REQUIRED(fs.mu)
   712  // LOCKS_REQUIRED(parent.mu)
   713  func (fs *Goofys) insertInode(parent *Inode, inode *Inode) {
   714  	addInode := false
   715  	if *inode.Name == "." {
   716  		inode.Id = parent.Id
   717  	} else if *inode.Name == ".." {
   718  		inode.Id = fuseops.InodeID(fuseops.RootInodeID)
   719  		if parent.Parent != nil {
   720  			inode.Id = parent.Parent.Id
   721  		}
   722  	} else {
   723  		if inode.Id != 0 {
   724  			panic(fmt.Sprintf("inode id is set: %v %v", *inode.Name, inode.Id))
   725  		}
   726  		inode.Id = fs.allocateInodeId()
   727  		addInode = true
   728  	}
   729  	parent.insertChildUnlocked(inode)
   730  	if addInode {
   731  		fs.inodes[inode.Id] = inode
   732  
   733  		// if we are inserting a new directory, also create
   734  		// the child . and ..
   735  		if inode.isDir() {
   736  			fs.addDotAndDotDot(inode)
   737  		}
   738  	}
   739  }
   740  
   741  func (fs *Goofys) addDotAndDotDot(dir *Inode) {
   742  	dot := NewInode(fs, dir, PString("."))
   743  	dot.ToDir()
   744  	dot.AttrTime = TIME_MAX
   745  	fs.insertInode(dir, dot)
   746  
   747  	dot = NewInode(fs, dir, PString(".."))
   748  	dot.ToDir()
   749  	dot.AttrTime = TIME_MAX
   750  	fs.insertInode(dir, dot)
   751  }
   752  
// ForgetInode drops op.N references from inode op.Inode. When DeRef
// reports the inode as stale it is removed from fs.inodes and from its
// parent's children, and the forgotten-inode counter is incremented.
//
// Locks: the parent's mu (if the inode has a parent) is taken first and
// held until return; fs.mu is taken inside it only on the stale path.
func (fs *Goofys) ForgetInode(
	ctx context.Context,
	op *fuseops.ForgetInodeOp) (err error) {

	fs.mu.RLock()
	inode := fs.getInodeOrDie(op.Inode)
	fs.mu.RUnlock()

	parent := inode.Parent
	if parent != nil {
		parent.mu.Lock()
		defer parent.mu.Unlock()
	}
	stale := inode.DeRef(op.N)

	if stale {
		fs.mu.Lock()
		defer fs.mu.Unlock()

		delete(fs.inodes, op.Inode)
		fs.forgotCnt += 1

		if parent != nil {
			// parent.mu is already held, hence the Unlocked variant.
			parent.removeChildUnlocked(inode)
		}
	}

	return
}
   782  
   783  func (fs *Goofys) OpenDir(
   784  	ctx context.Context,
   785  	op *fuseops.OpenDirOp) (err error) {
   786  	fs.mu.Lock()
   787  
   788  	handleID := fs.nextHandleID
   789  	fs.nextHandleID++
   790  
   791  	in := fs.getInodeOrDie(op.Inode)
   792  	fs.mu.Unlock()
   793  
   794  	// XXX/is this a dir?
   795  	dh := in.OpenDir()
   796  
   797  	fs.mu.Lock()
   798  	defer fs.mu.Unlock()
   799  
   800  	fs.dirHandles[handleID] = dh
   801  	op.Handle = handleID
   802  
   803  	return
   804  }
   805  
   806  func makeDirEntry(en *DirHandleEntry) fuseutil.Dirent {
   807  	return fuseutil.Dirent{
   808  		Name:   en.Name,
   809  		Type:   en.Type,
   810  		Inode:  en.Inode,
   811  		Offset: en.Offset,
   812  	}
   813  }
   814  
   815  func (fs *Goofys) ReadDir(
   816  	ctx context.Context,
   817  	op *fuseops.ReadDirOp) (err error) {
   818  
   819  	// Find the handle.
   820  	fs.mu.RLock()
   821  	dh := fs.dirHandles[op.Handle]
   822  	fs.mu.RUnlock()
   823  
   824  	if dh == nil {
   825  		panic(fmt.Sprintf("can't find dh=%v", op.Handle))
   826  	}
   827  
   828  	inode := dh.inode
   829  	inode.logFuse("ReadDir", op.Offset)
   830  
   831  	dh.mu.Lock()
   832  	defer dh.mu.Unlock()
   833  
   834  	for i := op.Offset; ; i++ {
   835  		e, err := dh.ReadDir(i)
   836  		if err != nil {
   837  			return err
   838  		}
   839  		if e == nil {
   840  			break
   841  		}
   842  
   843  		if e.Inode == 0 {
   844  			panic(fmt.Sprintf("unset inode %v", e.Name))
   845  		}
   846  
   847  		n := fuseutil.WriteDirent(op.Dst[op.BytesRead:], makeDirEntry(e))
   848  		if n == 0 {
   849  			break
   850  		}
   851  
   852  		dh.inode.logFuse("<-- ReadDir", e.Name, e.Offset)
   853  
   854  		op.BytesRead += n
   855  	}
   856  
   857  	return
   858  }
   859  
   860  func (fs *Goofys) ReleaseDirHandle(
   861  	ctx context.Context,
   862  	op *fuseops.ReleaseDirHandleOp) (err error) {
   863  
   864  	fs.mu.Lock()
   865  	defer fs.mu.Unlock()
   866  
   867  	dh := fs.dirHandles[op.Handle]
   868  	dh.CloseDir()
   869  
   870  	fuseLog.Debugln("ReleaseDirHandle", *dh.inode.FullName())
   871  
   872  	delete(fs.dirHandles, op.Handle)
   873  
   874  	return
   875  }
   876  
   877  func (fs *Goofys) OpenFile(
   878  	ctx context.Context,
   879  	op *fuseops.OpenFileOp) (err error) {
   880  	fs.mu.RLock()
   881  	in := fs.getInodeOrDie(op.Inode)
   882  	fs.mu.RUnlock()
   883  
   884  	fh, err := in.OpenFile(op.Metadata)
   885  	if err != nil {
   886  		return
   887  	}
   888  
   889  	fs.mu.Lock()
   890  
   891  	handleID := fs.nextHandleID
   892  	fs.nextHandleID++
   893  
   894  	fs.fileHandles[handleID] = fh
   895  	fs.mu.Unlock()
   896  
   897  	op.Handle = handleID
   898  
   899  	in.mu.Lock()
   900  	defer in.mu.Unlock()
   901  
   902  	// this flag appears to tell the kernel if this open should
   903  	// use the page cache or not. "use" here means:
   904  	//
   905  	// read will read from cache
   906  	// write will populate cache
   907  	//
   908  	// because we have one flag to control both behaviors, if an
   909  	// object is updated out-of-band and we need to invalidate
   910  	// cache, and we write to this object locally, subsequent read
   911  	// will not read from cache
   912  	//
   913  	// see tests TestReadNewFileWithExternalChangesFuse and
   914  	// TestReadMyOwnWrite*Fuse
   915  	op.KeepPageCache = !in.invalidateCache
   916  	fh.keepPageCache = op.KeepPageCache
   917  	in.invalidateCache = false
   918  
   919  	return
   920  }
   921  
   922  func (fs *Goofys) ReadFile(
   923  	ctx context.Context,
   924  	op *fuseops.ReadFileOp) (err error) {
   925  
   926  	fs.mu.RLock()
   927  	fh := fs.fileHandles[op.Handle]
   928  	fs.mu.RUnlock()
   929  
   930  	op.BytesRead, err = fh.ReadFile(op.Offset, op.Dst)
   931  
   932  	return
   933  }
   934  
   935  func (fs *Goofys) SyncFile(
   936  	ctx context.Context,
   937  	op *fuseops.SyncFileOp) (err error) {
   938  
   939  	// intentionally ignored, so that write()/sync()/write() works
   940  	// see https://github.com/kahing/goofys/issues/154
   941  	return
   942  }
   943  
// FlushFile flushes file handle op.Handle and returns the handle's result.
//
// When the handle carries a tgid, the flush is performed only if the tgid
// of the requesting process matches it: a mismatch is logged and reported
// as success without flushing, and a failure to determine the tgid is
// reported as EIO.
func (fs *Goofys) FlushFile(
	ctx context.Context,
	op *fuseops.FlushFileOp) (err error) {

	fs.mu.RLock()
	fh := fs.fileHandles[op.Handle]
	fs.mu.RUnlock()

	// If the file handle has a tgid, then flush the file only if the
	// incoming request's tgid matches the tgid in the file handle.
	// This check helps us with scenarios like https://github.com/kahing/goofys/issues/273
	// Also see goofys_test.go:TestClientForkExec.
	if fh.Tgid != nil {
		// err here shadows the named result; both exits from this block
		// return an explicit value.
		tgid, err := GetTgid(op.Metadata.Pid)
		if err != nil {
			fh.inode.logFuse("<-- FlushFile",
				fmt.Sprintf("Failed to retrieve tgid from op.Metadata.Pid. FlushFileOp:%#v, err:%v",
					op, err))
			return fuse.EIO
		}
		if *fh.Tgid != *tgid {
			fh.inode.logFuse("<-- FlushFile",
				"Operation ignored",
				fmt.Sprintf("fh.Pid:%v != tgid:%v, op:%#v", *fh.Tgid, *tgid, op))
			return nil
		}
	}

	err = fh.FlushFile()
	if err != nil {
		// if we returned success from creat() earlier
		// linux may think this file exists even when it doesn't,
		// until TypeCacheTTL is over
		// TODO: figure out a way to make the kernel forget this inode
		// see TestWriteAnonymousFuse
		fs.mu.RLock()
		inode := fs.getInodeOrDie(op.Inode)
		fs.mu.RUnlock()

		// NOTE(review): AttrTime is reset to the zero time without taking
		// inode.mu; presumably this makes the cached attributes count as
		// expired on the next lookup -- confirm.
		if inode.KnownSize == nil {
			inode.AttrTime = time.Time{}
		}

	}
	fh.inode.logFuse("<-- FlushFile", err, op.Handle, op.Inode)
	return
}
   991  
   992  func (fs *Goofys) ReleaseFileHandle(
   993  	ctx context.Context,
   994  	op *fuseops.ReleaseFileHandleOp) (err error) {
   995  	fs.mu.Lock()
   996  	defer fs.mu.Unlock()
   997  	fh := fs.fileHandles[op.Handle]
   998  	fh.Release()
   999  
  1000  	fuseLog.Debugln("ReleaseFileHandle", *fh.inode.FullName(), op.Handle, fh.inode.Id)
  1001  
  1002  	delete(fs.fileHandles, op.Handle)
  1003  
  1004  	// try to compact heap
  1005  	//fs.bufferPool.MaybeGC()
  1006  	return
  1007  }
  1008  
// CreateFile creates op.Name in directory op.Parent, inserts the new inode
// into the file system and registers its file handle. op.Entry is filled
// with the child's ID, attributes and expiration times, and op.Handle with
// the newly allocated handle ID.
//
// Locks: parent.mu is taken first, then fs.mu. parent.mu is released right
// after the insert, while fs.mu stays held until return because the handle
// table is updated further down.
func (fs *Goofys) CreateFile(
	ctx context.Context,
	op *fuseops.CreateFileOp) (err error) {

	fs.mu.RLock()
	parent := fs.getInodeOrDie(op.Parent)
	fs.mu.RUnlock()

	// Create returns no error value here.
	inode, fh := parent.Create(op.Name, op.Metadata)

	parent.mu.Lock()

	fs.mu.Lock()
	defer fs.mu.Unlock()
	fs.insertInode(parent, inode)

	parent.mu.Unlock()

	op.Entry.Child = inode.Id
	op.Entry.Attributes = inode.InflateAttributes()
	op.Entry.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
	op.Entry.EntryExpiration = time.Now().Add(fs.flags.TypeCacheTTL)

	// Allocate a handle.
	handleID := fs.nextHandleID
	fs.nextHandleID++

	fs.fileHandles[handleID] = fh

	op.Handle = handleID

	inode.logFuse("<-- CreateFile")

	return
}
  1044  
  1045  func (fs *Goofys) MkDir(
  1046  	ctx context.Context,
  1047  	op *fuseops.MkDirOp) (err error) {
  1048  
  1049  	fs.mu.RLock()
  1050  	parent := fs.getInodeOrDie(op.Parent)
  1051  	fs.mu.RUnlock()
  1052  
  1053  	// ignore op.Mode for now
  1054  	inode, err := parent.MkDir(op.Name)
  1055  	if err != nil {
  1056  		return err
  1057  	}
  1058  
  1059  	parent.mu.Lock()
  1060  
  1061  	fs.mu.Lock()
  1062  	defer fs.mu.Unlock()
  1063  	fs.insertInode(parent, inode)
  1064  
  1065  	parent.mu.Unlock()
  1066  
  1067  	op.Entry.Child = inode.Id
  1068  	op.Entry.Attributes = inode.InflateAttributes()
  1069  	op.Entry.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
  1070  	op.Entry.EntryExpiration = time.Now().Add(fs.flags.TypeCacheTTL)
  1071  
  1072  	return
  1073  }
  1074  
  1075  func (fs *Goofys) RmDir(
  1076  	ctx context.Context,
  1077  	op *fuseops.RmDirOp) (err error) {
  1078  
  1079  	fs.mu.RLock()
  1080  	parent := fs.getInodeOrDie(op.Parent)
  1081  	fs.mu.RUnlock()
  1082  
  1083  	err = parent.RmDir(op.Name)
  1084  	parent.logFuse("<-- RmDir", op.Name, err)
  1085  	return
  1086  }
  1087  
  1088  func (fs *Goofys) SetInodeAttributes(
  1089  	ctx context.Context,
  1090  	op *fuseops.SetInodeAttributesOp) (err error) {
  1091  
  1092  	fs.mu.RLock()
  1093  	inode := fs.getInodeOrDie(op.Inode)
  1094  	fs.mu.RUnlock()
  1095  
  1096  	attr, err := inode.GetAttributes()
  1097  	if err == nil {
  1098  		op.Attributes = *attr
  1099  		op.AttributesExpiration = time.Now().Add(fs.flags.StatCacheTTL)
  1100  	}
  1101  	return
  1102  }
  1103  
  1104  func (fs *Goofys) WriteFile(
  1105  	ctx context.Context,
  1106  	op *fuseops.WriteFileOp) (err error) {
  1107  
  1108  	fs.mu.RLock()
  1109  
  1110  	fh, ok := fs.fileHandles[op.Handle]
  1111  	if !ok {
  1112  		panic(fmt.Sprintf("WriteFile: can't find handle %v", op.Handle))
  1113  	}
  1114  	fs.mu.RUnlock()
  1115  
  1116  	err = fh.WriteFile(op.Offset, op.Data)
  1117  
  1118  	return
  1119  }
  1120  
  1121  func (fs *Goofys) Unlink(
  1122  	ctx context.Context,
  1123  	op *fuseops.UnlinkOp) (err error) {
  1124  
  1125  	fs.mu.RLock()
  1126  	parent := fs.getInodeOrDie(op.Parent)
  1127  	fs.mu.RUnlock()
  1128  
  1129  	err = parent.Unlink(op.Name)
  1130  	return
  1131  }
  1132  
  1133  // rename("from", "to") causes the kernel to send lookup of "from" and
  1134  // "to" prior to sending rename to us
  1135  func (fs *Goofys) Rename(
  1136  	ctx context.Context,
  1137  	op *fuseops.RenameOp) (err error) {
  1138  
  1139  	fs.mu.RLock()
  1140  	parent := fs.getInodeOrDie(op.OldParent)
  1141  	newParent := fs.getInodeOrDie(op.NewParent)
  1142  	fs.mu.RUnlock()
  1143  
  1144  	// XXX don't hold the lock the entire time
  1145  	if op.OldParent == op.NewParent {
  1146  		parent.mu.Lock()
  1147  		defer parent.mu.Unlock()
  1148  	} else {
  1149  		// lock ordering to prevent deadlock
  1150  		if op.OldParent < op.NewParent {
  1151  			parent.mu.Lock()
  1152  			newParent.mu.Lock()
  1153  		} else {
  1154  			newParent.mu.Lock()
  1155  			parent.mu.Lock()
  1156  		}
  1157  		defer parent.mu.Unlock()
  1158  		defer newParent.mu.Unlock()
  1159  	}
  1160  
  1161  	err = parent.Rename(op.OldName, newParent, op.NewName)
  1162  	if err != nil {
  1163  		if err == fuse.ENOENT {
  1164  			// if the source doesn't exist, it could be
  1165  			// because this is a new file and we haven't
  1166  			// flushed it yet, pretend that's ok because
  1167  			// when we flush we will handle the rename
  1168  			inode := parent.findChildUnlocked(op.OldName)
  1169  			if inode != nil && atomic.LoadInt32(&inode.fileHandles) != 0 {
  1170  				err = nil
  1171  			}
  1172  		}
  1173  	}
  1174  	if err == nil {
  1175  		inode := parent.findChildUnlocked(op.OldName)
  1176  		if inode != nil {
  1177  			inode.mu.Lock()
  1178  			defer inode.mu.Unlock()
  1179  
  1180  			parent.removeChildUnlocked(inode)
  1181  
  1182  			newNode := newParent.findChildUnlocked(op.NewName)
  1183  			if newNode != nil {
  1184  				// this file's been overwritten, it's
  1185  				// been detached but we can't delete
  1186  				// it just yet, because the kernel
  1187  				// will still send forget ops to us
  1188  				newParent.removeChildUnlocked(newNode)
  1189  				newNode.Parent = nil
  1190  			}
  1191  
  1192  			inode.Name = &op.NewName
  1193  			inode.Parent = newParent
  1194  			newParent.insertChildUnlocked(inode)
  1195  		}
  1196  	}
  1197  	return
  1198  }