github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/fsutil/inode_cached.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package fsutil
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  
    21  	"github.com/SagerNet/gvisor/pkg/context"
    22  	"github.com/SagerNet/gvisor/pkg/hostarch"
    23  	"github.com/SagerNet/gvisor/pkg/log"
    24  	"github.com/SagerNet/gvisor/pkg/safemem"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    26  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    30  	"github.com/SagerNet/gvisor/pkg/sync"
    31  	"github.com/SagerNet/gvisor/pkg/usermem"
    32  )
    33  
    34  // Lock order (compare the lock order model in mm/mm.go):
    35  //
    36  // CachingInodeOperations.attrMu ("fs locks")
    37  //   CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate")
    38  //     CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate")
    39  //       CachedFileObject locks
    40  
    41  // CachingInodeOperations caches the metadata and content of a CachedFileObject.
    42  // It implements a subset of InodeOperations. As a utility it can be used to
    43  // implement the full set of InodeOperations. Generally it should not be
    44  // embedded to avoid unexpected inherited behavior.
    45  //
    46  // CachingInodeOperations implements Mappable for the CachedFileObject:
    47  //
    48  // - If CachedFileObject.FD returns a value >= 0 then the file descriptor
    49  //   will be memory mapped on the host.
    50  //
    51  // - Otherwise, the contents of CachedFileObject are buffered into memory
    52  //   managed by the CachingInodeOperations.
    53  //
    54  // Implementations of FileOperations for a CachedFileObject must read and
    55  // write through CachingInodeOperations using Read and Write respectively.
    56  //
    57  // Implementations of InodeOperations.WriteOut must call Sync to write out
    58  // in-memory modifications of data and metadata to the CachedFileObject.
    59  //
    60  // +stateify savable
    61  type CachingInodeOperations struct {
    62  	// backingFile is a handle to a cached file object.
    63  	backingFile CachedFileObject
    64  
    65  	// mfp is used to allocate memory that caches backingFile's contents.
    66  	mfp pgalloc.MemoryFileProvider
    67  
    68  	// opts contains options. opts is immutable.
    69  	opts CachingInodeOperationsOptions
    70  
    71  	attrMu sync.Mutex `state:"nosave"`
    72  
    73  	// attr is unstable cached metadata.
    74  	//
    75  	// attr is protected by attrMu. attr.Size is protected by both attrMu and
    76  	// dataMu; reading it requires locking either mutex, while mutating it
    77  	// requires locking both.
    78  	attr fs.UnstableAttr
    79  
    80  	// dirtyAttr is metadata that was updated in-place but hasn't yet
    81  	// been successfully written out.
    82  	//
    83  	// dirtyAttr is protected by attrMu.
    84  	dirtyAttr fs.AttrMask
    85  
    86  	mapsMu sync.Mutex `state:"nosave"`
    87  
    88  	// mappings tracks mappings of the cached file object into
    89  	// memmap.MappingSpaces.
    90  	//
    91  	// mappings is protected by mapsMu.
    92  	mappings memmap.MappingSet
    93  
    94  	dataMu sync.RWMutex `state:"nosave"`
    95  
    96  	// cache maps offsets into the cached file to offsets into
    97  	// mfp.MemoryFile() that store the file's data.
    98  	//
    99  	// cache is protected by dataMu.
   100  	cache FileRangeSet
   101  
   102  	// dirty tracks dirty segments in cache.
   103  	//
   104  	// dirty is protected by dataMu.
   105  	dirty DirtySet
   106  
   107  	// hostFileMapper caches internal mappings of backingFile.FD().
   108  	hostFileMapper *HostFileMapper
   109  
   110  	// refs tracks active references to data in the cache.
   111  	//
   112  	// refs is protected by dataMu.
   113  	refs FrameRefSet
   114  }
   115  
   116  // CachingInodeOperationsOptions configures a CachingInodeOperations.
   117  //
   118  // +stateify savable
   119  type CachingInodeOperationsOptions struct {
   120  	// If ForcePageCache is true, use the sentry page cache even if a host file
   121  	// descriptor is available.
   122  	ForcePageCache bool
   123  
   124  	// If LimitHostFDTranslation is true, apply maxFillRange() constraints to
   125  	// host file descriptor mappings returned by
   126  	// CachingInodeOperations.Translate().
   127  	LimitHostFDTranslation bool
   128  }
   129  
   130  // CachedFileObject is a file that may require caching.
   131  type CachedFileObject interface {
   132  	// ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
   133  	// starting at offset, and returns the number of bytes read. ReadToBlocksAt
   134  	// may return a partial read without an error.
   135  	ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)
   136  
   137  	// WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the
   138  	// file, starting at offset, and returns the number of bytes written.
   139  	// WriteFromBlocksAt may return a partial write without an error.
   140  	WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)
   141  
   142  	// SetMaskedAttributes sets the attributes in attr that are true in
   143  	// mask on the backing file. If the mask contains only ATime or MTime
   144  	// and the CachedFileObject has an FD to the file, then this operation
   145  	// is a noop unless forceSetTimestamps is true. This avoids an extra
   146  	// RPC to the gofer in the open-read/write-close case, when the
   147  	// timestamps on the file will be updated by the host kernel for us.
   148  	//
   149  	// SetMaskedAttributes may be called at any point, regardless of whether
   150  	// the file was opened.
   151  	SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error
   152  
   153  	// Allocate allows the caller to reserve disk space for the inode.
   154  	// It's equivalent to fallocate(2) with 'mode=0'.
   155  	Allocate(ctx context.Context, offset int64, length int64) error
   156  
   157  	// Sync instructs the remote filesystem to sync the file to stable storage.
   158  	Sync(ctx context.Context) error
   159  
   160  	// FD returns a host file descriptor. If it is possible for
   161  	// CachingInodeOperations.AddMapping to have ever been called with writable
   162  	// = true, the FD must have been opened O_RDWR; otherwise, it may have been
   163  	// opened O_RDONLY or O_RDWR. (mmap unconditionally requires that mapped
   164  	// files are readable.) If no host file descriptor is available, FD returns
   165  	// a negative number.
   166  	//
   167  	// For any given CachedFileObject, if FD() ever succeeds (returns a
   168  	// non-negative number), it must always succeed.
   169  	//
   170  	// FD is called iff the file has been memory mapped. This implies that
   171  	// the file was opened (see fs.InodeOperations.GetFile).
   172  	FD() int
   173  }
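
// Illustrative sketch (hypothetical, not part of the original file): a minimal
// CachedFileObject backed by an in-memory byte slice with no host file
// descriptor. It only demonstrates the interface contract: partial reads and
// writes without an error are allowed, metadata updates, Allocate, and Sync
// may be no-ops, and FD returns a negative value when no host fd is
// available. The name exampleBufferFileObject is assumed for illustration.
type exampleBufferFileObject struct {
	data []byte
}

func (f *exampleBufferFileObject) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
	if offset >= uint64(len(f.data)) {
		return 0, io.EOF
	}
	// CopySeq stops at the end of the shorter sequence, so a short read is
	// returned without an error, as the interface permits.
	return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data[offset:])))
}

func (f *exampleBufferFileObject) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
	if end := offset + srcs.NumBytes(); end > uint64(len(f.data)) {
		// Grow the buffer (zero-filling any hole) so the write fits.
		f.data = append(f.data, make([]byte, end-uint64(len(f.data)))...)
	}
	return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data[offset:])), srcs)
}

func (f *exampleBufferFileObject) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error {
	return nil // This sketch does not persist metadata.
}

func (f *exampleBufferFileObject) Allocate(ctx context.Context, offset int64, length int64) error {
	return nil // Nothing to reserve for an in-memory buffer.
}

func (f *exampleBufferFileObject) Sync(ctx context.Context) error {
	return nil // No stable storage to sync to.
}

// FD returns a negative value, so CachingInodeOperations buffers file contents
// in sentry-managed memory rather than mapping a host fd.
func (f *exampleBufferFileObject) FD() int {
	return -1
}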
   174  
   175  // NewCachingInodeOperations returns a new CachingInodeOperations backed by
   176  // a CachedFileObject and its initial unstable attributes.
   177  func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, opts CachingInodeOperationsOptions) *CachingInodeOperations {
   178  	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
   179  	if mfp == nil {
   180  		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
   181  	}
   182  	return &CachingInodeOperations{
   183  		backingFile:    backingFile,
   184  		mfp:            mfp,
   185  		opts:           opts,
   186  		attr:           uattr,
   187  		hostFileMapper: NewHostFileMapper(),
   188  	}
   189  }
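
// Hypothetical usage sketch (not part of the original file): constructing a
// CachingInodeOperations over the in-memory backing file sketched above. The
// context is assumed to carry a pgalloc.MemoryFileProvider (otherwise
// NewCachingInodeOperations panics), and uattr would normally come from the
// remote filesystem's initial stat of the file.
func exampleNewCachingInodeOperations(ctx context.Context, uattr fs.UnstableAttr) *CachingInodeOperations {
	backing := &exampleBufferFileObject{}
	return NewCachingInodeOperations(ctx, backing, uattr, CachingInodeOperationsOptions{
		// ForcePageCache is redundant here: FD() < 0 already forces the
		// sentry page cache.
		ForcePageCache: false,
	})
}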
   190  
   191  // Release implements fs.InodeOperations.Release.
   192  func (c *CachingInodeOperations) Release() {
   193  	c.mapsMu.Lock()
   194  	defer c.mapsMu.Unlock()
   195  	c.dataMu.Lock()
   196  	defer c.dataMu.Unlock()
   197  
   198  	// Something has gone terribly wrong if we're releasing an inode that is
   199  	// still memory-mapped.
   200  	if !c.mappings.IsEmpty() {
   201  		panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings))
   202  	}
   203  
   204  	// Drop any cached pages that are still awaiting MemoryFile eviction. (This
   205  	// means that MemoryFile no longer needs to evict them.)
   206  	mf := c.mfp.MemoryFile()
   207  	mf.MarkAllUnevictable(c)
   208  	if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
   209  		panic(fmt.Sprintf("Failed to writeback cached data: %v", err))
   210  	}
   211  	c.cache.DropAll(mf)
   212  	c.dirty.RemoveAll()
   213  }
   214  
   215  // UnstableAttr implements fs.InodeOperations.UnstableAttr.
   216  func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
   217  	c.attrMu.Lock()
   218  	attr := c.attr
   219  	c.attrMu.Unlock()
   220  	return attr, nil
   221  }
   222  
   223  // SetPermissions implements fs.InodeOperations.SetPermissions.
   224  func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool {
   225  	c.attrMu.Lock()
   226  	defer c.attrMu.Unlock()
   227  
   228  	now := ktime.NowFromContext(ctx)
   229  	masked := fs.AttrMask{Perms: true}
   230  	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}, false); err != nil {
   231  		return false
   232  	}
   233  	c.attr.Perms = perms
   234  	c.touchStatusChangeTimeLocked(now)
   235  	return true
   236  }
   237  
   238  // SetOwner implements fs.InodeOperations.SetOwner.
   239  func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
   240  	if !owner.UID.Ok() && !owner.GID.Ok() {
   241  		return nil
   242  	}
   243  
   244  	c.attrMu.Lock()
   245  	defer c.attrMu.Unlock()
   246  
   247  	now := ktime.NowFromContext(ctx)
   248  	masked := fs.AttrMask{
   249  		UID: owner.UID.Ok(),
   250  		GID: owner.GID.Ok(),
   251  	}
   252  	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}, false); err != nil {
   253  		return err
   254  	}
   255  	if owner.UID.Ok() {
   256  		c.attr.Owner.UID = owner.UID
   257  	}
   258  	if owner.GID.Ok() {
   259  		c.attr.Owner.GID = owner.GID
   260  	}
   261  	c.touchStatusChangeTimeLocked(now)
   262  	return nil
   263  }
   264  
   265  // SetTimestamps implements fs.InodeOperations.SetTimestamps.
   266  func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
   267  	if ts.ATimeOmit && ts.MTimeOmit {
   268  		return nil
   269  	}
   270  
   271  	c.attrMu.Lock()
   272  	defer c.attrMu.Unlock()
   273  
   274  	// Replace requests to use the "system time" with the current time to
   275  	// ensure that cached timestamps remain consistent with the remote
   276  	// filesystem.
   277  	now := ktime.NowFromContext(ctx)
   278  	if ts.ATimeSetSystemTime {
   279  		ts.ATime = now
   280  	}
   281  	if ts.MTimeSetSystemTime {
   282  		ts.MTime = now
   283  	}
   284  	masked := fs.AttrMask{
   285  		AccessTime:       !ts.ATimeOmit,
   286  		ModificationTime: !ts.MTimeOmit,
   287  	}
   288  	// Call SetMaskedAttributes with forceSetTimestamps = true to make sure
   289  	// the timestamp is updated.
   290  	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}, true); err != nil {
   291  		return err
   292  	}
   293  	if !ts.ATimeOmit {
   294  		c.attr.AccessTime = ts.ATime
   295  	}
   296  	if !ts.MTimeOmit {
   297  		c.attr.ModificationTime = ts.MTime
   298  	}
   299  	c.touchStatusChangeTimeLocked(now)
   300  	return nil
   301  }
   302  
   303  // Truncate implements fs.InodeOperations.Truncate.
   304  func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
   305  	c.attrMu.Lock()
   306  	defer c.attrMu.Unlock()
   307  
   308  	// c.attr.Size is protected by both c.attrMu and c.dataMu.
   309  	c.dataMu.Lock()
   310  	now := ktime.NowFromContext(ctx)
   311  	masked := fs.AttrMask{Size: true}
   312  	attr := fs.UnstableAttr{Size: size}
   313  	if c.attr.Perms.HasSetUIDOrGID() {
   314  		masked.Perms = true
   315  		attr.Perms = c.attr.Perms
   316  		attr.Perms.DropSetUIDAndMaybeGID()
   317  		c.attr.Perms = attr.Perms
   318  	}
   319  	if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr, false); err != nil {
   320  		c.dataMu.Unlock()
   321  		return err
   322  	}
   323  	oldSize := c.attr.Size
   324  	c.attr.Size = size
   325  	c.touchModificationAndStatusChangeTimeLocked(now)
   326  
   327  	// We drop c.dataMu here so that we can lock c.mapsMu and invalidate
   328  	// mappings below. This allows concurrent calls to Read/Translate/etc.
   329  	// These functions synchronize with an in-progress Truncate by refusing to
   330  	// use cache contents beyond the new c.attr.Size. (We are still holding
   331  	// c.attrMu, so we can't race with Truncate/Write.)
   332  	c.dataMu.Unlock()
   333  
   334  	// Nothing left to do unless shrinking the file.
   335  	if size >= oldSize {
   336  		return nil
   337  	}
   338  
   339  	oldpgend := fs.OffsetPageEnd(oldSize)
   340  	newpgend := fs.OffsetPageEnd(size)
   341  
   342  	// Invalidate past translations of truncated pages.
   343  	if newpgend != oldpgend {
   344  		c.mapsMu.Lock()
   345  		c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
   346  			// Compare Linux's mm/truncate.c:truncate_setsize() =>
   347  			// truncate_pagecache() =>
   348  			// mm/memory.c:unmap_mapping_range(evencows=1).
   349  			InvalidatePrivate: true,
   350  		})
   351  		c.mapsMu.Unlock()
   352  	}
   353  
   354  	// We are now guaranteed that there are no translations of truncated pages,
   355  	// and can remove them from the cache. Since truncated pages have been
   356  	// removed from the backing file, they should be dropped without being
   357  	// written back.
   358  	c.dataMu.Lock()
   359  	defer c.dataMu.Unlock()
   360  	c.cache.Truncate(uint64(size), c.mfp.MemoryFile())
   361  	c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend})
   362  
   363  	return nil
   364  }
   365  
   366  // Allocate implements fs.InodeOperations.Allocate.
   367  func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length int64) error {
   368  	newSize := offset + length
   369  
   370  	// c.attr.Size is protected by both c.attrMu and c.dataMu.
   371  	c.attrMu.Lock()
   372  	defer c.attrMu.Unlock()
   373  	c.dataMu.Lock()
   374  	defer c.dataMu.Unlock()
   375  
   376  	if newSize <= c.attr.Size {
   377  		return nil
   378  	}
   379  
   380  	now := ktime.NowFromContext(ctx)
   381  	if err := c.backingFile.Allocate(ctx, offset, length); err != nil {
   382  		return err
   383  	}
   384  
   385  	c.attr.Size = newSize
   386  	c.touchModificationAndStatusChangeTimeLocked(now)
   387  	return nil
   388  }
   389  
   390  // WriteDirtyPagesAndAttrs will write the dirty pages and attributes to the
   391  // gofer without calling Fsync on the remote file.
   392  func (c *CachingInodeOperations) WriteDirtyPagesAndAttrs(ctx context.Context, inode *fs.Inode) error {
   393  	c.attrMu.Lock()
   394  	defer c.attrMu.Unlock()
   395  	c.dataMu.Lock()
   396  	defer c.dataMu.Unlock()
   397  
   398  	// Write dirty pages back.
   399  	err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.mfp.MemoryFile(), c.backingFile.WriteFromBlocksAt)
   400  	if err != nil {
   401  		return err
   402  	}
   403  
   404  	// SyncDirtyAll above would have grown the file if needed. On shrinks, the
   405  	// backing file is called directly, so the size never needs to be updated.
   406  	c.dirtyAttr.Size = false
   407  
   408  	// Write out cached attributes.
   409  	if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr, false); err != nil {
   410  		return err
   411  	}
   412  	c.dirtyAttr = fs.AttrMask{}
   413  
   414  	return nil
   415  }
   416  
   417  // WriteOut implements fs.InodeOperations.WriteOut.
   418  func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
   419  	if err := c.WriteDirtyPagesAndAttrs(ctx, inode); err != nil {
   420  		return err
   421  	}
   422  
   423  	// Fsync the remote file.
   424  	return c.backingFile.Sync(ctx)
   425  }
   426  
   427  // IncLinks increases the link count and updates cached modification time.
   428  func (c *CachingInodeOperations) IncLinks(ctx context.Context) {
   429  	c.attrMu.Lock()
   430  	c.attr.Links++
   431  	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
   432  	c.attrMu.Unlock()
   433  }
   434  
   435  // DecLinks decreases the link count and updates cached modification time.
   436  func (c *CachingInodeOperations) DecLinks(ctx context.Context) {
   437  	c.attrMu.Lock()
   438  	c.attr.Links--
   439  	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
   440  	c.attrMu.Unlock()
   441  }
   442  
   443  // TouchAccessTime updates the cached access time in-place to the
   444  // current time. It does not update status change time in-place. See
   445  // mm/filemap.c:do_generic_file_read -> include/linux/fs.h:file_accessed.
   446  func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) {
   447  	if inode.MountSource.Flags.NoAtime {
   448  		return
   449  	}
   450  
   451  	c.attrMu.Lock()
   452  	c.touchAccessTimeLocked(ktime.NowFromContext(ctx))
   453  	c.attrMu.Unlock()
   454  }
   455  
   456  // touchAccessTimeLocked updates the cached access time in-place to the current
   457  // time.
   458  //
   459  // Preconditions: c.attrMu is locked for writing.
   460  func (c *CachingInodeOperations) touchAccessTimeLocked(now ktime.Time) {
   461  	c.attr.AccessTime = now
   462  	c.dirtyAttr.AccessTime = true
   463  }
   464  
   465  // TouchModificationAndStatusChangeTime updates the cached modification and
   466  // status change times in-place to the current time.
   467  func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx context.Context) {
   468  	c.attrMu.Lock()
   469  	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
   470  	c.attrMu.Unlock()
   471  }
   472  
   473  // touchModificationAndStatusChangeTimeLocked updates the cached modification
   474  // and status change times in-place to the current time.
   475  //
   476  // Preconditions: c.attrMu is locked for writing.
   477  func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now ktime.Time) {
   478  	c.attr.ModificationTime = now
   479  	c.dirtyAttr.ModificationTime = true
   480  	c.attr.StatusChangeTime = now
   481  	c.dirtyAttr.StatusChangeTime = true
   482  }
   483  
   484  // TouchStatusChangeTime updates the cached status change time in-place to the
   485  // current time.
   486  func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) {
   487  	c.attrMu.Lock()
   488  	c.touchStatusChangeTimeLocked(ktime.NowFromContext(ctx))
   489  	c.attrMu.Unlock()
   490  }
   491  
   492  // touchStatusChangeTimeLocked updates the cached status change time
   493  // in-place to the current time.
   494  //
   495  // Preconditions: c.attrMu is locked for writing.
   496  func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now ktime.Time) {
   497  	c.attr.StatusChangeTime = now
   498  	c.dirtyAttr.StatusChangeTime = true
   499  }
   500  
   501  // UpdateUnstable updates the cached unstable attributes. Only non-dirty
   502  // attributes are updated.
   503  func (c *CachingInodeOperations) UpdateUnstable(attr fs.UnstableAttr) {
   504  	// All attributes are protected by attrMu.
   505  	c.attrMu.Lock()
   506  
   507  	if !c.dirtyAttr.Usage {
   508  		c.attr.Usage = attr.Usage
   509  	}
   510  	if !c.dirtyAttr.Perms {
   511  		c.attr.Perms = attr.Perms
   512  	}
   513  	if !c.dirtyAttr.UID {
   514  		c.attr.Owner.UID = attr.Owner.UID
   515  	}
   516  	if !c.dirtyAttr.GID {
   517  		c.attr.Owner.GID = attr.Owner.GID
   518  	}
   519  	if !c.dirtyAttr.AccessTime {
   520  		c.attr.AccessTime = attr.AccessTime
   521  	}
   522  	if !c.dirtyAttr.ModificationTime {
   523  		c.attr.ModificationTime = attr.ModificationTime
   524  	}
   525  	if !c.dirtyAttr.StatusChangeTime {
   526  		c.attr.StatusChangeTime = attr.StatusChangeTime
   527  	}
   528  	if !c.dirtyAttr.Links {
   529  		c.attr.Links = attr.Links
   530  	}
   531  
   532  	// Size requires holding attrMu and dataMu.
   533  	c.dataMu.Lock()
   534  	if !c.dirtyAttr.Size {
   535  		c.attr.Size = attr.Size
   536  	}
   537  	c.dataMu.Unlock()
   538  
   539  	c.attrMu.Unlock()
   540  }
   541  
   542  // Read reads from frames and otherwise directly from the backing file
   543  // into dst starting at offset until dst is full, EOF is reached, or an
   544  // error is encountered.
   545  //
   546  // Read may partially fill dst and return a nil error.
   547  func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
   548  	if dst.NumBytes() == 0 {
   549  		return 0, nil
   550  	}
   551  
   552  	// Have we reached EOF? We check for this again in
   553  	// inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would
   554  	// serialize reads) or c.dataMu (which would violate lock ordering), but
   555  	// check here first (before calling into MM) since reading at EOF is
   556  	// common: getting a return value of 0 from a read syscall is the only way
   557  	// to detect EOF.
   558  	//
   559  	// TODO(jamieliu): Separate out c.attr.Size and use atomics instead of
   560  	// c.dataMu.
   561  	c.dataMu.RLock()
   562  	size := c.attr.Size
   563  	c.dataMu.RUnlock()
   564  	if offset >= size {
   565  		return 0, io.EOF
   566  	}
   567  
   568  	n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset})
   569  	// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
   570  	c.TouchAccessTime(ctx, file.Dirent.Inode)
   571  	return n, err
   572  }
   573  
   574  // Write writes to frames and otherwise directly to the backing file
   575  // from src starting at offset until src is empty or an error is
   576  // encountered.
   577  //
   578  // If Write only partially consumes src, a non-nil error is returned.
   579  func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
   580  	// Hot path. Avoid defers.
   581  	if src.NumBytes() == 0 {
   582  		return 0, nil
   583  	}
   584  
   585  	c.attrMu.Lock()
   586  	// Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time().
   587  	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
   588  	n, err := src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset})
   589  	c.attrMu.Unlock()
   590  	return n, err
   591  }
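
// Hypothetical usage sketch (not part of the original file): writing through
// the cache from a plain byte slice and then flushing dirty pages and dirty
// attributes back to the CachedFileObject. exampleWriteAndFlush and its
// arguments are assumed names; a real caller would be a FileOperations
// implementation holding the inode.
func exampleWriteAndFlush(ctx context.Context, c *CachingInodeOperations, inode *fs.Inode, data []byte, off int64) error {
	// Write copies into cached frames (or writes through to the backing file
	// for uncached ranges) and updates mtime/ctime.
	if _, err := c.Write(ctx, usermem.BytesIOSequence(data), off); err != nil {
		return err
	}
	// WriteOut syncs dirty pages and attributes, then fsyncs the backing file.
	return c.WriteOut(ctx, inode)
}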
   592  
   593  type inodeReadWriter struct {
   594  	ctx    context.Context
   595  	c      *CachingInodeOperations
   596  	offset int64
   597  }
   598  
   599  // ReadToBlocks implements safemem.Reader.ReadToBlocks.
   600  func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
   601  	mem := rw.c.mfp.MemoryFile()
   602  	fillCache := !rw.c.useHostPageCache() && mem.ShouldCacheEvictable()
   603  
   604  	// Hot path. Avoid defers.
   605  	var unlock func()
   606  	if fillCache {
   607  		rw.c.dataMu.Lock()
   608  		unlock = rw.c.dataMu.Unlock
   609  	} else {
   610  		rw.c.dataMu.RLock()
   611  		unlock = rw.c.dataMu.RUnlock
   612  	}
   613  
   614  	// Compute the range to read.
   615  	if rw.offset >= rw.c.attr.Size {
   616  		unlock()
   617  		return 0, io.EOF
   618  	}
   619  	end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size)
   620  	if end == rw.offset { // dsts.NumBytes() == 0?
   621  		unlock()
   622  		return 0, nil
   623  	}
   624  
   625  	var done uint64
   626  	seg, gap := rw.c.cache.Find(uint64(rw.offset))
   627  	for rw.offset < end {
   628  		mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
   629  		switch {
   630  		case seg.Ok():
   631  			// Get internal mappings from the cache.
   632  			ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
   633  			if err != nil {
   634  				unlock()
   635  				return done, err
   636  			}
   637  
   638  			// Copy from internal mappings.
   639  			n, err := safemem.CopySeq(dsts, ims)
   640  			done += n
   641  			rw.offset += int64(n)
   642  			dsts = dsts.DropFirst64(n)
   643  			if err != nil {
   644  				unlock()
   645  				return done, err
   646  			}
   647  
   648  			// Continue.
   649  			seg, gap = seg.NextNonEmpty()
   650  
   651  		case gap.Ok():
   652  			gapMR := gap.Range().Intersect(mr)
   653  			if fillCache {
   654  				// Read into the cache, then re-enter the loop to read from the
   655  				// cache.
   656  				reqMR := memmap.MappableRange{
   657  					Start: uint64(hostarch.Addr(gapMR.Start).RoundDown()),
   658  					End:   fs.OffsetPageEnd(int64(gapMR.End)),
   659  				}
   660  				optMR := gap.Range()
   661  				err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), uint64(rw.c.attr.Size), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
   662  				mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
   663  				seg, gap = rw.c.cache.Find(uint64(rw.offset))
   664  				if !seg.Ok() {
   665  					unlock()
   666  					return done, err
   667  				}
   668  				// err might have occurred in part of gap.Range() outside
   669  				// gapMR. Forget about it for now; if the error matters and
   670  				// persists, we'll run into it again in a later iteration of
   671  				// this loop.
   672  			} else {
   673  				// Read directly from the backing file.
   674  				dst := dsts.TakeFirst64(gapMR.Length())
   675  				n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapMR.Start)
   676  				done += n
   677  				rw.offset += int64(n)
   678  				dsts = dsts.DropFirst64(n)
   679  				// Partial reads are fine. But we must stop reading.
   680  				if n != dst.NumBytes() || err != nil {
   681  					unlock()
   682  					return done, err
   683  				}
   684  
   685  				// Continue.
   686  				seg, gap = gap.NextSegment(), FileRangeGapIterator{}
   687  			}
   688  		}
   689  	}
   690  	unlock()
   691  	return done, nil
   692  }
   693  
   694  // maybeUpdateAttrs updates the file's attributes after a write. It updates
   695  // size if data has been written past the old size, and clears setuid/setgid
   696  // if any bytes were written.
   697  //
   698  // Preconditions:
   699  // * rw.c.attrMu must be locked.
   700  // * rw.c.dataMu must be locked.
   701  func (rw *inodeReadWriter) maybeUpdateAttrs(nwritten uint64) {
   702  	// If the write ends beyond the file's previous size, it causes the
   703  	// file to grow.
   704  	if rw.offset > rw.c.attr.Size {
   705  		rw.c.attr.Size = rw.offset
   706  		rw.c.dirtyAttr.Size = true
   707  	}
   708  	if rw.offset > rw.c.attr.Usage {
   709  		// This is incorrect if CachingInodeOperations is caching a sparse
   710  		// file. (In Linux, keeping inode::i_blocks up to date is the
   711  		// filesystem's responsibility.)
   712  		rw.c.attr.Usage = rw.offset
   713  		rw.c.dirtyAttr.Usage = true
   714  	}
   715  
   716  	// If bytes were written, ensure setuid and setgid are cleared.
   717  	if nwritten > 0 && rw.c.attr.Perms.HasSetUIDOrGID() {
   718  		rw.c.dirtyAttr.Perms = true
   719  		rw.c.attr.Perms.DropSetUIDAndMaybeGID()
   720  	}
   721  }
   722  
   723  // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
   724  //
   725  // Preconditions: rw.c.attrMu must be locked.
   726  func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
   727  	// Hot path. Avoid defers.
   728  	rw.c.dataMu.Lock()
   729  
   730  	// Compute the range to write.
   731  	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
   732  	if end == rw.offset { // srcs.NumBytes() == 0?
   733  		rw.c.dataMu.Unlock()
   734  		return 0, nil
   735  	}
   736  
   737  	mf := rw.c.mfp.MemoryFile()
   738  	var done uint64
   739  	seg, gap := rw.c.cache.Find(uint64(rw.offset))
   740  	for rw.offset < end {
   741  		mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
   742  		switch {
   743  		case seg.Ok() && seg.Start() < mr.End:
   744  			// Get internal mappings from the cache.
   745  			segMR := seg.Range().Intersect(mr)
   746  			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write)
   747  			if err != nil {
   748  				rw.maybeUpdateAttrs(done)
   749  				rw.c.dataMu.Unlock()
   750  				return done, err
   751  			}
   752  
   753  			// Copy to internal mappings.
   754  			n, err := safemem.CopySeq(ims, srcs)
   755  			done += n
   756  			rw.offset += int64(n)
   757  			srcs = srcs.DropFirst64(n)
   758  			rw.c.dirty.MarkDirty(segMR)
   759  			if err != nil {
   760  				rw.maybeUpdateAttrs(done)
   761  				rw.c.dataMu.Unlock()
   762  				return done, err
   763  			}
   764  
   765  			// Continue.
   766  			seg, gap = seg.NextNonEmpty()
   767  
   768  		case gap.Ok() && gap.Start() < mr.End:
   769  			// Write directly to the backing file. At present, we never fill
   770  			// the cache when writing, since doing so can convert small writes
   771  			// into inefficient read-modify-write cycles, and we have no
   772  			// mechanism for detecting or avoiding this.
   773  			gapmr := gap.Range().Intersect(mr)
   774  			src := srcs.TakeFirst64(gapmr.Length())
   775  			n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start)
   776  			done += n
   777  			rw.offset += int64(n)
   778  			srcs = srcs.DropFirst64(n)
   779  			// Partial writes are fine. But we must stop writing.
   780  			if n != src.NumBytes() || err != nil {
   781  				rw.maybeUpdateAttrs(done)
   782  				rw.c.dataMu.Unlock()
   783  				return done, err
   784  			}
   785  
   786  			// Continue.
   787  			seg, gap = gap.NextSegment(), FileRangeGapIterator{}
   788  		}
   789  	}
   790  	rw.maybeUpdateAttrs(done)
   791  	rw.c.dataMu.Unlock()
   792  	return done, nil
   793  }
   794  
   795  // useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O
   796  // and memory mappings, and false if c.cache may contain data cached from
   797  // c.backingFile.
   798  func (c *CachingInodeOperations) useHostPageCache() bool {
   799  	return !c.opts.ForcePageCache && c.backingFile.FD() >= 0
   800  }
   801  
   802  // AddMapping implements memmap.Mappable.AddMapping.
   803  func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
   804  	// Hot path. Avoid defers.
   805  	c.mapsMu.Lock()
   806  	mapped := c.mappings.AddMapping(ms, ar, offset, writable)
   807  	// Do this unconditionally since whether we have c.backingFile.FD() >= 0
   808  	// can change across save/restore.
   809  	for _, r := range mapped {
   810  		c.hostFileMapper.IncRefOn(r)
   811  	}
   812  	if !c.useHostPageCache() {
   813  		// c.Evict() will refuse to evict memory-mapped pages, so tell the
   814  		// MemoryFile to not bother trying.
   815  		mf := c.mfp.MemoryFile()
   816  		for _, r := range mapped {
   817  			mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End})
   818  		}
   819  	}
   820  	c.mapsMu.Unlock()
   821  	return nil
   822  }
   823  
   824  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   825  func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
   826  	// Hot path. Avoid defers.
   827  	c.mapsMu.Lock()
   828  	unmapped := c.mappings.RemoveMapping(ms, ar, offset, writable)
   829  	for _, r := range unmapped {
   830  		c.hostFileMapper.DecRefOn(r)
   831  	}
   832  	if c.useHostPageCache() {
   833  		c.mapsMu.Unlock()
   834  		return
   835  	}
   836  
   837  	// Pages that are no longer referenced by any application memory mappings
   838  	// are now considered unused; allow MemoryFile to evict them when
   839  	// necessary.
   840  	mf := c.mfp.MemoryFile()
   841  	c.dataMu.Lock()
   842  	for _, r := range unmapped {
   843  		// Since these pages are no longer mapped, they are no longer
   844  		// concurrently dirtyable by a writable memory mapping.
   845  		c.dirty.AllowClean(r)
   846  		mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End})
   847  	}
   848  	c.dataMu.Unlock()
   849  	c.mapsMu.Unlock()
   850  }
   851  
   852  // CopyMapping implements memmap.Mappable.CopyMapping.
   853  func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
   854  	return c.AddMapping(ctx, ms, dstAR, offset, writable)
   855  }
   856  
   857  // Translate implements memmap.Mappable.Translate.
   858  func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   859  	// Hot path. Avoid defer.
   860  	if c.useHostPageCache() {
   861  		mr := optional
   862  		if c.opts.LimitHostFDTranslation {
   863  			mr = maxFillRange(required, optional)
   864  		}
   865  		return []memmap.Translation{
   866  			{
   867  				Source: mr,
   868  				File:   c,
   869  				Offset: mr.Start,
   870  				Perms:  hostarch.AnyAccess,
   871  			},
   872  		}, nil
   873  	}
   874  
   875  	c.dataMu.Lock()
   876  
   877  	// Constrain translations to c.attr.Size (rounded up) to prevent
   878  	// translation to pages that may be concurrently truncated.
   879  	pgend := fs.OffsetPageEnd(c.attr.Size)
   880  	var beyondEOF bool
   881  	if required.End > pgend {
   882  		if required.Start >= pgend {
   883  			c.dataMu.Unlock()
   884  			return nil, &memmap.BusError{io.EOF}
   885  		}
   886  		beyondEOF = true
   887  		required.End = pgend
   888  	}
   889  	if optional.End > pgend {
   890  		optional.End = pgend
   891  	}
   892  
   893  	mf := c.mfp.MemoryFile()
   894  	cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), uint64(c.attr.Size), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
   895  
   896  	var ts []memmap.Translation
   897  	var translatedEnd uint64
   898  	for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
   899  		segMR := seg.Range().Intersect(optional)
   900  		// TODO(jamieliu): Make Translations writable even if writability is
   901  		// not required if already kept-dirty by another writable translation.
   902  		perms := hostarch.AccessType{
   903  			Read:    true,
   904  			Execute: true,
   905  		}
   906  		if at.Write {
   907  			// From this point forward, this memory can be dirtied through the
   908  			// mapping at any time.
   909  			c.dirty.KeepDirty(segMR)
   910  			perms.Write = true
   911  		}
   912  		ts = append(ts, memmap.Translation{
   913  			Source: segMR,
   914  			File:   mf,
   915  			Offset: seg.FileRangeOf(segMR).Start,
   916  			Perms:  perms,
   917  		})
   918  		translatedEnd = segMR.End
   919  	}
   920  
   921  	c.dataMu.Unlock()
   922  
   923  	// Don't return the error returned by c.cache.Fill if it occurred outside
   924  	// of required.
   925  	if translatedEnd < required.End && cerr != nil {
   926  		return ts, &memmap.BusError{cerr}
   927  	}
   928  	if beyondEOF {
   929  		return ts, &memmap.BusError{io.EOF}
   930  	}
   931  	return ts, nil
   932  }
   933  
   934  func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
   935  	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
   936  	if required.Length() >= maxReadahead {
   937  		return required
   938  	}
   939  	if optional.Length() <= maxReadahead {
   940  		return optional
   941  	}
   942  	optional.Start = required.Start
   943  	if optional.Length() <= maxReadahead {
   944  		return optional
   945  	}
   946  	optional.End = optional.Start + maxReadahead
   947  	return optional
   948  }
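
// Worked example (illustrative, not part of the original file): with
// required = [0x1000, 0x2000) and optional = [0, 0x100000), required is only
// 4KB (< 64KB) and optional is 1MB (> 64KB), so optional.Start is pulled up
// to required.Start and the range is clipped to 64KB, yielding
// [0x1000, 0x11000): 64KB of fill starting at required.Start, which still
// covers the whole required range.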
   949  
   950  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
   951  func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error {
   952  	// Whether we have a host fd (and consequently what memmap.File is
   953  	// mapped) can change across save/restore, so invalidate all translations
   954  	// unconditionally.
   955  	c.mapsMu.Lock()
   956  	defer c.mapsMu.Unlock()
   957  	c.mappings.InvalidateAll(memmap.InvalidateOpts{})
   958  
   959  	// Sync the cache's contents so that if we have a host fd after restore,
   960  	// the remote file's contents are coherent.
   961  	mf := c.mfp.MemoryFile()
   962  	c.dataMu.Lock()
   963  	defer c.dataMu.Unlock()
   964  	if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
   965  		return err
   966  	}
   967  
   968  	// Discard the cache so that it's not stored in saved state. This is safe
   969  	// because per InvalidateUnsavable invariants, no new translations can have
   970  	// been returned after we invalidated all existing translations above.
   971  	c.cache.DropAll(mf)
   972  	c.dirty.RemoveAll()
   973  
   974  	return nil
   975  }
   976  
   977  // NotifyChangeFD must be called after the file description represented by
   978  // CachedFileObject.FD() changes.
   979  func (c *CachingInodeOperations) NotifyChangeFD() error {
   980  	// Update existing sentry mappings to refer to the new file description.
   981  	if err := c.hostFileMapper.RegenerateMappings(c.backingFile.FD()); err != nil {
   982  		return err
   983  	}
   984  
   985  	// Shoot down existing application mappings of the old file description;
   986  	// they will be remapped with the new file description on demand.
   987  	c.mapsMu.Lock()
   988  	defer c.mapsMu.Unlock()
   989  
   990  	c.mappings.InvalidateAll(memmap.InvalidateOpts{})
   991  	return nil
   992  }
   993  
   994  // Evict implements pgalloc.EvictableMemoryUser.Evict.
   995  func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) {
   996  	c.mapsMu.Lock()
   997  	defer c.mapsMu.Unlock()
   998  	c.dataMu.Lock()
   999  	defer c.dataMu.Unlock()
  1000  
  1001  	mr := memmap.MappableRange{er.Start, er.End}
  1002  	mf := c.mfp.MemoryFile()
  1003  	// Only allow pages that are no longer memory-mapped to be evicted.
  1004  	for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
  1005  		mgapMR := mgap.Range().Intersect(mr)
  1006  		if mgapMR.Length() == 0 {
  1007  			continue
  1008  		}
  1009  		if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
  1010  			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
  1011  		}
  1012  		c.cache.Drop(mgapMR, mf)
  1013  		c.dirty.KeepClean(mgapMR)
  1014  	}
  1015  }
  1016  
  1017  // IncRef implements memmap.File.IncRef. This is used when we directly map an
  1018  // underlying host fd and CachingInodeOperations is used as the memmap.File
  1019  // during translation.
  1020  func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) {
  1021  	// Hot path. Avoid defers.
  1022  	c.dataMu.Lock()
  1023  	seg, gap := c.refs.Find(fr.Start)
  1024  	for {
  1025  		switch {
  1026  		case seg.Ok() && seg.Start() < fr.End:
  1027  			seg = c.refs.Isolate(seg, fr)
  1028  			seg.SetValue(seg.Value() + 1)
  1029  			seg, gap = seg.NextNonEmpty()
  1030  		case gap.Ok() && gap.Start() < fr.End:
  1031  			newRange := gap.Range().Intersect(fr)
  1032  			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
  1033  			seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
  1034  		default:
  1035  			c.refs.MergeAdjacent(fr)
  1036  			c.dataMu.Unlock()
  1037  			return
  1038  		}
  1039  	}
  1040  }
  1041  
  1042  // DecRef implements memmap.File.DecRef. This is used when we directly map an
  1043  // underlying host fd and CachingInodeOperations is used as the memmap.File
  1044  // during translation.
  1045  func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) {
  1046  	// Hot path. Avoid defers.
  1047  	c.dataMu.Lock()
  1048  	seg := c.refs.FindSegment(fr.Start)
  1049  
  1050  	for seg.Ok() && seg.Start() < fr.End {
  1051  		seg = c.refs.Isolate(seg, fr)
  1052  		if old := seg.Value(); old == 1 {
  1053  			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
  1054  			seg = c.refs.Remove(seg).NextSegment()
  1055  		} else {
  1056  			seg.SetValue(old - 1)
  1057  			seg = seg.NextSegment()
  1058  		}
  1059  	}
  1060  	c.refs.MergeAdjacent(fr)
  1061  	c.dataMu.Unlock()
  1062  }
  1063  
  1064  // MapInternal implements memmap.File.MapInternal. This is used when we
  1065  // directly map an underlying host fd and CachingInodeOperations is used as the
  1066  // memmap.File during translation.
  1067  func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
  1068  	return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write)
  1069  }
  1070  
  1071  // FD implements memmap.File.FD. This is used when we directly map an
  1072  // underlying host fd and CachingInodeOperations is used as the memmap.File
  1073  // during translation.
  1074  func (c *CachingInodeOperations) FD() int {
  1075  	return c.backingFile.FD()
  1076  }