github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/gofer/regular_file.go

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gofer
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  	"math"
    21  	"sync/atomic"
    22  
    23  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    24  	"github.com/SagerNet/gvisor/pkg/context"
    25  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    26  	"github.com/SagerNet/gvisor/pkg/hostarch"
    27  	"github.com/SagerNet/gvisor/pkg/log"
    28  	"github.com/SagerNet/gvisor/pkg/metric"
    29  	"github.com/SagerNet/gvisor/pkg/p9"
    30  	"github.com/SagerNet/gvisor/pkg/safemem"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/fsmetric"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    36  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    37  	"github.com/SagerNet/gvisor/pkg/sync"
    38  	"github.com/SagerNet/gvisor/pkg/syserror"
    39  	"github.com/SagerNet/gvisor/pkg/usermem"
    40  )
    41  
    42  func (d *dentry) isRegularFile() bool {
    43  	return d.fileType() == linux.S_IFREG
    44  }
    45  
    46  // +stateify savable
    47  type regularFileFD struct {
    48  	fileDescription
    49  
    50  	// off is the file offset. off is protected by mu.
    51  	mu  sync.Mutex `state:"nosave"`
    52  	off int64
    53  }
    54  
    55  func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) {
    56  	fd := &regularFileFD{}
    57  	fd.LockFD.Init(&d.locks)
    58  	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
    59  		AllowDirectIO: true,
    60  	}); err != nil {
    61  		return nil, err
    62  	}
    63  	if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) {
    64  		metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file")
    65  	}
    66  	if atomic.LoadInt32(&d.mmapFD) >= 0 {
    67  		fsmetric.GoferOpensHost.Increment()
    68  	} else {
    69  		fsmetric.GoferOpens9P.Increment()
    70  	}
    71  	return fd, nil
    72  }
    73  
    74  // Release implements vfs.FileDescriptionImpl.Release.
    75  func (fd *regularFileFD) Release(context.Context) {
    76  }
    77  
    78  // OnClose implements vfs.FileDescriptionImpl.OnClose.
    79  func (fd *regularFileFD) OnClose(ctx context.Context) error {
    80  	if !fd.vfsfd.IsWritable() {
    81  		return nil
    82  	}
    83  	// Skip flushing if there are client-buffered writes, since (as with the
    84  	// VFS1 client) we don't flush buffered writes on close anyway.
    85  	d := fd.dentry()
    86  	if d.fs.opts.interop != InteropModeExclusive {
    87  		return nil
    88  	}
    89  	d.dataMu.RLock()
    90  	haveDirtyPages := !d.dirty.IsEmpty()
    91  	d.dataMu.RUnlock()
    92  	if haveDirtyPages {
    93  		return nil
    94  	}
    95  	d.handleMu.RLock()
    96  	defer d.handleMu.RUnlock()
    97  	if d.writeFile.isNil() {
    98  		return nil
    99  	}
   100  	return d.writeFile.flush(ctx)
   101  }
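// Restated for clarity: the method above issues a flush only when all of the
// following hold (a sketch with locking elided; shouldFlush is not a variable
// used in this file):
//
//	shouldFlush := fd.vfsfd.IsWritable() &&
//		d.fs.opts.interop == InteropModeExclusive &&
//		d.dirty.IsEmpty() &&
//		!d.writeFile.isNil()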
   102  
   103  // Allocate implements vfs.FileDescriptionImpl.Allocate.
   104  func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
   105  	d := fd.dentry()
   106  	return d.doAllocate(ctx, offset, length, func() error {
   107  		d.handleMu.RLock()
   108  		defer d.handleMu.RUnlock()
   109  		return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
   110  	})
   111  }
   112  
   113  // PRead implements vfs.FileDescriptionImpl.PRead.
   114  func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   115  	start := fsmetric.StartReadWait()
   116  	d := fd.dentry()
   117  	defer func() {
   118  		if atomic.LoadInt32(&d.readFD) >= 0 {
   119  			fsmetric.GoferReadsHost.Increment()
   120  			fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start)
   121  		} else {
   122  			fsmetric.GoferReads9P.Increment()
   123  			fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start)
   124  		}
   125  	}()
   126  
   127  	if offset < 0 {
   128  		return 0, linuxerr.EINVAL
   129  	}
   130  
   131  	// Check that flags are supported.
   132  	//
   133  	// TODO(github.com/SagerNet/issue/2601): Support select preadv2 flags.
   134  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   135  		return 0, syserror.EOPNOTSUPP
   136  	}
   137  
   138  	// Check for reading at EOF before calling into MM (but not under
   139  	// InteropModeShared, which makes d.size unreliable).
   140  	if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) {
   141  		return 0, io.EOF
   142  	}
   143  
   144  	var (
   145  		n       int64
   146  		readErr error
   147  	)
   148  	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
   149  		// Lock d.metadataMu for the rest of the read to prevent d.size from
   150  		// changing.
   151  		d.metadataMu.Lock()
   152  		defer d.metadataMu.Unlock()
   153  		// Write dirty cached pages that will be touched by the read back to
   154  		// the remote file.
   155  		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
   156  			return 0, err
   157  		}
   158  		rw := getDentryReadWriter(ctx, d, offset)
   159  		// Require the read to go to the remote file.
   160  		rw.direct = true
   161  		n, readErr = dst.CopyOutFrom(ctx, rw)
   162  		putDentryReadWriter(rw)
   163  		if d.fs.opts.interop != InteropModeShared {
   164  			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
   165  			d.touchAtimeLocked(fd.vfsfd.Mount())
   166  		}
   167  	} else {
   168  		rw := getDentryReadWriter(ctx, d, offset)
   169  		n, readErr = dst.CopyOutFrom(ctx, rw)
   170  		putDentryReadWriter(rw)
   171  		if d.fs.opts.interop != InteropModeShared {
   172  			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
   173  			d.touchAtime(fd.vfsfd.Mount())
   174  		}
   175  	}
   176  	return n, readErr
   177  }
   178  
   179  // Read implements vfs.FileDescriptionImpl.Read.
   180  func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   181  	fd.mu.Lock()
   182  	n, err := fd.PRead(ctx, dst, fd.off, opts)
   183  	fd.off += n
   184  	fd.mu.Unlock()
   185  	return n, err
   186  }
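// For illustration, how the two read entry points differ for a caller (a
// sketch; ctx and fd are assumed to exist, and usermem.BytesIOSequence is
// just a convenient way to build an IOSequence):
//
//	dst := usermem.BytesIOSequence(make([]byte, 4096))
//	n1, _ := fd.Read(ctx, dst, vfs.ReadOptions{})     // reads at fd.off, then advances fd.off by n1
//	n2, _ := fd.PRead(ctx, dst, 0, vfs.ReadOptions{}) // reads at offset 0, leaves fd.off unchanged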
   187  
   188  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   189  func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   190  	n, _, err := fd.pwrite(ctx, src, offset, opts)
   191  	return n, err
   192  }
   193  
    194  // pwrite returns the number of bytes written, the final offset, and an
    195  // error. The final offset should be ignored by PWrite.
   196  func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
   197  	if offset < 0 {
   198  		return 0, offset, linuxerr.EINVAL
   199  	}
   200  
   201  	// Check that flags are supported.
   202  	//
   203  	// TODO(github.com/SagerNet/issue/2601): Support select pwritev2 flags.
   204  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   205  		return 0, offset, syserror.EOPNOTSUPP
   206  	}
   207  
   208  	d := fd.dentry()
   209  
   210  	d.metadataMu.Lock()
   211  	defer d.metadataMu.Unlock()
   212  
    213  	// If the fd was opened with O_APPEND, make sure the file size is updated.
    214  	// There is a possible race here if the size is modified externally after
    215  	// the metadata cache is updated.
   216  	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
   217  		if err := d.refreshSizeLocked(ctx); err != nil {
   218  			return 0, offset, err
   219  		}
   220  	}
   221  
   222  	// Set offset to file size if the fd was opened with O_APPEND.
   223  	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
   224  		// Holding d.metadataMu is sufficient for reading d.size.
   225  		offset = int64(d.size)
   226  	}
   227  	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
   228  	if err != nil {
   229  		return 0, offset, err
   230  	}
   231  	src = src.TakeFirst64(limit)
   232  
   233  	if d.fs.opts.interop != InteropModeShared {
   234  		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
   235  		// file_update_time(). This is d.touchCMtime(), but without locking
   236  		// d.metadataMu (recursively).
   237  		d.touchCMtimeLocked()
   238  	}
   239  
   240  	rw := getDentryReadWriter(ctx, d, offset)
   241  	defer putDentryReadWriter(rw)
   242  
   243  	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
   244  		if err := fd.writeCache(ctx, d, offset, src); err != nil {
   245  			return 0, offset, err
   246  		}
   247  
   248  		// Require the write to go to the remote file.
   249  		rw.direct = true
   250  	}
   251  
   252  	n, err := src.CopyInTo(ctx, rw)
   253  	if err != nil {
   254  		return n, offset + n, err
   255  	}
   256  	if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
   257  		// Note that if any of the following fail, then we can't guarantee that
   258  		// any data was actually written with the semantics of O_DSYNC or
   259  		// O_SYNC, so we return zero bytes written. Compare Linux's
   260  		// mm/filemap.c:generic_file_write_iter() =>
   261  		// include/linux/fs.h:generic_write_sync().
   262  		//
   263  		// Write dirty cached pages touched by the write back to the remote
   264  		// file.
   265  		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
   266  			return 0, offset, err
   267  		}
   268  		// Request the remote filesystem to sync the remote file.
   269  		if err := d.syncRemoteFile(ctx); err != nil {
   270  			return 0, offset, err
   271  		}
   272  	}
   273  
   274  	// As with Linux, writing clears the setuid and setgid bits.
   275  	if n > 0 {
   276  		oldMode := atomic.LoadUint32(&d.mode)
   277  		// If setuid or setgid were set, update d.mode and propagate
   278  		// changes to the host.
   279  		if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode {
   280  			atomic.StoreUint32(&d.mode, newMode)
   281  			if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil {
   282  				return 0, offset, err
   283  			}
   284  		}
   285  	}
   286  
   287  	return n, offset + n, nil
   288  }
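// Worked example of the final-offset contract above (values assumed): with
// O_APPEND set and d.size == 100, a 10-byte pwrite ignores the caller's
// offset, writes at 100, and returns (10, 110, nil). Write stores the 110 in
// fd.off; PWrite discards it, as documented on pwrite.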
   289  
   290  func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error {
   291  	// Write dirty cached pages that will be touched by the write back to
   292  	// the remote file.
   293  	if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
   294  		return err
   295  	}
   296  
   297  	// Remove touched pages from the cache.
   298  	pgstart := hostarch.PageRoundDown(uint64(offset))
   299  	pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes()))
   300  	if !ok {
   301  		return linuxerr.EINVAL
   302  	}
   303  	mr := memmap.MappableRange{pgstart, pgend}
   304  	var freed []memmap.FileRange
   305  
   306  	d.dataMu.Lock()
   307  	cseg := d.cache.LowerBoundSegment(mr.Start)
   308  	for cseg.Ok() && cseg.Start() < mr.End {
   309  		cseg = d.cache.Isolate(cseg, mr)
   310  		freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
   311  		cseg = d.cache.Remove(cseg).NextSegment()
   312  	}
   313  	d.dataMu.Unlock()
   314  
   315  	// Invalidate mappings of removed pages.
   316  	d.mapsMu.Lock()
   317  	d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
   318  	d.mapsMu.Unlock()
   319  
   320  	// Finally free pages removed from the cache.
   321  	mf := d.fs.mfp.MemoryFile()
   322  	for _, freedFR := range freed {
   323  		mf.DecRef(freedFR)
   324  	}
   325  	return nil
   326  }
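// Worked example of the ranges above (assuming 4 KiB pages): a direct write
// of 200 bytes at offset 100 yields pgstart = 0 and pgend = 4096, so the
// entire first page is removed from the cache and unmapped, even though the
// write itself only touches bytes [100, 300).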
   327  
   328  // Write implements vfs.FileDescriptionImpl.Write.
   329  func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   330  	fd.mu.Lock()
   331  	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
   332  	fd.off = off
   333  	fd.mu.Unlock()
   334  	return n, err
   335  }
   336  
   337  type dentryReadWriter struct {
   338  	ctx    context.Context
   339  	d      *dentry
   340  	off    uint64
   341  	direct bool
   342  }
   343  
   344  var dentryReadWriterPool = sync.Pool{
   345  	New: func() interface{} {
   346  		return &dentryReadWriter{}
   347  	},
   348  }
   349  
   350  func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
   351  	rw := dentryReadWriterPool.Get().(*dentryReadWriter)
   352  	rw.ctx = ctx
   353  	rw.d = d
   354  	rw.off = uint64(offset)
   355  	rw.direct = false
   356  	return rw
   357  }
   358  
   359  func putDentryReadWriter(rw *dentryReadWriter) {
   360  	rw.ctx = nil
   361  	rw.d = nil
   362  	dentryReadWriterPool.Put(rw)
   363  }
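// The pooled reader/writer is always used in a short get/use/put sequence, as
// in PRead and pwrite above (sketch, error handling elided):
//
//	rw := getDentryReadWriter(ctx, d, offset)
//	n, err := dst.CopyOutFrom(ctx, rw) // reads; writes use src.CopyInTo(ctx, rw)
//	putDentryReadWriter(rw)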
   364  
   365  // ReadToBlocks implements safemem.Reader.ReadToBlocks.
   366  func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
   367  	if dsts.IsEmpty() {
   368  		return 0, nil
   369  	}
   370  
   371  	// If we have a mmappable host FD (which must be used here to ensure
   372  	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
   373  	// (which prevents us from caching file contents and makes dentry.size
   374  	// unreliable), or if the file was opened O_DIRECT, read directly from
   375  	// dentry.readHandleLocked() without locking dentry.dataMu.
   376  	rw.d.handleMu.RLock()
   377  	h := rw.d.readHandleLocked()
   378  	if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
   379  		n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
   380  		rw.d.handleMu.RUnlock()
   381  		rw.off += n
   382  		return n, err
   383  	}
   384  
   385  	// Otherwise read from/through the cache.
   386  	mf := rw.d.fs.mfp.MemoryFile()
   387  	fillCache := mf.ShouldCacheEvictable()
   388  	var dataMuUnlock func()
   389  	if fillCache {
   390  		rw.d.dataMu.Lock()
   391  		dataMuUnlock = rw.d.dataMu.Unlock
   392  	} else {
   393  		rw.d.dataMu.RLock()
   394  		dataMuUnlock = rw.d.dataMu.RUnlock
   395  	}
   396  
   397  	// Compute the range to read (limited by file size and overflow-checked).
   398  	if rw.off >= rw.d.size {
   399  		dataMuUnlock()
   400  		rw.d.handleMu.RUnlock()
   401  		return 0, io.EOF
   402  	}
   403  	end := rw.d.size
   404  	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
   405  		end = rend
   406  	}
   407  
   408  	var done uint64
   409  	seg, gap := rw.d.cache.Find(rw.off)
   410  	for rw.off < end {
   411  		mr := memmap.MappableRange{rw.off, end}
   412  		switch {
   413  		case seg.Ok():
   414  			// Get internal mappings from the cache.
   415  			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
   416  			if err != nil {
   417  				dataMuUnlock()
   418  				rw.d.handleMu.RUnlock()
   419  				return done, err
   420  			}
   421  
   422  			// Copy from internal mappings.
   423  			n, err := safemem.CopySeq(dsts, ims)
   424  			done += n
   425  			rw.off += n
   426  			dsts = dsts.DropFirst64(n)
   427  			if err != nil {
   428  				dataMuUnlock()
   429  				rw.d.handleMu.RUnlock()
   430  				return done, err
   431  			}
   432  
   433  			// Continue.
   434  			seg, gap = seg.NextNonEmpty()
   435  
   436  		case gap.Ok():
   437  			gapMR := gap.Range().Intersect(mr)
   438  			if fillCache {
   439  				// Read into the cache, then re-enter the loop to read from the
   440  				// cache.
   441  				gapEnd, _ := hostarch.PageRoundUp(gapMR.End)
   442  				reqMR := memmap.MappableRange{
   443  					Start: hostarch.PageRoundDown(gapMR.Start),
   444  					End:   gapEnd,
   445  				}
   446  				optMR := gap.Range()
   447  				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt)
   448  				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
   449  				seg, gap = rw.d.cache.Find(rw.off)
   450  				if !seg.Ok() {
   451  					dataMuUnlock()
   452  					rw.d.handleMu.RUnlock()
   453  					return done, err
   454  				}
   455  				// err might have occurred in part of gap.Range() outside gapMR
   456  				// (in particular, gap.End() might be beyond EOF). Forget about
   457  				// it for now; if the error matters and persists, we'll run
   458  				// into it again in a later iteration of this loop.
   459  			} else {
   460  				// Read directly from the file.
   461  				gapDsts := dsts.TakeFirst64(gapMR.Length())
   462  				n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
   463  				done += n
   464  				rw.off += n
   465  				dsts = dsts.DropFirst64(n)
   466  				// Partial reads are fine. But we must stop reading.
   467  				if n != gapDsts.NumBytes() || err != nil {
   468  					dataMuUnlock()
   469  					rw.d.handleMu.RUnlock()
   470  					return done, err
   471  				}
   472  
   473  				// Continue.
   474  				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
   475  			}
   476  		}
   477  	}
   478  	dataMuUnlock()
   479  	rw.d.handleMu.RUnlock()
   480  	return done, nil
   481  }
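// The routing condition at the top of ReadToBlocks (and of WriteFromBlocks
// below) can be read as a predicate like the following (hypothetical helper;
// the real code inlines it):
//
//	func (rw *dentryReadWriter) bypassesCache() bool {
//		return (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) ||
//			rw.d.fs.opts.interop == InteropModeShared ||
//			rw.direct
//	}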
   482  
   483  // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
   484  //
   485  // Preconditions: rw.d.metadataMu must be locked.
   486  func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
   487  	if srcs.IsEmpty() {
   488  		return 0, nil
   489  	}
   490  
   491  	// If we have a mmappable host FD (which must be used here to ensure
   492  	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
   493  	// (which prevents us from caching file contents), or if the file was
   494  	// opened with O_DIRECT, write directly to dentry.writeHandleLocked()
   495  	// without locking dentry.dataMu.
   496  	rw.d.handleMu.RLock()
   497  	h := rw.d.writeHandleLocked()
   498  	if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
   499  		n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
   500  		rw.off += n
   501  		rw.d.dataMu.Lock()
   502  		if rw.off > rw.d.size {
   503  			atomic.StoreUint64(&rw.d.size, rw.off)
   504  			// The remote file's size will implicitly be extended to the correct
   505  			// value when we write back to it.
   506  		}
   507  		rw.d.dataMu.Unlock()
   508  		rw.d.handleMu.RUnlock()
   509  		return n, err
   510  	}
   511  
   512  	// Otherwise write to/through the cache.
   513  	mf := rw.d.fs.mfp.MemoryFile()
   514  	rw.d.dataMu.Lock()
   515  
   516  	// Compute the range to write (overflow-checked).
   517  	start := rw.off
   518  	end := rw.off + srcs.NumBytes()
   519  	if end <= rw.off {
   520  		end = math.MaxInt64
   521  	}
   522  
   523  	var (
   524  		done   uint64
   525  		retErr error
   526  	)
   527  	seg, gap := rw.d.cache.Find(rw.off)
   528  	for rw.off < end {
   529  		mr := memmap.MappableRange{rw.off, end}
   530  		switch {
   531  		case seg.Ok():
   532  			// Get internal mappings from the cache.
   533  			segMR := seg.Range().Intersect(mr)
   534  			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write)
   535  			if err != nil {
   536  				retErr = err
   537  				goto exitLoop
   538  			}
   539  
   540  			// Copy to internal mappings.
   541  			n, err := safemem.CopySeq(ims, srcs)
   542  			done += n
   543  			rw.off += n
   544  			srcs = srcs.DropFirst64(n)
   545  			rw.d.dirty.MarkDirty(segMR)
   546  			if err != nil {
   547  				retErr = err
   548  				goto exitLoop
   549  			}
   550  
   551  			// Continue.
   552  			seg, gap = seg.NextNonEmpty()
   553  
   554  		case gap.Ok():
   555  			// Write directly to the file. At present, we never fill the cache
   556  			// when writing, since doing so can convert small writes into
   557  			// inefficient read-modify-write cycles, and we have no mechanism
   558  			// for detecting or avoiding this.
   559  			gapMR := gap.Range().Intersect(mr)
   560  			gapSrcs := srcs.TakeFirst64(gapMR.Length())
   561  			n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
   562  			done += n
   563  			rw.off += n
   564  			srcs = srcs.DropFirst64(n)
   565  			// Partial writes are fine. But we must stop writing.
   566  			if n != gapSrcs.NumBytes() || err != nil {
   567  				retErr = err
   568  				goto exitLoop
   569  			}
   570  
   571  			// Continue.
   572  			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
   573  		}
   574  	}
   575  exitLoop:
   576  	if rw.off > rw.d.size {
   577  		atomic.StoreUint64(&rw.d.size, rw.off)
   578  		// The remote file's size will implicitly be extended to the correct
   579  		// value when we write back to it.
   580  	}
   581  	// If InteropModeWritethrough is in effect, flush written data back to the
   582  	// remote filesystem.
   583  	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
   584  		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
   585  			Start: start,
   586  			End:   rw.off,
   587  		}, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil {
   588  			// We have no idea how many bytes were actually flushed.
   589  			rw.off = start
   590  			done = 0
   591  			retErr = err
   592  		}
   593  	}
   594  	rw.d.dataMu.Unlock()
   595  	rw.d.handleMu.RUnlock()
   596  	return done, retErr
   597  }
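// Worked example of the writethrough tail above (values assumed): under
// InteropModeWritethrough, a cached write of 8192 bytes starting at offset 0
// is immediately followed by SyncDirty over [0, 8192); if that sync fails,
// the write reports 0 bytes written, since there is no way to tell how much
// of the data actually reached the remote file.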
   598  
   599  func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
   600  	if size == 0 {
   601  		return nil
   602  	}
   603  	d.handleMu.RLock()
   604  	defer d.handleMu.RUnlock()
   605  	h := d.writeHandleLocked()
   606  	d.dataMu.Lock()
   607  	defer d.dataMu.Unlock()
   608  	// Compute the range of valid bytes (overflow-checked).
   609  	if uint64(offset) >= d.size {
   610  		return nil
   611  	}
   612  	end := int64(d.size)
   613  	if rend := offset + size; rend > offset && rend < end {
   614  		end = rend
   615  	}
   616  	return fsutil.SyncDirty(ctx, memmap.MappableRange{
   617  		Start: uint64(offset),
   618  		End:   uint64(end),
   619  	}, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
   620  }
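// Worked example of the clamping above (values assumed): with d.size == 6000,
// writeback(ctx, 4096, 100000) syncs dirty cached data in [4096, 6000) only,
// and a writeback at any offset at or beyond 6000 returns immediately.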
   621  
   622  // Seek implements vfs.FileDescriptionImpl.Seek.
   623  func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   624  	fd.mu.Lock()
   625  	defer fd.mu.Unlock()
   626  	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
   627  	if err != nil {
   628  		return 0, err
   629  	}
   630  	fd.off = newOffset
   631  	return newOffset, nil
   632  }
   633  
    634  // regularFileSeekLocked calculates the new offset for a seek operation on a regular file.
   635  func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
   636  	switch whence {
   637  	case linux.SEEK_SET:
   638  		// Use offset as specified.
   639  	case linux.SEEK_CUR:
   640  		offset += fdOffset
   641  	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
   642  		// Ensure file size is up to date.
   643  		if !d.cachedMetadataAuthoritative() {
   644  			if err := d.updateFromGetattr(ctx); err != nil {
   645  				return 0, err
   646  			}
   647  		}
   648  		size := int64(atomic.LoadUint64(&d.size))
   649  		// For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous
   650  		// block of data.
   651  		switch whence {
   652  		case linux.SEEK_END:
   653  			offset += size
   654  		case linux.SEEK_DATA:
   655  			if offset > size {
   656  				return 0, linuxerr.ENXIO
   657  			}
   658  			// Use offset as specified.
   659  		case linux.SEEK_HOLE:
   660  			if offset > size {
   661  				return 0, linuxerr.ENXIO
   662  			}
   663  			offset = size
   664  		}
   665  	default:
   666  		return 0, linuxerr.EINVAL
   667  	}
   668  	if offset < 0 {
   669  		return 0, linuxerr.EINVAL
   670  	}
   671  	return offset, nil
   672  }
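// Worked examples of the SEEK_DATA/SEEK_HOLE behavior above (file size 1000
// assumed): SEEK_DATA at offset 200 returns 200, SEEK_HOLE at offset 200
// returns 1000 (the whole file is treated as data, with the only hole at
// EOF), and either whence at offset 1200 fails with ENXIO. SEEK_END with
// offset -100 returns 900.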
   673  
   674  // Sync implements vfs.FileDescriptionImpl.Sync.
   675  func (fd *regularFileFD) Sync(ctx context.Context) error {
   676  	return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */)
   677  }
   678  
   679  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
   680  func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
   681  	d := fd.dentry()
   682  	// Force sentry page caching at your own risk.
   683  	if !d.fs.opts.forcePageCache {
   684  		switch d.fs.opts.interop {
   685  		case InteropModeExclusive:
   686  			// Any mapping is fine.
   687  		case InteropModeWritethrough:
   688  			// Shared writable mappings require a host FD, since otherwise we
   689  			// can't synchronously flush memory-mapped writes to the remote
   690  			// file.
   691  			if opts.Private || !opts.MaxPerms.Write {
   692  				break
   693  			}
   694  			fallthrough
   695  		case InteropModeShared:
   696  			// All mappings require a host FD to be coherent with other
   697  			// filesystem users.
   698  			if atomic.LoadInt32(&d.mmapFD) < 0 {
   699  				return linuxerr.ENODEV
   700  			}
   701  		default:
   702  			panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
   703  		}
   704  	}
   705  	// After this point, d may be used as a memmap.Mappable.
   706  	d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init)
   707  	opts.SentryOwnedContent = d.fs.opts.forcePageCache
   708  	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
   709  }
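// For example, with a gofer mount in InteropModeWritethrough and no host FD
// available (d.mmapFD < 0, forcePageCache unset), an application's
// MAP_SHARED, PROT_READ|PROT_WRITE mmap of this file fails with ENODEV, while
// a MAP_PRIVATE mapping of the same file succeeds; under InteropModeShared
// both would fail, and under InteropModeExclusive both would succeed.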
   710  
   711  func (d *dentry) mayCachePages() bool {
   712  	if d.fs.opts.forcePageCache {
   713  		return true
   714  	}
   715  	if d.fs.opts.interop == InteropModeShared {
   716  		return false
   717  	}
   718  	return atomic.LoadInt32(&d.mmapFD) >= 0
   719  }
   720  
   721  // AddMapping implements memmap.Mappable.AddMapping.
   722  func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
   723  	d.mapsMu.Lock()
   724  	mapped := d.mappings.AddMapping(ms, ar, offset, writable)
   725  	// Do this unconditionally since whether we have a host FD can change
   726  	// across save/restore.
   727  	for _, r := range mapped {
   728  		d.pf.hostFileMapper.IncRefOn(r)
   729  	}
   730  	if d.mayCachePages() {
   731  		// d.Evict() will refuse to evict memory-mapped pages, so tell the
   732  		// MemoryFile to not bother trying.
   733  		mf := d.fs.mfp.MemoryFile()
   734  		for _, r := range mapped {
   735  			mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End})
   736  		}
   737  	}
   738  	d.mapsMu.Unlock()
   739  	return nil
   740  }
   741  
   742  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   743  func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
   744  	d.mapsMu.Lock()
   745  	unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable)
   746  	for _, r := range unmapped {
   747  		d.pf.hostFileMapper.DecRefOn(r)
   748  	}
   749  	if d.mayCachePages() {
   750  		// Pages that are no longer referenced by any application memory
   751  		// mappings are now considered unused; allow MemoryFile to evict them
   752  		// when necessary.
   753  		mf := d.fs.mfp.MemoryFile()
   754  		d.dataMu.Lock()
   755  		for _, r := range unmapped {
   756  			// Since these pages are no longer mapped, they are no longer
   757  			// concurrently dirtyable by a writable memory mapping.
   758  			d.dirty.AllowClean(r)
   759  			mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End})
   760  		}
   761  		d.dataMu.Unlock()
   762  	}
   763  	d.mapsMu.Unlock()
   764  }
   765  
   766  // CopyMapping implements memmap.Mappable.CopyMapping.
   767  func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
   768  	return d.AddMapping(ctx, ms, dstAR, offset, writable)
   769  }
   770  
   771  // Translate implements memmap.Mappable.Translate.
   772  func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   773  	d.handleMu.RLock()
   774  	if d.mmapFD >= 0 && !d.fs.opts.forcePageCache {
   775  		d.handleMu.RUnlock()
   776  		mr := optional
   777  		if d.fs.opts.limitHostFDTranslation {
   778  			mr = maxFillRange(required, optional)
   779  		}
   780  		return []memmap.Translation{
   781  			{
   782  				Source: mr,
   783  				File:   &d.pf,
   784  				Offset: mr.Start,
   785  				Perms:  hostarch.AnyAccess,
   786  			},
   787  		}, nil
   788  	}
   789  
   790  	d.dataMu.Lock()
   791  
   792  	// Constrain translations to d.size (rounded up) to prevent translation to
   793  	// pages that may be concurrently truncated.
   794  	pgend, _ := hostarch.PageRoundUp(d.size)
   795  	var beyondEOF bool
   796  	if required.End > pgend {
   797  		if required.Start >= pgend {
   798  			d.dataMu.Unlock()
   799  			d.handleMu.RUnlock()
   800  			return nil, &memmap.BusError{io.EOF}
   801  		}
   802  		beyondEOF = true
   803  		required.End = pgend
   804  	}
   805  	if optional.End > pgend {
   806  		optional.End = pgend
   807  	}
   808  
   809  	mf := d.fs.mfp.MemoryFile()
   810  	h := d.readHandleLocked()
   811  	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt)
   812  
   813  	var ts []memmap.Translation
   814  	var translatedEnd uint64
   815  	for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
   816  		segMR := seg.Range().Intersect(optional)
    817  		// TODO(jamieliu): Make Translations writable even when writability is
    818  		// not required, if already kept-dirty by another writable translation.
   819  		perms := hostarch.AccessType{
   820  			Read:    true,
   821  			Execute: true,
   822  		}
   823  		if at.Write {
   824  			// From this point forward, this memory can be dirtied through the
   825  			// mapping at any time.
   826  			d.dirty.KeepDirty(segMR)
   827  			perms.Write = true
   828  		}
   829  		ts = append(ts, memmap.Translation{
   830  			Source: segMR,
   831  			File:   mf,
   832  			Offset: seg.FileRangeOf(segMR).Start,
   833  			Perms:  perms,
   834  		})
   835  		translatedEnd = segMR.End
   836  	}
   837  
   838  	d.dataMu.Unlock()
   839  	d.handleMu.RUnlock()
   840  
    841  	// Don't return the error returned by d.cache.Fill if it occurred outside
    842  	// of required.
   843  	if translatedEnd < required.End && cerr != nil {
   844  		return ts, &memmap.BusError{cerr}
   845  	}
   846  	if beyondEOF {
   847  		return ts, &memmap.BusError{io.EOF}
   848  	}
   849  	return ts, nil
   850  }
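// Worked example of the EOF clamping above (assuming 4 KiB pages and
// d.size == 5000, so pgend == 8192): a Translate call with required =
// [0, 16384) is clamped to [0, 8192) and beyondEOF is set, so the returned
// translations cover at most the first two pages and are accompanied by a
// BusError wrapping io.EOF, which the MM layer surfaces as SIGBUS for
// accesses past EOF.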
   851  
   852  func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
   853  	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
   854  	if required.Length() >= maxReadahead {
   855  		return required
   856  	}
   857  	if optional.Length() <= maxReadahead {
   858  		return optional
   859  	}
   860  	optional.Start = required.Start
   861  	if optional.Length() <= maxReadahead {
   862  		return optional
   863  	}
   864  	optional.End = optional.Start + maxReadahead
   865  	return optional
   866  }
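// Worked example of the readahead clamp above: with required =
// [0x100000, 0x101000) (one 4 KiB page) and optional = [0, 1 GiB),
// maxFillRange moves optional.Start up to 0x100000 and clamps optional.End to
// 0x100000 + 64 KiB, so at most 64 KiB is read ahead on behalf of a small
// request.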
   867  
   868  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
   869  func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
   870  	// Whether we have a host fd (and consequently what memmap.File is
   871  	// mapped) can change across save/restore, so invalidate all translations
   872  	// unconditionally.
   873  	d.mapsMu.Lock()
   874  	defer d.mapsMu.Unlock()
   875  	d.mappings.InvalidateAll(memmap.InvalidateOpts{})
   876  
   877  	// Write the cache's contents back to the remote file so that if we have a
   878  	// host fd after restore, the remote file's contents are coherent.
   879  	mf := d.fs.mfp.MemoryFile()
   880  	d.handleMu.RLock()
   881  	defer d.handleMu.RUnlock()
   882  	h := d.writeHandleLocked()
   883  	d.dataMu.Lock()
   884  	defer d.dataMu.Unlock()
   885  	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
   886  		return err
   887  	}
   888  
   889  	// Discard the cache so that it's not stored in saved state. This is safe
   890  	// because per InvalidateUnsavable invariants, no new translations can have
   891  	// been returned after we invalidated all existing translations above.
   892  	d.cache.DropAll(mf)
   893  	d.dirty.RemoveAll()
   894  
   895  	return nil
   896  }
   897  
   898  // Evict implements pgalloc.EvictableMemoryUser.Evict.
   899  func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
   900  	mr := memmap.MappableRange{er.Start, er.End}
   901  	mf := d.fs.mfp.MemoryFile()
   902  	d.mapsMu.Lock()
   903  	defer d.mapsMu.Unlock()
   904  	d.handleMu.RLock()
   905  	defer d.handleMu.RUnlock()
   906  	h := d.writeHandleLocked()
   907  	d.dataMu.Lock()
   908  	defer d.dataMu.Unlock()
   909  
   910  	// Only allow pages that are no longer memory-mapped to be evicted.
   911  	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
   912  		mgapMR := mgap.Range().Intersect(mr)
   913  		if mgapMR.Length() == 0 {
   914  			continue
   915  		}
   916  		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
   917  			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
   918  		}
   919  		d.cache.Drop(mgapMR, mf)
   920  		d.dirty.KeepClean(mgapMR)
   921  	}
   922  }
   923  
   924  // dentryPlatformFile implements memmap.File. It exists solely because dentry
   925  // cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
   926  //
   927  // dentryPlatformFile is only used when a host FD representing the remote file
   928  // is available (i.e. dentry.mmapFD >= 0), and that FD is used for application
   929  // memory mappings (i.e. !filesystem.opts.forcePageCache).
   930  //
   931  // +stateify savable
   932  type dentryPlatformFile struct {
   933  	*dentry
   934  
   935  	// fdRefs counts references on memmap.File offsets. fdRefs is protected
   936  	// by dentry.dataMu.
   937  	fdRefs fsutil.FrameRefSet
   938  
   939  	// If this dentry represents a regular file, and dentry.mmapFD >= 0,
   940  	// hostFileMapper caches mappings of dentry.mmapFD.
   941  	hostFileMapper fsutil.HostFileMapper
   942  
   943  	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
   944  	hostFileMapperInitOnce sync.Once `state:"nosave"`
   945  }
   946  
   947  // IncRef implements memmap.File.IncRef.
   948  func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) {
   949  	d.dataMu.Lock()
   950  	d.fdRefs.IncRefAndAccount(fr)
   951  	d.dataMu.Unlock()
   952  }
   953  
   954  // DecRef implements memmap.File.DecRef.
   955  func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
   956  	d.dataMu.Lock()
   957  	d.fdRefs.DecRefAndAccount(fr)
   958  	d.dataMu.Unlock()
   959  }
   960  
   961  // MapInternal implements memmap.File.MapInternal.
   962  func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
   963  	d.handleMu.RLock()
   964  	defer d.handleMu.RUnlock()
   965  	return d.hostFileMapper.MapInternal(fr, int(d.mmapFD), at.Write)
   966  }
   967  
   968  // FD implements memmap.File.FD.
   969  func (d *dentryPlatformFile) FD() int {
   970  	d.handleMu.RLock()
   971  	defer d.handleMu.RUnlock()
   972  	return int(d.mmapFD)
   973  }