gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/gofer/regular_file.go

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gofer
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  	"math"
    21  
    22  	"gvisor.dev/gvisor/pkg/abi/linux"
    23  	"gvisor.dev/gvisor/pkg/context"
    24  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    25  	"gvisor.dev/gvisor/pkg/hostarch"
    26  	"gvisor.dev/gvisor/pkg/log"
    27  	"gvisor.dev/gvisor/pkg/metric"
    28  	"gvisor.dev/gvisor/pkg/safemem"
    29  	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
    30  	"gvisor.dev/gvisor/pkg/sentry/fsutil"
    31  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    32  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    33  	"gvisor.dev/gvisor/pkg/sentry/usage"
    34  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    35  	"gvisor.dev/gvisor/pkg/sync"
    36  	"gvisor.dev/gvisor/pkg/usermem"
    37  )
    38  
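        // isRegularFile returns true if d represents a regular file.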
    39  func (d *dentry) isRegularFile() bool {
    40  	return d.fileType() == linux.S_IFREG
    41  }
    42  
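        // regularFileFD implements vfs.FileDescriptionImpl for regular files.
        //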
    43  // +stateify savable
    44  type regularFileFD struct {
    45  	fileDescription
    46  
    47  	// off is the file offset. off is protected by mu.
    48  	mu  sync.Mutex `state:"nosave"`
    49  	off int64
    50  }
    51  
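        // newRegularFileFD returns a regularFileFD for d whose vfs.FileDescription
        // is initialized on mnt with the given flags.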
    52  func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) {
    53  	fd := &regularFileFD{}
    54  	fd.LockFD.Init(&d.locks)
    55  	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
    56  		AllowDirectIO: true,
    57  	}); err != nil {
    58  		return nil, err
    59  	}
    60  	if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) {
    61  		metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile)
    62  	}
    63  	if d.mmapFD.Load() >= 0 {
    64  		fsmetric.GoferOpensHost.Increment()
    65  	} else {
    66  		fsmetric.GoferOpens9P.Increment()
    67  	}
    68  	return fd, nil
    69  }
    70  
    71  // Release implements vfs.FileDescriptionImpl.Release.
    72  func (fd *regularFileFD) Release(context.Context) {
    73  }
    74  
    75  // OnClose implements vfs.FileDescriptionImpl.OnClose.
    76  func (fd *regularFileFD) OnClose(ctx context.Context) error {
    77  	if !fd.vfsfd.IsWritable() {
    78  		return nil
    79  	}
    80  	d := fd.dentry()
    81  	if d.fs.opts.interop == InteropModeExclusive {
    82  		// d may have dirty pages that we won't write back now (and wouldn't
    83  		// have in VFS1), making a flushf RPC ineffective. If this is the case,
    84  		// skip the flushf.
    85  		//
    86  		// Note that it's also possible to have dirty pages under other interop
    87  		// modes if forcePageCache is in effect; we conservatively assume that
    88  		// applications have some way of tolerating this and still want the
    89  		// flushf.
    90  		d.dataMu.RLock()
    91  		haveDirtyPages := !d.dirty.IsEmpty()
    92  		d.dataMu.RUnlock()
    93  		if haveDirtyPages {
    94  			return nil
    95  		}
    96  	}
    97  	return d.flush(ctx)
    98  }
    99  
   100  // Allocate implements vfs.FileDescriptionImpl.Allocate.
   101  func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
   102  	d := fd.dentry()
   103  	return d.doAllocate(ctx, offset, length, func() error {
   104  		return d.allocate(ctx, mode, offset, length)
   105  	})
   106  }
   107  
   108  // PRead implements vfs.FileDescriptionImpl.PRead.
   109  func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   110  	start := fsmetric.StartReadWait()
   111  	d := fd.dentry()
   112  	defer func() {
   113  		if d.readFD.Load() >= 0 {
   114  			fsmetric.GoferReadsHost.Increment()
   115  			fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start)
   116  		} else {
   117  			fsmetric.GoferReads9P.Increment()
   118  			fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start)
   119  		}
   120  	}()
   121  
   122  	if offset < 0 {
   123  		return 0, linuxerr.EINVAL
   124  	}
   125  
   126  	// Check that flags are supported.
   127  	//
   128  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   129  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   130  		return 0, linuxerr.EOPNOTSUPP
   131  	}
   132  
   133  	// Check for reading at EOF before calling into MM (but not under
   134  	// InteropModeShared, which makes d.size unreliable).
   135  	if d.cachedMetadataAuthoritative() && uint64(offset) >= d.size.Load() {
   136  		return 0, io.EOF
   137  	}
   138  
   139  	var (
   140  		n       int64
   141  		readErr error
   142  	)
   143  	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
   144  		// Write back to the remote file any dirty cached pages that will be
   145  		// touched by the read.
   146  		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
   147  			return 0, err
   148  		}
   149  		rw := getDentryReadWriter(ctx, d, offset)
   150  		// Require the read to go to the remote file.
   151  		rw.direct = true
   152  		n, readErr = dst.CopyOutFrom(ctx, rw)
   153  		putDentryReadWriter(rw)
   154  		if d.fs.opts.interop != InteropModeShared {
   155  			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
   156  			d.touchAtimeLocked(fd.vfsfd.Mount())
   157  		}
   158  	} else {
   159  		rw := getDentryReadWriter(ctx, d, offset)
   160  		n, readErr = dst.CopyOutFrom(ctx, rw)
   161  		putDentryReadWriter(rw)
   162  		if d.fs.opts.interop != InteropModeShared {
   163  			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
   164  			d.touchAtime(fd.vfsfd.Mount())
   165  		}
   166  	}
   167  	return n, readErr
   168  }
   169  
   170  // Read implements vfs.FileDescriptionImpl.Read.
   171  func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   172  	fd.mu.Lock()
   173  	n, err := fd.PRead(ctx, dst, fd.off, opts)
   174  	fd.off += n
   175  	fd.mu.Unlock()
   176  	return n, err
   177  }
   178  
   179  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   180  func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   181  	n, _, err := fd.pwrite(ctx, src, offset, opts)
   182  	return n, err
   183  }
   184  
   185  // pwrite returns the number of bytes written, the final offset, and an error.
   186  // The final offset should be ignored by PWrite.
   187  func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
   188  	if offset < 0 {
   189  		return 0, offset, linuxerr.EINVAL
   190  	}
   191  
   192  	// Check that flags are supported.
   193  	//
   194  	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
   195  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   196  		return 0, offset, linuxerr.EOPNOTSUPP
   197  	}
   198  
   199  	d := fd.dentry()
   200  
   201  	d.metadataMu.Lock()
   202  	defer d.metadataMu.Unlock()
   203  
   204  	// If the fd was opened with O_APPEND, make sure the file size is updated.
   205  	// There is a possible race here if size is modified externally after
   206  	// the metadata cache is updated.
   207  	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
   208  		if err := d.refreshSizeLocked(ctx); err != nil {
   209  			return 0, offset, err
   210  		}
   211  	}
   212  
   213  	// Set offset to file size if the fd was opened with O_APPEND.
   214  	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
   215  		// Holding d.metadataMu is sufficient for reading d.size.
   216  		offset = int64(d.size.RacyLoad())
   217  	}
   218  	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
   219  	if err != nil {
   220  		return 0, offset, err
   221  	}
   222  	src = src.TakeFirst64(limit)
   223  
   224  	if d.fs.opts.interop != InteropModeShared {
   225  		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
   226  		// file_update_time(). This is d.touchCMtime(), but without locking
   227  		// d.metadataMu (recursively).
   228  		d.touchCMtimeLocked()
   229  	}
   230  
   231  	rw := getDentryReadWriter(ctx, d, offset)
   232  	defer putDentryReadWriter(rw)
   233  
   234  	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
   235  		if err := fd.writeCache(ctx, d, offset, src); err != nil {
   236  			return 0, offset, err
   237  		}
   238  
   239  		// Require the write to go to the remote file.
   240  		rw.direct = true
   241  	}
   242  
   243  	n, err := src.CopyInTo(ctx, rw)
   244  	if err != nil {
   245  		return n, offset + n, err
   246  	}
   247  	if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
   248  		// Note that if any of the following fail, then we can't guarantee that
   249  		// any data was actually written with the semantics of O_DSYNC or
   250  		// O_SYNC, so we return zero bytes written. Compare Linux's
   251  		// mm/filemap.c:generic_file_write_iter() =>
   252  		// include/linux/fs.h:generic_write_sync().
   253  		//
   254  		// Write back to the remote file any dirty cached pages touched by
   255  		// the write.
   256  		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
   257  			return 0, offset, err
   258  		}
   259  		// Request the remote filesystem to sync the remote file.
   260  		if err := d.syncRemoteFile(ctx); err != nil {
   261  			return 0, offset, err
   262  		}
   263  	}
   264  
   265  	// As with Linux, writing clears the setuid and setgid bits.
   266  	if n > 0 {
   267  		oldMode := d.mode.Load()
   268  		// If setuid or setgid were set, update d.mode and propagate
   269  		// changes to the host.
   270  		if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode {
   271  			if err := d.chmod(ctx, uint16(newMode)); err != nil {
   272  				return 0, offset, err
   273  			}
   274  			d.mode.Store(newMode)
   275  		}
   276  	}
   277  
   278  	return n, offset + n, nil
   279  }
   280  
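        // writeCache writes back dirty cached pages overlapping the range
        // [offset, offset+src.NumBytes()) to the remote file, then drops those pages
        // from the cache and invalidates any memory mappings of them, so that the
        // cache does not retain data that the following direct write would make stale.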
   281  func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error {
   282  	// Write back to the remote file any dirty cached pages that will be
   283  	// touched by the write.
   284  	if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
   285  		return err
   286  	}
   287  
   288  	// Remove touched pages from the cache.
   289  	pgstart := hostarch.PageRoundDown(uint64(offset))
   290  	pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes()))
   291  	if !ok {
   292  		return linuxerr.EINVAL
   293  	}
   294  	mr := memmap.MappableRange{pgstart, pgend}
   295  	var freed []memmap.FileRange
   296  
   297  	d.dataMu.Lock()
   298  	cseg := d.cache.LowerBoundSegment(mr.Start)
   299  	for cseg.Ok() && cseg.Start() < mr.End {
   300  		cseg = d.cache.Isolate(cseg, mr)
   301  		freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
   302  		cseg = d.cache.Remove(cseg).NextSegment()
   303  	}
   304  	d.dataMu.Unlock()
   305  
   306  	// Invalidate mappings of removed pages.
   307  	d.mapsMu.Lock()
   308  	d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
   309  	d.mapsMu.Unlock()
   310  
   311  	// Finally free pages removed from the cache.
   312  	mf := d.fs.mf
   313  	for _, freedFR := range freed {
   314  		mf.DecRef(freedFR)
   315  	}
   316  	return nil
   317  }
   318  
   319  // Write implements vfs.FileDescriptionImpl.Write.
   320  func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   321  	fd.mu.Lock()
   322  	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
   323  	fd.off = off
   324  	fd.mu.Unlock()
   325  	return n, err
   326  }
   327  
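        // dentryReadWriter implements safemem.Reader and safemem.Writer for a dentry
        // starting at a given offset. Depending on the interop mode, the presence of
        // a host FD, and the direct flag, I/O goes either straight through the
        // dentry's handles or through the page cache.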
   328  type dentryReadWriter struct {
   329  	ctx    context.Context
   330  	d      *dentry
   331  	off    uint64
   332  	direct bool
   333  }
   334  
   335  var dentryReadWriterPool = sync.Pool{
   336  	New: func() any {
   337  		return &dentryReadWriter{}
   338  	},
   339  }
   340  
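        // getDentryReadWriter returns a dentryReadWriter for d at the given offset,
        // taken from dentryReadWriterPool. Callers must release it with
        // putDentryReadWriter.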
   341  func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
   342  	rw := dentryReadWriterPool.Get().(*dentryReadWriter)
   343  	rw.ctx = ctx
   344  	rw.d = d
   345  	rw.off = uint64(offset)
   346  	rw.direct = false
   347  	return rw
   348  }
   349  
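        // putDentryReadWriter clears rw's references and returns it to
        // dentryReadWriterPool.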
   350  func putDentryReadWriter(rw *dentryReadWriter) {
   351  	rw.ctx = nil
   352  	rw.d = nil
   353  	dentryReadWriterPool.Put(rw)
   354  }
   355  
   356  // ReadToBlocks implements safemem.Reader.ReadToBlocks.
   357  func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
   358  	if dsts.IsEmpty() {
   359  		return 0, nil
   360  	}
   361  
   362  	// If we have a mmappable host FD (which must be used here to ensure
   363  	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
   364  	// (which prevents us from caching file contents and makes dentry.size
   365  	// unreliable), or if the file was opened O_DIRECT, read directly from
   366  	// readHandle() without locking dentry.dataMu.
   367  	rw.d.handleMu.RLock()
   368  	h := rw.d.readHandle()
   369  	if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
   370  		n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
   371  		rw.d.handleMu.RUnlock()
   372  		rw.off += n
   373  		return n, err
   374  	}
   375  
   376  	// Otherwise read from/through the cache.
   377  	mf := rw.d.fs.mf
   378  	fillCache := mf.ShouldCacheEvictable()
   379  	var dataMuUnlock func()
   380  	if fillCache {
   381  		rw.d.dataMu.Lock()
   382  		dataMuUnlock = rw.d.dataMu.Unlock
   383  	} else {
   384  		rw.d.dataMu.RLock()
   385  		dataMuUnlock = rw.d.dataMu.RUnlock
   386  	}
   387  
   388  	// Compute the range to read (limited by file size and overflow-checked).
   389  	end := rw.d.size.Load()
   390  	if rw.off >= end {
   391  		dataMuUnlock()
   392  		rw.d.handleMu.RUnlock()
   393  		return 0, io.EOF
   394  	}
   395  	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
   396  		end = rend
   397  	}
   398  
   399  	var done uint64
   400  	seg, gap := rw.d.cache.Find(rw.off)
   401  	for rw.off < end {
   402  		mr := memmap.MappableRange{rw.off, end}
   403  		switch {
   404  		case seg.Ok():
   405  			// Get internal mappings from the cache.
   406  			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
   407  			if err != nil {
   408  				dataMuUnlock()
   409  				rw.d.handleMu.RUnlock()
   410  				return done, err
   411  			}
   412  
   413  			// Copy from internal mappings.
   414  			n, err := safemem.CopySeq(dsts, ims)
   415  			done += n
   416  			rw.off += n
   417  			dsts = dsts.DropFirst64(n)
   418  			if err != nil {
   419  				dataMuUnlock()
   420  				rw.d.handleMu.RUnlock()
   421  				return done, err
   422  			}
   423  
   424  			// Continue.
   425  			seg, gap = seg.NextNonEmpty()
   426  
   427  		case gap.Ok():
   428  			gapMR := gap.Range().Intersect(mr)
   429  			if fillCache {
   430  				// Read into the cache, then re-enter the loop to read from the
   431  				// cache.
   432  				gapEnd, _ := hostarch.PageRoundUp(gapMR.End)
   433  				reqMR := memmap.MappableRange{
   434  					Start: hostarch.PageRoundDown(gapMR.Start),
   435  					End:   gapEnd,
   436  				}
   437  				optMR := gap.Range()
   438  				_, err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size.Load(), mf, usage.PageCache, pgalloc.AllocateAndWritePopulate, h.readToBlocksAt)
   439  				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
   440  				seg, gap = rw.d.cache.Find(rw.off)
   441  				if !seg.Ok() {
   442  					dataMuUnlock()
   443  					rw.d.handleMu.RUnlock()
   444  					return done, err
   445  				}
   446  				// err might have occurred in part of gap.Range() outside gapMR
   447  				// (in particular, gap.End() might be beyond EOF). Forget about
   448  				// it for now; if the error matters and persists, we'll run
   449  				// into it again in a later iteration of this loop.
   450  			} else {
   451  				// Read directly from the file.
   452  				gapDsts := dsts.TakeFirst64(gapMR.Length())
   453  				n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
   454  				done += n
   455  				rw.off += n
   456  				dsts = dsts.DropFirst64(n)
   457  				// Partial reads are fine. But we must stop reading.
   458  				if n != gapDsts.NumBytes() || err != nil {
   459  					dataMuUnlock()
   460  					rw.d.handleMu.RUnlock()
   461  					return done, err
   462  				}
   463  
   464  				// Continue.
   465  				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
   466  			}
   467  		}
   468  	}
   469  	dataMuUnlock()
   470  	rw.d.handleMu.RUnlock()
   471  	return done, nil
   472  }
   473  
   474  // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
   475  //
   476  // Preconditions: rw.d.metadataMu must be locked.
   477  func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
   478  	if srcs.IsEmpty() {
   479  		return 0, nil
   480  	}
   481  
   482  	// If we have a mmappable host FD (which must be used here to ensure
   483  	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
   484  	// (which prevents us from caching file contents), or if the file was
   485  	// opened with O_DIRECT, write directly to dentry.writeHandle()
   486  	// without locking dentry.dataMu.
   487  	rw.d.handleMu.RLock()
   488  	h := rw.d.writeHandle()
   489  	if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
   490  		n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
   491  		rw.off += n
   492  		rw.d.dataMu.Lock()
   493  		if rw.off > rw.d.size.Load() {
   494  			rw.d.size.Store(rw.off)
   495  			// The remote file's size will implicitly be extended to the correct
   496  			// value when we write back to it.
   497  		}
   498  		rw.d.dataMu.Unlock()
   499  		rw.d.handleMu.RUnlock()
   500  		return n, err
   501  	}
   502  
   503  	// Otherwise write to/through the cache.
   504  	mf := rw.d.fs.mf
   505  	rw.d.dataMu.Lock()
   506  
   507  	// Compute the range to write (overflow-checked).
   508  	start := rw.off
   509  	end := rw.off + srcs.NumBytes()
   510  	if end <= rw.off {
   511  		end = math.MaxInt64
   512  	}
   513  
   514  	var (
   515  		done   uint64
   516  		retErr error
   517  	)
   518  	seg, gap := rw.d.cache.Find(rw.off)
   519  	for rw.off < end {
   520  		mr := memmap.MappableRange{rw.off, end}
   521  		switch {
   522  		case seg.Ok():
   523  			// Get internal mappings from the cache.
   524  			segMR := seg.Range().Intersect(mr)
   525  			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write)
   526  			if err != nil {
   527  				retErr = err
   528  				goto exitLoop
   529  			}
   530  
   531  			// Copy to internal mappings.
   532  			n, err := safemem.CopySeq(ims, srcs)
   533  			done += n
   534  			rw.off += n
   535  			srcs = srcs.DropFirst64(n)
   536  			rw.d.dirty.MarkDirty(segMR)
   537  			if err != nil {
   538  				retErr = err
   539  				goto exitLoop
   540  			}
   541  
   542  			// Continue.
   543  			seg, gap = seg.NextNonEmpty()
   544  
   545  		case gap.Ok():
   546  			// Write directly to the file. At present, we never fill the cache
   547  			// when writing, since doing so can convert small writes into
   548  			// inefficient read-modify-write cycles, and we have no mechanism
   549  			// for detecting or avoiding this.
   550  			gapMR := gap.Range().Intersect(mr)
   551  			gapSrcs := srcs.TakeFirst64(gapMR.Length())
   552  			n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
   553  			done += n
   554  			rw.off += n
   555  			srcs = srcs.DropFirst64(n)
   556  			// Partial writes are fine. But we must stop writing.
   557  			if n != gapSrcs.NumBytes() || err != nil {
   558  				retErr = err
   559  				goto exitLoop
   560  			}
   561  
   562  			// Continue.
   563  			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
   564  		}
   565  	}
   566  exitLoop:
   567  	if rw.off > rw.d.size.Load() {
   568  		rw.d.size.Store(rw.off)
   569  		// The remote file's size will implicitly be extended to the correct
   570  		// value when we write back to it.
   571  	}
   572  	// If InteropModeWritethrough is in effect, flush written data back to the
   573  	// remote filesystem.
   574  	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
   575  		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
   576  			Start: start,
   577  			End:   rw.off,
   578  		}, &rw.d.cache, &rw.d.dirty, rw.d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
   579  			// We have no idea how many bytes were actually flushed.
   580  			rw.off = start
   581  			done = 0
   582  			retErr = err
   583  		}
   584  	}
   585  	rw.d.dataMu.Unlock()
   586  	rw.d.handleMu.RUnlock()
   587  	return done, retErr
   588  }
   589  
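        // writeback writes dirty cached pages in the range [offset, offset+size),
        // clipped to the current file size, back to the remote file.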
   590  func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
   591  	if size == 0 {
   592  		return nil
   593  	}
   594  	d.handleMu.RLock()
   595  	defer d.handleMu.RUnlock()
   596  	h := d.writeHandle()
   597  	d.dataMu.Lock()
   598  	defer d.dataMu.Unlock()
   599  	// Compute the range of valid bytes (overflow-checked).
   600  	dentrySize := d.size.Load()
   601  	if uint64(offset) >= dentrySize {
   602  		return nil
   603  	}
   604  	end := int64(dentrySize)
   605  	if rend := offset + size; rend > offset && rend < end {
   606  		end = rend
   607  	}
   608  	return fsutil.SyncDirty(ctx, memmap.MappableRange{
   609  		Start: uint64(offset),
   610  		End:   uint64(end),
   611  	}, &d.cache, &d.dirty, dentrySize, d.fs.mf, h.writeFromBlocksAt)
   612  }
   613  
   614  // Seek implements vfs.FileDescriptionImpl.Seek.
   615  func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   616  	fd.mu.Lock()
   617  	defer fd.mu.Unlock()
   618  	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
   619  	if err != nil {
   620  		return 0, err
   621  	}
   622  	fd.off = newOffset
   623  	return newOffset, nil
   624  }
   625  
   626  // regularFileSeekLocked calculates the new offset for a seek operation on a regular file.
   627  func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
   628  	switch whence {
   629  	case linux.SEEK_SET:
   630  		// Use offset as specified.
   631  	case linux.SEEK_CUR:
   632  		offset += fdOffset
   633  	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
   634  		// Ensure file size is up to date.
   635  		if !d.cachedMetadataAuthoritative() {
   636  			if err := d.updateMetadata(ctx); err != nil {
   637  				return 0, err
   638  			}
   639  		}
   640  		size := int64(d.size.Load())
   641  		// For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous
   642  		// block of data.
   643  		switch whence {
   644  		case linux.SEEK_END:
   645  			offset += size
   646  		case linux.SEEK_DATA:
   647  			if offset >= size {
   648  				return 0, linuxerr.ENXIO
   649  			}
   650  			// Use offset as specified.
   651  		case linux.SEEK_HOLE:
   652  			if offset >= size {
   653  				return 0, linuxerr.ENXIO
   654  			}
   655  			offset = size
   656  		}
   657  	default:
   658  		return 0, linuxerr.EINVAL
   659  	}
   660  	if offset < 0 {
   661  		return 0, linuxerr.EINVAL
   662  	}
   663  	return offset, nil
   664  }
   665  
   666  // Sync implements vfs.FileDescriptionImpl.Sync.
   667  func (fd *regularFileFD) Sync(ctx context.Context) error {
   668  	return fd.dentry().syncCachedFile(ctx, false /* forFilesystemSync */)
   669  }
   670  
   671  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
   672  func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
   673  	d := fd.dentry()
   674  	// Force sentry page caching at your own risk.
   675  	if !d.fs.opts.forcePageCache {
   676  		switch d.fs.opts.interop {
   677  		case InteropModeExclusive:
   678  			// Any mapping is fine.
   679  		case InteropModeWritethrough:
   680  			// Shared writable mappings require a host FD, since otherwise we
   681  			// can't synchronously flush memory-mapped writes to the remote
   682  			// file.
   683  			if opts.Private || !opts.MaxPerms.Write {
   684  				break
   685  			}
   686  			fallthrough
   687  		case InteropModeShared:
   688  			// All mappings require a host FD to be coherent with other
   689  			// filesystem users.
   690  			if d.mmapFD.Load() < 0 {
   691  				return linuxerr.ENODEV
   692  			}
   693  		default:
   694  			panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
   695  		}
   696  	}
   697  	// After this point, d may be used as a memmap.Mappable.
   698  	d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init)
   699  	opts.SentryOwnedContent = d.fs.opts.forcePageCache
   700  	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
   701  }
   702  
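        // mayCachePagesInMemoryFile returns true if regular file pages may be cached
        // in the sentry's MemoryFile, i.e. if page caching is forced or the interop
        // mode is not InteropModeShared.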
   703  func (fs *filesystem) mayCachePagesInMemoryFile() bool {
   704  	return fs.opts.forcePageCache || fs.opts.interop != InteropModeShared
   705  }
   706  
   707  // AddMapping implements memmap.Mappable.AddMapping.
   708  func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
   709  	d.mapsMu.Lock()
   710  	mapped := d.mappings.AddMapping(ms, ar, offset, writable)
   711  	// Do this unconditionally since whether we have a host FD can change
   712  	// across save/restore.
   713  	for _, r := range mapped {
   714  		d.pf.hostFileMapper.IncRefOn(r)
   715  	}
   716  	if d.fs.mayCachePagesInMemoryFile() {
   717  		// d.Evict() will refuse to evict memory-mapped pages, so tell the
   718  		// MemoryFile to not bother trying.
   719  		mf := d.fs.mf
   720  		for _, r := range mapped {
   721  			mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End})
   722  		}
   723  	}
   724  	d.mapsMu.Unlock()
   725  	return nil
   726  }
   727  
   728  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   729  func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
   730  	d.mapsMu.Lock()
   731  	unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable)
   732  	for _, r := range unmapped {
   733  		d.pf.hostFileMapper.DecRefOn(r)
   734  	}
   735  	if d.fs.mayCachePagesInMemoryFile() {
   736  		// Pages that are no longer referenced by any application memory
   737  		// mappings are now considered unused; allow MemoryFile to evict them
   738  		// when necessary.
   739  		mf := d.fs.mf
   740  		d.dataMu.Lock()
   741  		for _, r := range unmapped {
   742  			// Since these pages are no longer mapped, they are no longer
   743  			// concurrently dirtyable by a writable memory mapping.
   744  			d.dirty.AllowClean(r)
   745  			mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End})
   746  		}
   747  		d.dataMu.Unlock()
   748  	}
   749  	d.mapsMu.Unlock()
   750  }
   751  
   752  // CopyMapping implements memmap.Mappable.CopyMapping.
   753  func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
   754  	return d.AddMapping(ctx, ms, dstAR, offset, writable)
   755  }
   756  
   757  // Translate implements memmap.Mappable.Translate.
   758  func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   759  	d.handleMu.RLock()
   760  	if d.mmapFD.RacyLoad() >= 0 && !d.fs.opts.forcePageCache {
   761  		d.handleMu.RUnlock()
   762  		mr := optional
   763  		if d.fs.opts.limitHostFDTranslation {
   764  			mr = maxFillRange(required, optional)
   765  		}
   766  		return []memmap.Translation{
   767  			{
   768  				Source: mr,
   769  				File:   &d.pf,
   770  				Offset: mr.Start,
   771  				Perms:  hostarch.AnyAccess,
   772  			},
   773  		}, nil
   774  	}
   775  
   776  	d.dataMu.Lock()
   777  
   778  	// Constrain translations to d.size (rounded up) to prevent translation to
   779  	// pages that may be concurrently truncated.
   780  	pgend, _ := hostarch.PageRoundUp(d.size.Load())
   781  	var beyondEOF bool
   782  	if required.End > pgend {
   783  		if required.Start >= pgend {
   784  			d.dataMu.Unlock()
   785  			d.handleMu.RUnlock()
   786  			return nil, &memmap.BusError{io.EOF}
   787  		}
   788  		beyondEOF = true
   789  		required.End = pgend
   790  	}
   791  	if optional.End > pgend {
   792  		optional.End = pgend
   793  	}
   794  
   795  	mf := d.fs.mf
   796  	h := d.readHandle()
   797  	_, cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size.Load(), mf, usage.PageCache, pgalloc.AllocateAndWritePopulate, h.readToBlocksAt)
   798  
   799  	var ts []memmap.Translation
   800  	var translatedEnd uint64
   801  	for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
   802  		segMR := seg.Range().Intersect(optional)
   803  		// TODO(jamieliu): Make Translations writable even if writability is
   804  		// not required if already kept-dirty by another writable translation.
   805  		perms := hostarch.AccessType{
   806  			Read:    true,
   807  			Execute: true,
   808  		}
   809  		if at.Write {
   810  			// From this point forward, this memory can be dirtied through the
   811  			// mapping at any time.
   812  			d.dirty.KeepDirty(segMR)
   813  			perms.Write = true
   814  		}
   815  		ts = append(ts, memmap.Translation{
   816  			Source: segMR,
   817  			File:   mf,
   818  			Offset: seg.FileRangeOf(segMR).Start,
   819  			Perms:  perms,
   820  		})
   821  		translatedEnd = segMR.End
   822  	}
   823  
   824  	d.dataMu.Unlock()
   825  	d.handleMu.RUnlock()
   826  
   827  	// Don't return the error returned by d.cache.Fill if it occurred outside
   828  	// of required.
   829  	if translatedEnd < required.End && cerr != nil {
   830  		return ts, &memmap.BusError{cerr}
   831  	}
   832  	if beyondEOF {
   833  		return ts, &memmap.BusError{io.EOF}
   834  	}
   835  	return ts, nil
   836  }
   837  
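        // maxFillRange returns the range to fill on behalf of required: required
        // itself if it is already at least maxReadahead bytes long, all of optional
        // if optional is at most maxReadahead bytes long, and otherwise at most
        // maxReadahead bytes of optional starting at required.Start. For example,
        // required = [0, 4 KB) with optional = [0, 1 MB) yields [0, 64 KB).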
   838  func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
   839  	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
   840  	if required.Length() >= maxReadahead {
   841  		return required
   842  	}
   843  	if optional.Length() <= maxReadahead {
   844  		return optional
   845  	}
   846  	optional.Start = required.Start
   847  	if optional.Length() <= maxReadahead {
   848  		return optional
   849  	}
   850  	optional.End = optional.Start + maxReadahead
   851  	return optional
   852  }
   853  
   854  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
   855  func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
   856  	// Whether we have a host fd (and consequently what memmap.File is
   857  	// mapped) can change across save/restore, so invalidate all translations
   858  	// unconditionally.
   859  	d.mapsMu.Lock()
   860  	defer d.mapsMu.Unlock()
   861  	d.mappings.InvalidateAll(memmap.InvalidateOpts{})
   862  
   863  	// Write the cache's contents back to the remote file so that if we have a
   864  	// host fd after restore, the remote file's contents are coherent.
   865  	mf := d.fs.mf
   866  	d.handleMu.RLock()
   867  	defer d.handleMu.RUnlock()
   868  	h := d.writeHandle()
   869  	d.dataMu.Lock()
   870  	defer d.dataMu.Unlock()
   871  	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
   872  		return err
   873  	}
   874  
   875  	// Discard the cache so that it's not stored in saved state. This is safe
   876  	// because per InvalidateUnsavable invariants, no new translations can have
   877  	// been returned after we invalidated all existing translations above.
   878  	d.cache.DropAll(mf)
   879  	d.dirty.RemoveAll()
   880  
   881  	return nil
   882  }
   883  
   884  // Evict implements pgalloc.EvictableMemoryUser.Evict.
   885  func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
   886  	mr := memmap.MappableRange{er.Start, er.End}
   887  	mf := d.fs.mf
   888  	d.mapsMu.Lock()
   889  	defer d.mapsMu.Unlock()
   890  	d.handleMu.RLock()
   891  	defer d.handleMu.RUnlock()
   892  	h := d.writeHandle()
   893  	d.dataMu.Lock()
   894  	defer d.dataMu.Unlock()
   895  
   896  	// Only allow pages that are no longer memory-mapped to be evicted.
   897  	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
   898  		mgapMR := mgap.Range().Intersect(mr)
   899  		if mgapMR.Length() == 0 {
   900  			continue
   901  		}
   902  		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
   903  			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
   904  		}
   905  		d.cache.Drop(mgapMR, mf)
   906  		d.dirty.KeepClean(mgapMR)
   907  	}
   908  }
   909  
   910  // dentryPlatformFile implements memmap.File. It exists solely because dentry
   911  // cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
   912  //
   913  // dentryPlatformFile is only used when a host FD representing the remote file
   914  // is available (i.e. dentry.mmapFD >= 0), and that FD is used for application
   915  // memory mappings (i.e. !filesystem.opts.forcePageCache).
   916  //
   917  // +stateify savable
   918  type dentryPlatformFile struct {
   919  	memmap.NoBufferedIOFallback
   920  
   921  	*dentry
   922  
   923  	// fdRefs counts references on memmap.File offsets. fdRefs is protected
   924  	// by dentry.dataMu.
   925  	fdRefs fsutil.FrameRefSet
   926  
   927  	// If this dentry represents a regular file, and dentry.mmapFD >= 0,
   928  	// hostFileMapper caches mappings of dentry.mmapFD.
   929  	hostFileMapper fsutil.HostFileMapper
   930  
   931  	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
   932  	hostFileMapperInitOnce sync.Once `state:"nosave"`
   933  }
   934  
   935  // IncRef implements memmap.File.IncRef.
   936  func (d *dentryPlatformFile) IncRef(fr memmap.FileRange, memCgID uint32) {
   937  	d.dataMu.Lock()
   938  	d.fdRefs.IncRefAndAccount(fr, memCgID)
   939  	d.dataMu.Unlock()
   940  }
   941  
   942  // DecRef implements memmap.File.DecRef.
   943  func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
   944  	d.dataMu.Lock()
   945  	d.fdRefs.DecRefAndAccount(fr)
   946  	d.dataMu.Unlock()
   947  }
   948  
   949  // MapInternal implements memmap.File.MapInternal.
   950  func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
   951  	d.handleMu.RLock()
   952  	defer d.handleMu.RUnlock()
   953  	return d.hostFileMapper.MapInternal(fr, int(d.mmapFD.RacyLoad()), at.Write)
   954  }
   955  
   956  // FD implements memmap.File.FD.
   957  func (d *dentryPlatformFile) FD() int {
   958  	d.handleMu.RLock()
   959  	defer d.handleMu.RUnlock()
   960  	return int(d.mmapFD.RacyLoad())
   961  }