github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/tmpfs/regular_file.go

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tmpfs
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  	"math"
    21  
    22  	"github.com/metacubex/gvisor/pkg/abi/linux"
    23  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    24  	"github.com/metacubex/gvisor/pkg/context"
    25  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    26  	"github.com/metacubex/gvisor/pkg/hostarch"
    27  	"github.com/metacubex/gvisor/pkg/safemem"
    28  	"github.com/metacubex/gvisor/pkg/sentry/fsmetric"
    29  	"github.com/metacubex/gvisor/pkg/sentry/fsutil"
    30  	"github.com/metacubex/gvisor/pkg/sentry/hostfd"
    31  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    32  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    33  	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
    34  	"github.com/metacubex/gvisor/pkg/sentry/usage"
    35  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    36  	"github.com/metacubex/gvisor/pkg/sync"
    37  	"github.com/metacubex/gvisor/pkg/usermem"
    38  )
    39  
    40  // regularFile is a regular (=S_IFREG) tmpfs file.
    41  //
    42  // +stateify savable
    43  type regularFile struct {
    44  	inode inode
    45  
    46  	// memoryUsageKind is the memory accounting category under which pages backing
    47  	// this regularFile's contents are accounted.
    48  	memoryUsageKind usage.MemoryKind
    49  
    50  	// mapsMu protects mappings.
    51  	mapsMu sync.Mutex `state:"nosave"`
    52  
    53  	// mappings tracks mappings of the file into memmap.MappingSpaces.
    54  	//
    55  	// Protected by mapsMu.
    56  	mappings memmap.MappingSet
    57  
    58  	// writableMappingPages tracks how many pages of virtual memory are mapped
    59  	// as potentially writable from this file. If a page has multiple mappings,
    60  	// each mapping is counted separately.
    61  	//
    62  	// This counter is susceptible to overflow as we can potentially count
    63  	// mappings from many VMAs. We count pages rather than bytes to slightly
    64  	// mitigate this.
    65  	//
    66  	// Protected by mapsMu.
    67  	writableMappingPages uint64
    68  
    69  	// dataMu protects the fields below.
    70  	dataMu sync.RWMutex `state:"nosave"`
    71  
     72  	// data maps offsets into the file to offsets into the filesystem's
     73  	// MemoryFile (inode.fs.mf) that store the file's data.
    74  	//
    75  	// Protected by dataMu.
    76  	data fsutil.FileRangeSet
    77  
    78  	// seals represents file seals on this inode.
    79  	//
    80  	// Protected by dataMu.
    81  	seals uint32
    82  
    83  	// size is the size of data.
    84  	//
    85  	// Protected by both dataMu and inode.mu; reading it requires holding
    86  	// either mutex, while writing requires holding both AND using atomics.
    87  	// Readers that do not require consistency (like Stat) may read the
    88  	// value atomically without holding either lock.
    89  	size atomicbitops.Uint64
    90  }
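
// The locking protocol for regularFile.size documented above boils down to the
// following access patterns (an illustrative sketch, not part of the original
// source; rf is a *regularFile):
//
//	// Writer: hold both inode.mu and dataMu, then store atomically.
//	rf.inode.mu.Lock()
//	rf.dataMu.Lock()
//	rf.size.Store(newSize)
//	rf.dataMu.Unlock()
//	rf.inode.mu.Unlock()
//
//	// Consistent reader: holding either mutex is sufficient.
//	rf.dataMu.RLock()
//	sz := rf.size.RacyLoad()
//	rf.dataMu.RUnlock()
//
//	// Inconsistent reader (e.g. Stat): a bare atomic load.
//	sz := rf.size.Load()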
    91  
    92  func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode {
    93  	file := &regularFile{
    94  		memoryUsageKind: fs.usage,
    95  		seals:           linux.F_SEAL_SEAL,
    96  	}
    97  	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir)
    98  	file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory
    99  	return &file.inode
   100  }
   101  
   102  // newUnlinkedRegularFileDescription creates a regular file on the tmpfs
   103  // filesystem represented by mount and returns an FD representing that file.
   104  // The new file is not reachable by path traversal from any other file.
   105  //
   106  // newUnlinkedRegularFileDescription is analogous to Linux's
   107  // mm/shmem.c:__shmem_file_setup().
   108  //
   109  // Preconditions: mount must be a tmpfs mount.
   110  func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
   111  	fs, ok := mount.Filesystem().Impl().(*filesystem)
   112  	if !ok {
   113  		panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
   114  	}
   115  
   116  	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */)
   117  	d := fs.newDentry(inode)
   118  	defer d.DecRef(ctx)
   119  	d.name = name
   120  
   121  	fd := &regularFileFD{}
   122  	fd.Init(&inode.locks)
   123  	flags := uint32(linux.O_RDWR)
   124  	if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
   125  		return nil, err
   126  	}
   127  	return fd, nil
   128  }
   129  
   130  // NewZeroFile creates a new regular file and file description as for
   131  // mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
   132  // initially (implicitly) filled with zeroes.
   133  //
   134  // Preconditions: mount must be a tmpfs mount.
   135  func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
   136  	// Compare mm/shmem.c:shmem_zero_setup().
   137  	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
   138  	if err != nil {
   139  		return nil, err
   140  	}
   141  	rf := fd.inode().impl.(*regularFile)
   142  	rf.memoryUsageKind = usage.Anonymous
   143  	rf.size.Store(size)
   144  	return &fd.vfsfd, err
   145  }
   146  
   147  // NewMemfd creates a new regular file and file description as for
   148  // memfd_create.
   149  //
   150  // Preconditions: mount must be a tmpfs mount.
   151  func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
   152  	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
   153  	if err != nil {
   154  		return nil, err
   155  	}
   156  	if allowSeals {
   157  		fd.inode().impl.(*regularFile).seals = 0
   158  	}
   159  	return &fd.vfsfd, nil
   160  }
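
// Illustrative usage sketch (not part of the original source): a
// memfd_create(2) implementation built on this package could call NewMemfd
// roughly as follows. The mount, credentials, and MFD flag handling shown here
// are assumptions, not taken from this file:
//
//	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
//	file, err := tmpfs.NewMemfd(ctx, creds, shmMount, allowSeals, "memfd:"+name)
//	if err != nil {
//		return err
//	}
//	defer file.DecRef(ctx)
//	// ... install file into the caller's FD table ...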
   161  
   162  // truncate grows or shrinks the file to the given size. It returns true if the
   163  // file size was updated.
   164  func (rf *regularFile) truncate(newSize uint64) (bool, error) {
   165  	rf.inode.mu.Lock()
   166  	defer rf.inode.mu.Unlock()
   167  	return rf.truncateLocked(newSize)
   168  }
   169  
   170  // Preconditions:
   171  //   - rf.inode.mu must be held.
   172  //   - rf.dataMu must be locked for writing.
   173  //   - newSize > rf.size.
   174  func (rf *regularFile) growLocked(newSize uint64) error {
   175  	// Can we grow the file?
   176  	if rf.seals&linux.F_SEAL_GROW != 0 {
   177  		return linuxerr.EPERM
   178  	}
   179  	rf.size.Store(newSize)
   180  	return nil
   181  }
   182  
   183  // Preconditions: rf.inode.mu must be held.
   184  func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
   185  	oldSize := rf.size.RacyLoad()
   186  	if newSize == oldSize {
   187  		// Nothing to do.
   188  		return false, nil
   189  	}
   190  
   191  	// Need to hold inode.mu and dataMu while modifying size.
   192  	rf.dataMu.Lock()
   193  	if newSize > oldSize {
   194  		err := rf.growLocked(newSize)
   195  		rf.dataMu.Unlock()
   196  		return err == nil, err
   197  	}
   198  
   199  	// We are shrinking the file. First check if this is allowed.
   200  	if rf.seals&linux.F_SEAL_SHRINK != 0 {
   201  		rf.dataMu.Unlock()
   202  		return false, linuxerr.EPERM
   203  	}
   204  
   205  	rf.size.Store(newSize)
   206  	rf.dataMu.Unlock()
   207  
   208  	// Invalidate past translations of truncated pages.
   209  	oldpgend := offsetPageEnd(int64(oldSize))
   210  	newpgend := offsetPageEnd(int64(newSize))
   211  	if newpgend < oldpgend {
   212  		rf.mapsMu.Lock()
   213  		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
   214  			// Compare Linux's mm/shmem.c:shmem_setattr() =>
   215  			// mm/memory.c:unmap_mapping_range(evencows=1).
   216  			InvalidatePrivate: true,
   217  		})
   218  		rf.mapsMu.Unlock()
   219  	}
   220  
   221  	// We are now guaranteed that there are no translations of truncated pages,
   222  	// and can remove them.
   223  	rf.dataMu.Lock()
   224  	decPages := rf.data.Truncate(newSize, rf.inode.fs.mf)
   225  	rf.dataMu.Unlock()
   226  	rf.inode.fs.unaccountPages(decPages)
   227  	return true, nil
   228  }
   229  
   230  // AddMapping implements memmap.Mappable.AddMapping.
   231  func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
   232  	rf.mapsMu.Lock()
   233  	defer rf.mapsMu.Unlock()
   234  	rf.dataMu.RLock()
   235  	defer rf.dataMu.RUnlock()
   236  
   237  	// Reject writable mapping if F_SEAL_WRITE is set.
   238  	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
   239  		return linuxerr.EPERM
   240  	}
   241  
   242  	rf.mappings.AddMapping(ms, ar, offset, writable)
   243  	if writable {
   244  		pagesBefore := rf.writableMappingPages
   245  
   246  		// ar is guaranteed to be page aligned per memmap.Mappable.
   247  		rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize)
   248  
   249  		if rf.writableMappingPages < pagesBefore {
   250  			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
   251  		}
   252  	}
   253  
   254  	return nil
   255  }
   256  
   257  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   258  func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
   259  	rf.mapsMu.Lock()
   260  	defer rf.mapsMu.Unlock()
   261  
   262  	rf.mappings.RemoveMapping(ms, ar, offset, writable)
   263  
   264  	if writable {
   265  		pagesBefore := rf.writableMappingPages
   266  
   267  		// ar is guaranteed to be page aligned per memmap.Mappable.
   268  		rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize)
   269  
   270  		if rf.writableMappingPages > pagesBefore {
   271  			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
   272  		}
   273  	}
   274  }
   275  
   276  // CopyMapping implements memmap.Mappable.CopyMapping.
   277  func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
   278  	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
   279  }
   280  
   281  // Translate implements memmap.Mappable.Translate.
   282  func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   283  	rf.dataMu.Lock()
   284  	defer rf.dataMu.Unlock()
   285  
    286  	// Constrain translations to rf.size (rounded up) to prevent
    287  	// translation to pages that may be concurrently truncated.
   288  	pgend := offsetPageEnd(int64(rf.size.RacyLoad()))
   289  	var beyondEOF bool
   290  	if required.End > pgend {
   291  		if required.Start >= pgend {
   292  			return nil, &memmap.BusError{io.EOF}
   293  		}
   294  		beyondEOF = true
   295  		required.End = pgend
   296  	}
   297  	if optional.End > pgend {
   298  		optional.End = pgend
   299  	}
   300  	pagesToFill := rf.data.PagesToFill(required, optional)
   301  	if !rf.inode.fs.accountPages(pagesToFill) {
    302  		// If we cannot accommodate pagesToFill pages, retry with just the
    303  		// required range, since optional may be larger than required. Only
    304  		// error out if even the required range cannot be allocated.
   305  		pagesToFill = rf.data.PagesToFill(required, required)
   306  		if !rf.inode.fs.accountPages(pagesToFill) {
   307  			return nil, &memmap.BusError{linuxerr.ENOSPC}
   308  		}
   309  		optional = required
   310  	}
   311  	pagesAlloced, cerr := rf.data.Fill(ctx, required, optional, rf.size.RacyLoad(), rf.inode.fs.mf, rf.memoryUsageKind, pgalloc.AllocateOnly, nil /* r */)
    312  	// rf.data.Fill() may fail midway. We still want to account for any pages
    313  	// that were allocated, irrespective of an error.
   314  	rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
   315  
   316  	var ts []memmap.Translation
   317  	var translatedEnd uint64
   318  	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
   319  		segMR := seg.Range().Intersect(optional)
   320  		ts = append(ts, memmap.Translation{
   321  			Source: segMR,
   322  			File:   rf.inode.fs.mf,
   323  			Offset: seg.FileRangeOf(segMR).Start,
   324  			Perms:  hostarch.AnyAccess,
   325  		})
   326  		translatedEnd = segMR.End
   327  	}
   328  
    329  	// Don't return the error returned by rf.data.Fill if it occurred outside
    330  	// of required.
   331  	if translatedEnd < required.End && cerr != nil {
   332  		return ts, &memmap.BusError{cerr}
   333  	}
   334  	if beyondEOF {
   335  		return ts, &memmap.BusError{io.EOF}
   336  	}
   337  	return ts, nil
   338  }
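
// Illustrative note (not part of the original source): the accounting fallback
// in Translate above means that if, for example, optional spans 16 pages but
// only 4 more pages can be accounted against the filesystem's limit, the fill
// is retried with just the required range; ENOSPC is returned only if even the
// required range cannot be accounted.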
   339  
   340  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
   341  func (*regularFile) InvalidateUnsavable(context.Context) error {
   342  	return nil
   343  }
   344  
   345  // +stateify savable
   346  type regularFileFD struct {
   347  	fileDescription
   348  
   349  	// off is the file offset. off is accessed using atomic memory operations.
   350  	// offMu serializes operations that may mutate off.
   351  	off   int64
   352  	offMu sync.Mutex `state:"nosave"`
   353  }
   354  
   355  // Release implements vfs.FileDescriptionImpl.Release.
   356  func (fd *regularFileFD) Release(context.Context) {
   357  	// noop
   358  }
   359  
   360  // Allocate implements vfs.FileDescriptionImpl.Allocate.
   361  func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
   362  	f := fd.inode().impl.(*regularFile)
   363  	// To be consistent with Linux, inode.mu must be locked throughout.
   364  	f.inode.mu.Lock()
   365  	defer f.inode.mu.Unlock()
   366  	end := offset + length
   367  	pgEnd, ok := hostarch.PageRoundUp(end)
   368  	if !ok {
   369  		return linuxerr.EFBIG
   370  	}
   371  	// Allocate in chunks for the following reasons:
    372  	// 1. The size limit may permit a very large fallocate, which can take a
    373  	//    long time to execute on the host. This could cause the watchdog to
    374  	//    time out and crash the system, so it needs periodic petting.
   375  	// 2. Linux allocates folios iteratively while checking for interrupts. In
   376  	//    gVisor, we need to manually check for interrupts between chunks.
   377  	const chunkSize = 4 << 30 // 4 GiB
   378  	for curPgStart := hostarch.PageRoundDown(offset); curPgStart < pgEnd; {
   379  		curPgEnd := pgEnd
   380  		newSize := end
   381  		if curPgEnd-curPgStart > chunkSize {
   382  			curPgEnd = curPgStart + chunkSize
   383  			newSize = curPgEnd
   384  		}
   385  		required := memmap.MappableRange{Start: curPgStart, End: curPgEnd}
   386  		if err := f.allocateLocked(ctx, mode, newSize, required); err != nil {
   387  			return err
   388  		}
   389  		// This loop can take a long time to process, so periodically check for
   390  		// interrupts. This also pets the watchdog.
   391  		if ctx.Interrupted() {
   392  			return linuxerr.EINTR
   393  		}
   394  		// Advance curPgStart.
   395  		curPgStart = curPgEnd
   396  	}
   397  	return nil
   398  }
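
// Illustrative note (not part of the original source): with the 4 GiB chunk
// size above, a call like fd.Allocate(ctx, 0, 0, 10<<30) proceeds as three
// allocateLocked calls covering [0, 4 GiB), [4 GiB, 8 GiB), and [8 GiB, 10 GiB),
// checking ctx.Interrupted() after each chunk so the loop can be aborted with
// EINTR.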
   399  
   400  // Preconditions:
   401  // - rf.inode.mu is locked.
   402  // - required must be page-aligned.
   403  // - required.Start < newSize <= required.End.
   404  func (rf *regularFile) allocateLocked(ctx context.Context, mode, newSize uint64, required memmap.MappableRange) error {
   405  	rf.dataMu.Lock()
   406  	defer rf.dataMu.Unlock()
   407  
   408  	// We must allocate pages in the range specified by offset and length.
   409  	// Even if newSize <= oldSize, there might not be actual memory backing this
    410  	// range, so any gaps must be filled by calling rf.data.Fill().
   411  	// "After a successful call, subsequent writes into the range
   412  	// specified by offset and len are guaranteed not to fail because of
   413  	// lack of disk space."  - fallocate(2)
   414  	pagesToFill := rf.data.PagesToFill(required, required)
   415  	if !rf.inode.fs.accountPages(pagesToFill) {
   416  		return linuxerr.ENOSPC
   417  	}
   418  	// Given our definitions in pgalloc, fallocate(2) semantics imply that pages
   419  	// in the MemoryFile must be committed, in addition to being allocated.
   420  	allocMode := pgalloc.AllocateAndCommit
   421  	if !rf.inode.fs.mf.IsDiskBacked() {
   422  		// Upgrade to AllocateAndWritePopulate for memory(shmem)-backed files. We
   423  		// take a more aggressive approach in populating pages for memory-backed
   424  		// MemoryFiles. shmem pages are subject to swap rather than disk writeback.
   425  		// They are not likely to be swapped before they are written to. Hence it
   426  		// is beneficial to populate (in addition to commit) shmem pages to avoid
   427  		// faulting page-by-page when these pages are written to in the future.
   428  		allocMode = pgalloc.AllocateAndWritePopulate
   429  	}
   430  	pagesAlloced, err := rf.data.Fill(ctx, required, required, newSize, rf.inode.fs.mf, rf.memoryUsageKind, allocMode, nil /* r */)
    431  	// rf.data.Fill() may fail midway. We still want to account for any pages
    432  	// that were allocated, irrespective of an error.
   433  	rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
   434  	if err != nil && err != io.EOF {
   435  		return err
   436  	}
   437  
   438  	oldSize := rf.size.Load()
   439  	if oldSize >= newSize {
   440  		return nil
   441  	}
   442  	return rf.growLocked(newSize)
   443  }
   444  
   445  // PRead implements vfs.FileDescriptionImpl.PRead.
   446  func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   447  	start := fsmetric.StartReadWait()
   448  	defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
   449  	fsmetric.TmpfsReads.Increment()
   450  
   451  	if offset < 0 {
   452  		return 0, linuxerr.EINVAL
   453  	}
   454  
   455  	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
   456  	// all state is in-memory.
   457  	//
   458  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   459  	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
   460  		return 0, linuxerr.EOPNOTSUPP
   461  	}
   462  
   463  	if dst.NumBytes() == 0 {
   464  		return 0, nil
   465  	}
   466  	f := fd.inode().impl.(*regularFile)
   467  	rw := getRegularFileReadWriter(f, offset, 0)
   468  	n, err := dst.CopyOutFrom(ctx, rw)
   469  	putRegularFileReadWriter(rw)
   470  	fd.inode().touchAtime(fd.vfsfd.Mount())
   471  	return n, err
   472  }
   473  
   474  // Read implements vfs.FileDescriptionImpl.Read.
   475  func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   476  	fd.offMu.Lock()
   477  	n, err := fd.PRead(ctx, dst, fd.off, opts)
   478  	fd.off += n
   479  	fd.offMu.Unlock()
   480  	return n, err
   481  }
   482  
   483  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   484  func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   485  	n, _, err := fd.pwrite(ctx, src, offset, opts)
   486  	return n, err
   487  }
   488  
    489  // pwrite returns the number of bytes written, the final offset, and any error.
    490  // The final offset should be ignored by PWrite.
   491  func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
   492  	if offset < 0 {
   493  		return 0, offset, linuxerr.EINVAL
   494  	}
   495  
   496  	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
   497  	// all state is in-memory.
   498  	//
   499  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   500  	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
   501  		return 0, offset, linuxerr.EOPNOTSUPP
   502  	}
   503  
   504  	srclen := src.NumBytes()
   505  	if srclen == 0 {
   506  		return 0, offset, nil
   507  	}
   508  	f := fd.inode().impl.(*regularFile)
   509  	f.inode.mu.Lock()
   510  	defer f.inode.mu.Unlock()
   511  	// If the file is opened with O_APPEND, update offset to file size.
   512  	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
   513  		// Locking f.inode.mu is sufficient for reading f.size.
   514  		offset = int64(f.size.RacyLoad())
   515  	}
   516  	end := offset + srclen
   517  	if end < offset {
   518  		// Overflow.
   519  		return 0, offset, linuxerr.EINVAL
   520  	}
   521  
   522  	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
   523  	if err != nil {
   524  		return 0, offset, err
   525  	}
   526  	src = src.TakeFirst64(srclen)
   527  
   528  	// Perform the write.
   529  	rw := getRegularFileReadWriter(f, offset, pgalloc.MemoryCgroupIDFromContext(ctx))
   530  	n, err := src.CopyInTo(ctx, rw)
   531  
   532  	f.inode.touchCMtimeLocked()
   533  	for {
   534  		old := f.inode.mode.Load()
   535  		new := vfs.ClearSUIDAndSGID(old)
   536  		if swapped := f.inode.mode.CompareAndSwap(old, new); swapped {
   537  			break
   538  		}
   539  	}
   540  	putRegularFileReadWriter(rw)
   541  	return n, n + offset, err
   542  }
   543  
   544  // Write implements vfs.FileDescriptionImpl.Write.
   545  func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   546  	fd.offMu.Lock()
   547  	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
   548  	fd.off = off
   549  	fd.offMu.Unlock()
   550  	return n, err
   551  }
   552  
   553  // Seek implements vfs.FileDescriptionImpl.Seek.
   554  func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   555  	fd.offMu.Lock()
   556  	defer fd.offMu.Unlock()
   557  	switch whence {
   558  	case linux.SEEK_SET:
   559  		// use offset as specified
   560  	case linux.SEEK_CUR:
   561  		offset += fd.off
   562  	case linux.SEEK_END:
   563  		offset += int64(fd.inode().impl.(*regularFile).size.Load())
   564  	default:
   565  		return 0, linuxerr.EINVAL
   566  	}
   567  	if offset < 0 {
   568  		return 0, linuxerr.EINVAL
   569  	}
   570  	fd.off = offset
   571  	return offset, nil
   572  }
   573  
   574  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
   575  func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
   576  	file := fd.inode().impl.(*regularFile)
   577  	opts.SentryOwnedContent = true
   578  	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
   579  }
   580  
   581  // offsetPageEnd returns the file offset rounded up to the nearest
   582  // page boundary. offsetPageEnd panics if rounding up causes overflow,
   583  // which shouldn't be possible given that offset is an int64.
   584  func offsetPageEnd(offset int64) uint64 {
   585  	end, ok := hostarch.Addr(offset).RoundUp()
   586  	if !ok {
   587  		panic("impossible overflow")
   588  	}
   589  	return uint64(end)
   590  }
   591  
    592  // regularFileReadWriter implements safemem.Reader and safemem.Writer.
   593  type regularFileReadWriter struct {
   594  	file *regularFile
   595  
   596  	// Offset into the file to read/write at. Note that this may be
   597  	// different from the FD offset if PRead/PWrite is used.
   598  	off uint64
   599  
   600  	// memCgID is the memory cgroup ID used for accounting the allocated
   601  	// pages.
   602  	memCgID uint32
   603  }
   604  
   605  var regularFileReadWriterPool = sync.Pool{
   606  	New: func() any {
   607  		return &regularFileReadWriter{}
   608  	},
   609  }
   610  
   611  func getRegularFileReadWriter(file *regularFile, offset int64, memCgID uint32) *regularFileReadWriter {
   612  	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
   613  	rw.file = file
   614  	rw.off = uint64(offset)
   615  	rw.memCgID = memCgID
   616  	return rw
   617  }
   618  
   619  func putRegularFileReadWriter(rw *regularFileReadWriter) {
   620  	rw.file = nil
   621  	regularFileReadWriterPool.Put(rw)
   622  }
   623  
   624  // ReadToBlocks implements safemem.Reader.ReadToBlocks.
   625  func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
   626  	rw.file.dataMu.RLock()
   627  	defer rw.file.dataMu.RUnlock()
   628  	size := rw.file.size.RacyLoad()
   629  
   630  	// Compute the range to read (limited by file size and overflow-checked).
   631  	if rw.off >= size {
   632  		return 0, io.EOF
   633  	}
   634  	end := size
   635  	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
   636  		end = rend
   637  	}
   638  
   639  	var done uint64
   640  	seg, gap := rw.file.data.Find(uint64(rw.off))
   641  	for rw.off < end {
   642  		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
   643  		switch {
   644  		case seg.Ok():
   645  			// Get internal mappings.
   646  			ims, err := rw.file.inode.fs.mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
   647  			if err != nil {
   648  				return done, err
   649  			}
   650  
   651  			// Copy from internal mappings.
   652  			n, err := safemem.CopySeq(dsts, ims)
   653  			done += n
   654  			rw.off += uint64(n)
   655  			dsts = dsts.DropFirst64(n)
   656  			if err != nil {
   657  				return done, err
   658  			}
   659  
   660  			// Continue.
   661  			seg, gap = seg.NextNonEmpty()
   662  
   663  		case gap.Ok():
   664  			// Tmpfs holes are zero-filled.
   665  			gapmr := gap.Range().Intersect(mr)
   666  			dst := dsts.TakeFirst64(gapmr.Length())
   667  			n, err := safemem.ZeroSeq(dst)
   668  			done += n
   669  			rw.off += uint64(n)
   670  			dsts = dsts.DropFirst64(n)
   671  			if err != nil {
   672  				return done, err
   673  			}
   674  
   675  			// Continue.
   676  			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
   677  		}
   678  	}
   679  	return done, nil
   680  }
   681  
   682  // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
   683  //
   684  // Preconditions: rw.file.inode.mu must be held.
   685  func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
   686  	// Hold dataMu so we can modify size.
   687  	rw.file.dataMu.Lock()
   688  	defer rw.file.dataMu.Unlock()
   689  
   690  	// Compute the range to write (overflow-checked).
   691  	end := rw.off + srcs.NumBytes()
   692  	if end <= rw.off {
   693  		end = math.MaxInt64
   694  	}
   695  
   696  	// Check if seals prevent either file growth or all writes.
   697  	switch {
   698  	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
   699  		return 0, linuxerr.EPERM
   700  	case end > rw.file.size.RacyLoad() && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
   701  		// When growth is sealed, Linux effectively allows writes which would
   702  		// normally grow the file to partially succeed up to the current EOF,
   703  		// rounded down to the page boundary before the EOF.
   704  		//
   705  		// This happens because writes (and thus the growth check) for tmpfs
   706  		// files proceed page-by-page on Linux, and the final write to the page
   707  		// containing EOF fails, resulting in a partial write up to the start of
   708  		// that page.
   709  		//
   710  		// To emulate this behaviour, artificially truncate the write to the
   711  		// start of the page containing the current EOF.
   712  		//
   713  		// See Linux, mm/filemap.c:generic_perform_write() and
   714  		// mm/shmem.c:shmem_write_begin().
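		//
		// For example (illustrative, not from the original source): with 4 KiB
		// pages and a current size of 5000 bytes, a grow-sealed write covering
		// [3000, 7000) is truncated to [3000, 4096) and partially succeeds,
		// while a write covering [4500, 7000) fails with EPERM since nothing
		// remains below the 4096-byte page boundary.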
   715  		if pgstart := uint64(hostarch.Addr(rw.file.size.RacyLoad()).RoundDown()); end > pgstart {
   716  			end = pgstart
   717  		}
   718  		if end <= rw.off {
   719  			// Truncation would result in no data being written.
   720  			return 0, linuxerr.EPERM
   721  		}
   722  	}
   723  
    724  	// Page-aligned mr for when we need to allocate memory. RoundUp can't
    725  	// overflow because end is at most math.MaxInt64.
   726  	pgstartaddr := hostarch.Addr(rw.off).RoundDown()
   727  	pgendaddr, _ := hostarch.Addr(end).RoundUp()
   728  	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
   729  
   730  	var (
   731  		done   uint64
   732  		retErr error
   733  	)
   734  	seg, gap := rw.file.data.Find(uint64(rw.off))
   735  	for rw.off < end {
   736  		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
   737  		switch {
   738  		case seg.Ok():
   739  			n, err := rw.writeToMF(seg.FileRangeOf(seg.Range().Intersect(mr)), srcs)
   740  			done += n
   741  			rw.off += uint64(n)
   742  			srcs = srcs.DropFirst64(n)
   743  			if err != nil {
   744  				retErr = err
   745  				goto exitLoop
   746  			}
   747  
   748  			// Continue.
   749  			seg, gap = seg.NextNonEmpty()
   750  
   751  		case gap.Ok():
   752  			// Allocate memory for the write.
   753  			gapMR := gap.Range().Intersect(pgMR)
   754  			pagesToFill := gapMR.Length() / hostarch.PageSize
   755  			pagesReserved := rw.file.inode.fs.accountPagesPartial(pagesToFill)
   756  			if pagesReserved == 0 {
   757  				if done == 0 {
   758  					retErr = linuxerr.ENOSPC
   759  					goto exitLoop
   760  				}
   761  				retErr = nil
   762  				goto exitLoop
   763  			}
   764  			gapMR.End = gapMR.Start + (hostarch.PageSize * pagesReserved)
   765  			allocMode := pgalloc.AllocateAndWritePopulate
   766  			if rw.file.inode.fs.mf.IsDiskBacked() {
   767  				// Don't populate pages for disk-backed files. Benchmarking showed that
   768  				// disk-backed pages are likely to be written back to disk before we
    769  				// can write to them. The pages fault again on write anyway. Overall,
    770  				// prepopulating disk-backed pages hurts performance: it fails to
    771  				// eliminate future page faults and additionally incurs useless disk
    772  				// writebacks.
   773  				allocMode = pgalloc.AllocateOnly
   774  			}
   775  			fr, err := rw.file.inode.fs.mf.Allocate(gapMR.Length(), pgalloc.AllocOpts{
   776  				Kind:    rw.file.memoryUsageKind,
   777  				Mode:    allocMode,
   778  				MemCgID: rw.memCgID,
   779  			})
   780  			if err != nil {
   781  				retErr = err
   782  				rw.file.inode.fs.unaccountPages(pagesReserved)
   783  				goto exitLoop
   784  			}
   785  
   786  			// Write to that memory as usual.
   787  			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
   788  		default:
   789  			panic("unreachable")
   790  		}
   791  	}
   792  exitLoop:
   793  	// If the write ends beyond the file's previous size, it causes the
   794  	// file to grow.
   795  	if rw.off > rw.file.size.RacyLoad() {
   796  		rw.file.size.Store(rw.off)
   797  	}
   798  
   799  	return done, retErr
   800  }
   801  
   802  func (rw *regularFileReadWriter) writeToMF(fr memmap.FileRange, srcs safemem.BlockSeq) (uint64, error) {
   803  	if rw.file.inode.fs.mf.IsDiskBacked() {
   804  		// Disk-backed files are not prepopulated. The safemem.CopySeq() approach
   805  		// used below incurs a lot of page faults without page prepopulation, which
    806  		// causes a lot of context switching. Use the pwritev2(2) host syscall
    807  		// instead, which makes one context switch and faults all the pages that
    808  		// are touched during the write.
   809  		return hostfd.Pwritev2(
   810  			int32(rw.file.inode.fs.mf.FD()), // fd
   811  			srcs.TakeFirst64(fr.Length()),   // srcs
   812  			int64(fr.Start),                 // offset
   813  			0,                               // flags
   814  		)
   815  	}
   816  	// Get internal mappings.
   817  	ims, err := rw.file.inode.fs.mf.MapInternal(fr, hostarch.Write)
   818  	if err != nil {
   819  		return 0, err
   820  	}
   821  	// Copy to internal mappings.
   822  	return safemem.CopySeq(ims, srcs)
   823  }
   824  
   825  // GetSeals returns the current set of seals on a memfd inode.
   826  func GetSeals(fd *vfs.FileDescription) (uint32, error) {
   827  	f, ok := fd.Impl().(*regularFileFD)
   828  	if !ok {
   829  		return 0, linuxerr.EINVAL
   830  	}
   831  	rf := f.inode().impl.(*regularFile)
   832  	rf.dataMu.RLock()
   833  	defer rf.dataMu.RUnlock()
   834  	return rf.seals, nil
   835  }
   836  
   837  // AddSeals adds new file seals to a memfd inode.
   838  func AddSeals(fd *vfs.FileDescription, val uint32) error {
   839  	f, ok := fd.Impl().(*regularFileFD)
   840  	if !ok {
   841  		return linuxerr.EINVAL
   842  	}
   843  	rf := f.inode().impl.(*regularFile)
   844  	rf.mapsMu.Lock()
   845  	defer rf.mapsMu.Unlock()
   846  	rf.dataMu.Lock()
   847  	defer rf.dataMu.Unlock()
   848  
   849  	if rf.seals&linux.F_SEAL_SEAL != 0 {
    850  		// F_SEAL_SEAL is set, which prevents the addition of any new seals.
   851  		return linuxerr.EPERM
   852  	}
   853  
   854  	// F_SEAL_WRITE can only be added if there are no active writable maps.
   855  	if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
   856  		if rf.writableMappingPages > 0 {
   857  			return linuxerr.EBUSY
   858  		}
   859  	}
   860  
   861  	// Seals can only be added, never removed.
   862  	rf.seals |= val
   863  	return nil
   864  }
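
// Illustrative sketch (not part of the original source): GetSeals and AddSeals
// are intended to back fcntl(2)'s F_GET_SEALS and F_ADD_SEALS commands for
// memfd files. A caller holding a *vfs.FileDescription might use them roughly
// as follows:
//
//	seals, err := tmpfs.GetSeals(file)
//	if err != nil {
//		return err // not a tmpfs regular file
//	}
//	if seals&linux.F_SEAL_SEAL == 0 {
//		err = tmpfs.AddSeals(file, linux.F_SEAL_GROW|linux.F_SEAL_SHRINK)
//	}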