github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/fsimpl/tmpfs/regular_file.go

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tmpfs
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  	"math"
    21  
    22  	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
    23  	"github.com/ttpreport/gvisor-ligolo/pkg/atomicbitops"
    24  	"github.com/ttpreport/gvisor-ligolo/pkg/context"
    25  	"github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr"
    26  	"github.com/ttpreport/gvisor-ligolo/pkg/hostarch"
    27  	"github.com/ttpreport/gvisor-ligolo/pkg/safemem"
    28  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/fsmetric"
    29  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/fsutil"
    30  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/hostfd"
    31  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/auth"
    32  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/memmap"
    33  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/pgalloc"
    34  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/usage"
    35  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/vfs"
    36  	"github.com/ttpreport/gvisor-ligolo/pkg/sync"
    37  	"github.com/ttpreport/gvisor-ligolo/pkg/usermem"
    38  )
    39  
    40  // regularFile is a regular (=S_IFREG) tmpfs file.
    41  //
    42  // +stateify savable
    43  type regularFile struct {
    44  	inode inode
    45  
    46  	// memoryUsageKind is the memory accounting category under which pages backing
    47  	// this regularFile's contents are accounted.
    48  	memoryUsageKind usage.MemoryKind
    49  
    50  	// mapsMu protects mappings.
    51  	mapsMu sync.Mutex `state:"nosave"`
    52  
    53  	// mappings tracks mappings of the file into memmap.MappingSpaces.
    54  	//
    55  	// Protected by mapsMu.
    56  	mappings memmap.MappingSet
    57  
    58  	// writableMappingPages tracks how many pages of virtual memory are mapped
    59  	// as potentially writable from this file. If a page has multiple mappings,
    60  	// each mapping is counted separately.
    61  	//
    62  	// This counter is susceptible to overflow as we can potentially count
    63  	// mappings from many VMAs. We count pages rather than bytes to slightly
    64  	// mitigate this.
    65  	//
    66  	// Protected by mapsMu.
    67  	writableMappingPages uint64
    68  
    69  	// dataMu protects the fields below.
    70  	dataMu sync.RWMutex `state:"nosave"`
    71  
    72  	// data maps offsets into the file to offsets into memFile that store
    73  	// the file's data.
    74  	//
    75  	// Protected by dataMu.
    76  	data fsutil.FileRangeSet
    77  
    78  	// seals represents file seals on this inode.
    79  	//
    80  	// Protected by dataMu.
    81  	seals uint32
    82  
    83  	// size is the size of data.
    84  	//
    85  	// Protected by both dataMu and inode.mu; a consistent read requires
    86  	// holding either mutex, while writing requires holding both AND using
    87  	// atomics. Readers that can tolerate racy values (like Stat) may load
    88  	// the value atomically without holding either lock.
    89  	size atomicbitops.Uint64
    90  }
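        // As used in this file, the lock ordering is inode.mu, then mapsMu, then
        // dataMu (compare truncateLocked, AddMapping, and Allocate below).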
    91  
    92  func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode {
    93  	file := &regularFile{
    94  		memoryUsageKind: fs.usage,
    95  		seals:           linux.F_SEAL_SEAL,
    96  	}
    97  	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir)
    98  	file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory
    99  	return &file.inode
   100  }
   101  
   102  // newUnlinkedRegularFileDescription creates a regular file on the tmpfs
   103  // filesystem represented by mount and returns an FD representing that file.
   104  // The new file is not reachable by path traversal from any other file.
   105  //
   106  // newUnlinkedRegularFileDescription is analogous to Linux's
   107  // mm/shmem.c:__shmem_file_setup().
   108  //
   109  // Preconditions: mount must be a tmpfs mount.
   110  func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
   111  	fs, ok := mount.Filesystem().Impl().(*filesystem)
   112  	if !ok {
   113  		panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
   114  	}
   115  
   116  	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */)
   117  	d := fs.newDentry(inode)
   118  	defer d.DecRef(ctx)
   119  	d.name = name
   120  
   121  	fd := &regularFileFD{}
   122  	fd.Init(&inode.locks)
   123  	flags := uint32(linux.O_RDWR)
   124  	if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
   125  		return nil, err
   126  	}
   127  	return fd, nil
   128  }
   129  
   130  // NewZeroFile creates a new regular file and file description as for
   131  // mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
   132  // initially (implicitly) filled with zeroes.
   133  //
   134  // Preconditions: mount must be a tmpfs mount.
   135  func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
   136  	// Compare mm/shmem.c:shmem_zero_setup().
   137  	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
   138  	if err != nil {
   139  		return nil, err
   140  	}
   141  	rf := fd.inode().impl.(*regularFile)
   142  	rf.memoryUsageKind = usage.Anonymous
   143  	rf.size.Store(size)
   144  	return &fd.vfsfd, err
   145  }
   146  
   147  // NewMemfd creates a new regular file and file description as for
   148  // memfd_create.
   149  //
   150  // Preconditions: mount must be a tmpfs mount.
   151  func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
   152  	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
   153  	if err != nil {
   154  		return nil, err
   155  	}
   156  	if allowSeals {
   157  		fd.inode().impl.(*regularFile).seals = 0
   158  	}
   159  	return &fd.vfsfd, nil
   160  }
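        // An illustrative sketch (ctx, creds, and tmpfsMount below are assumed to be
        // available from a hypothetical caller): with allowSeals=true the new file
        // starts with no seals, so seals can be added later via AddSeals.
        //
        //	fd, err := tmpfs.NewMemfd(ctx, creds, tmpfsMount, true /* allowSeals */, "example")
        //	if err != nil {
        //		return err
        //	}
        //	defer fd.DecRef(ctx)
        //	if err := tmpfs.AddSeals(fd, linux.F_SEAL_GROW|linux.F_SEAL_SHRINK); err != nil {
        //		return err
        //	}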
   161  
   162  // truncate grows or shrinks the file to the given size. It returns true if the
   163  // file size was updated.
   164  func (rf *regularFile) truncate(newSize uint64) (bool, error) {
   165  	rf.inode.mu.Lock()
   166  	defer rf.inode.mu.Unlock()
   167  	return rf.truncateLocked(newSize)
   168  }
   169  
   170  // Preconditions:
   171  //   - rf.inode.mu must be held.
   172  //   - rf.dataMu must be locked for writing.
   173  //   - newSize > rf.size.
   174  func (rf *regularFile) growLocked(newSize uint64) error {
   175  	// Can we grow the file?
   176  	if rf.seals&linux.F_SEAL_GROW != 0 {
   177  		return linuxerr.EPERM
   178  	}
   179  	rf.size.Store(newSize)
   180  	return nil
   181  }
   182  
   183  // Preconditions: rf.inode.mu must be held.
   184  func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
   185  	oldSize := rf.size.RacyLoad()
   186  	if newSize == oldSize {
   187  		// Nothing to do.
   188  		return false, nil
   189  	}
   190  
   191  	// Need to hold inode.mu and dataMu while modifying size.
   192  	rf.dataMu.Lock()
   193  	if newSize > oldSize {
   194  		err := rf.growLocked(newSize)
   195  		rf.dataMu.Unlock()
   196  		return err == nil, err
   197  	}
   198  
   199  	// We are shrinking the file. First check if this is allowed.
   200  	if rf.seals&linux.F_SEAL_SHRINK != 0 {
   201  		rf.dataMu.Unlock()
   202  		return false, linuxerr.EPERM
   203  	}
   204  
   205  	rf.size.Store(newSize)
   206  	rf.dataMu.Unlock()
   207  
   208  	// Invalidate past translations of truncated pages.
   209  	oldpgend := offsetPageEnd(int64(oldSize))
   210  	newpgend := offsetPageEnd(int64(newSize))
   211  	if newpgend < oldpgend {
   212  		rf.mapsMu.Lock()
   213  		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
   214  			// Compare Linux's mm/shmem.c:shmem_setattr() =>
   215  			// mm/memory.c:unmap_mapping_range(evencows=1).
   216  			InvalidatePrivate: true,
   217  		})
   218  		rf.mapsMu.Unlock()
   219  	}
   220  
   221  	// We are now guaranteed that there are no translations of truncated pages,
   222  	// and can remove them.
   223  	rf.dataMu.Lock()
   224  	decPages := rf.data.Truncate(newSize, rf.inode.fs.mf)
   225  	rf.dataMu.Unlock()
   226  	rf.inode.fs.unaccountPages(decPages)
   227  	return true, nil
   228  }
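        // For example, assuming a 4 KiB page size: shrinking a 10000-byte file to
        // 5000 bytes invalidates mappings of the truncated pages [8192, 12288) and
        // then releases the backing pages beyond the new size via rf.data.Truncate.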
   229  
   230  // AddMapping implements memmap.Mappable.AddMapping.
   231  func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
   232  	rf.mapsMu.Lock()
   233  	defer rf.mapsMu.Unlock()
   234  	rf.dataMu.RLock()
   235  	defer rf.dataMu.RUnlock()
   236  
   237  	// Reject writable mapping if F_SEAL_WRITE is set.
   238  	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
   239  		return linuxerr.EPERM
   240  	}
   241  
   242  	rf.mappings.AddMapping(ms, ar, offset, writable)
   243  	if writable {
   244  		pagesBefore := rf.writableMappingPages
   245  
   246  		// ar is guaranteed to be page aligned per memmap.Mappable.
   247  		rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize)
   248  
   249  		if rf.writableMappingPages < pagesBefore {
   250  			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
   251  		}
   252  	}
   253  
   254  	return nil
   255  }
   256  
   257  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   258  func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
   259  	rf.mapsMu.Lock()
   260  	defer rf.mapsMu.Unlock()
   261  
   262  	rf.mappings.RemoveMapping(ms, ar, offset, writable)
   263  
   264  	if writable {
   265  		pagesBefore := rf.writableMappingPages
   266  
   267  		// ar is guaranteed to be page aligned per memmap.Mappable.
   268  		rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize)
   269  
   270  		if rf.writableMappingPages > pagesBefore {
   271  			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
   272  		}
   273  	}
   274  }
   275  
   276  // CopyMapping implements memmap.Mappable.CopyMapping.
   277  func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
   278  	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
   279  }
   280  
   281  // Translate implements memmap.Mappable.Translate.
   282  func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   283  	rf.dataMu.Lock()
   284  	defer rf.dataMu.Unlock()
   285  
   286  	// Constrain translations to rf.size (rounded up) to prevent translation
   287  	// to pages that may be concurrently truncated.
   288  	pgend := offsetPageEnd(int64(rf.size.RacyLoad()))
   289  	var beyondEOF bool
   290  	if required.End > pgend {
   291  		if required.Start >= pgend {
   292  			return nil, &memmap.BusError{io.EOF}
   293  		}
   294  		beyondEOF = true
   295  		required.End = pgend
   296  	}
   297  	if optional.End > pgend {
   298  		optional.End = pgend
   299  	}
   300  	pagesToFill := rf.data.PagesToFill(required, optional)
   301  	if !rf.inode.fs.accountPages(pagesToFill) {
   302  		// If we cannot accommodate pagesToFill pages, retry with just the
   303  		// required range, since optional may be larger than required. Only
   304  		// error out if even the required range cannot be allocated.
   305  		pagesToFill = rf.data.PagesToFill(required, required)
   306  		if !rf.inode.fs.accountPages(pagesToFill) {
   307  			return nil, &memmap.BusError{linuxerr.ENOSPC}
   308  		}
   309  		optional = required
   310  	}
   311  	pagesAlloced, cerr := rf.data.Fill(ctx, required, optional, rf.size.RacyLoad(), rf.inode.fs.mf, rf.memoryUsageKind, pgalloc.AllocateOnly, nil /* r */)
   312  	// rf.data.Fill() may fail mid-way. We still want to account any pages that
   313  	// were allocated, irrespective of an error.
   314  	rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
   315  
   316  	var ts []memmap.Translation
   317  	var translatedEnd uint64
   318  	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
   319  		segMR := seg.Range().Intersect(optional)
   320  		ts = append(ts, memmap.Translation{
   321  			Source: segMR,
   322  			File:   rf.inode.fs.mf,
   323  			Offset: seg.FileRangeOf(segMR).Start,
   324  			Perms:  hostarch.AnyAccess,
   325  		})
   326  		translatedEnd = segMR.End
   327  	}
   328  
   329  	// Don't return the error returned by rf.data.Fill if it occurred outside
   330  	// of required.
   331  	if translatedEnd < required.End && cerr != nil {
   332  		return ts, &memmap.BusError{cerr}
   333  	}
   334  	if beyondEOF {
   335  		return ts, &memmap.BusError{io.EOF}
   336  	}
   337  	return ts, nil
   338  }
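        // For example, assuming a 4 KiB page size and a 5000-byte file, pgend is
        // 8192: a required range starting at or beyond 8192 yields BusError(io.EOF)
        // immediately, while a range straddling 8192 is truncated there and the
        // translations are returned together with BusError(io.EOF).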
   339  
   340  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
   341  func (*regularFile) InvalidateUnsavable(context.Context) error {
   342  	return nil
   343  }
   344  
   345  // +stateify savable
   346  type regularFileFD struct {
   347  	fileDescription
   348  
   349  	// off is the file offset. off is accessed using atomic memory operations.
   350  	// offMu serializes operations that may mutate off.
   351  	off   int64
   352  	offMu sync.Mutex `state:"nosave"`
   353  }
   354  
   355  // Release implements vfs.FileDescriptionImpl.Release.
   356  func (fd *regularFileFD) Release(context.Context) {
   357  	// noop
   358  }
   359  
   360  // Allocate implements vfs.FileDescriptionImpl.Allocate.
   361  func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
   362  	f := fd.inode().impl.(*regularFile)
   363  
   364  	f.inode.mu.Lock()
   365  	defer f.inode.mu.Unlock()
   366  	f.dataMu.Lock()
   367  	defer f.dataMu.Unlock()
   368  
   369  	// We must allocate pages in the range specified by offset and length.
   370  	// Even if newSize <= oldSize, there might not be actual memory backing this
   371  	// range, so any gaps must be filled by calling f.data.Fill().
   372  	// "After a successful call, subsequent writes into the range
   373  	// specified by offset and len are guaranteed not to fail because of
   374  	// lack of disk space."  - fallocate(2)
   375  	newSize := offset + length
   376  	pgstartaddr := hostarch.Addr(offset).RoundDown()
   377  	pgendaddr, ok := hostarch.Addr(newSize).RoundUp()
   378  	if !ok {
   379  		return linuxerr.EFBIG
   380  	}
   381  	required := memmap.MappableRange{Start: uint64(pgstartaddr), End: uint64(pgendaddr)}
   382  	pagesToFill := f.data.PagesToFill(required, required)
   383  	if !f.inode.fs.accountPages(pagesToFill) {
   384  		return linuxerr.ENOSPC
   385  	}
   386  	// Given our definitions in pgalloc, fallocate(2) semantics imply that pages
   387  	// in the MemoryFile must be committed, in addition to being allocated.
   388  	allocMode := pgalloc.AllocateAndCommit
   389  	if !f.inode.fs.mf.IsDiskBacked() {
   390  		// Upgrade to AllocateAndWritePopulate for memory(shmem)-backed files. We
   391  		// take a more aggressive approach in populating pages for memory-backed
   392  		// MemoryFiles. shmem pages are subject to swap rather than disk writeback.
   393  		// They are not likely to be swapped before they are written to. Hence it
   394  		// is beneficial to populate (in addition to commit) shmem pages to avoid
   395  		// faulting page-by-page when these pages are written to in the future.
   396  		allocMode = pgalloc.AllocateAndWritePopulate
   397  	}
   398  	pagesAlloced, err := f.data.Fill(ctx, required, required, newSize, f.inode.fs.mf, f.memoryUsageKind, allocMode, nil /* r */)
   399  	// f.data.Fill() may fail mid-way. We still want to account any pages that
   400  	// were allocated, irrespective of an error.
   401  	f.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
   402  	if err != nil && err != io.EOF {
   403  		return err
   404  	}
   405  
   406  	oldSize := f.size.Load()
   407  	if oldSize >= newSize {
   408  		return nil
   409  	}
   410  	return f.growLocked(newSize)
   411  }
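        // For example, assuming a 4 KiB page size, Allocate(ctx, 0, 100, 200) fills
        // the single backing page covering [0, 4096) and, if the file was previously
        // smaller than 300 bytes, grows it to 300 bytes.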
   412  
   413  // PRead implements vfs.FileDescriptionImpl.PRead.
   414  func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   415  	start := fsmetric.StartReadWait()
   416  	defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
   417  	fsmetric.TmpfsReads.Increment()
   418  
   419  	if offset < 0 {
   420  		return 0, linuxerr.EINVAL
   421  	}
   422  
   423  	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
   424  	// all state is in-memory.
   425  	//
   426  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   427  	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
   428  		return 0, linuxerr.EOPNOTSUPP
   429  	}
   430  
   431  	if dst.NumBytes() == 0 {
   432  		return 0, nil
   433  	}
   434  	f := fd.inode().impl.(*regularFile)
   435  	rw := getRegularFileReadWriter(f, offset, 0)
   436  	n, err := dst.CopyOutFrom(ctx, rw)
   437  	putRegularFileReadWriter(rw)
   438  	fd.inode().touchAtime(fd.vfsfd.Mount())
   439  	return n, err
   440  }
   441  
   442  // Read implements vfs.FileDescriptionImpl.Read.
   443  func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   444  	fd.offMu.Lock()
   445  	n, err := fd.PRead(ctx, dst, fd.off, opts)
   446  	fd.off += n
   447  	fd.offMu.Unlock()
   448  	return n, err
   449  }
   450  
   451  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   452  func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   453  	n, _, err := fd.pwrite(ctx, src, offset, opts)
   454  	return n, err
   455  }
   456  
   457  // pwrite returns the number of bytes written, final offset and error. The
   458  // final offset should be ignored by PWrite.
   459  func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
   460  	if offset < 0 {
   461  		return 0, offset, linuxerr.EINVAL
   462  	}
   463  
   464  	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
   465  	// all state is in-memory.
   466  	//
   467  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   468  	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
   469  		return 0, offset, linuxerr.EOPNOTSUPP
   470  	}
   471  
   472  	srclen := src.NumBytes()
   473  	if srclen == 0 {
   474  		return 0, offset, nil
   475  	}
   476  	f := fd.inode().impl.(*regularFile)
   477  	f.inode.mu.Lock()
   478  	defer f.inode.mu.Unlock()
   479  	// If the file is opened with O_APPEND, update offset to file size.
   480  	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
   481  		// Locking f.inode.mu is sufficient for reading f.size.
   482  		offset = int64(f.size.RacyLoad())
   483  	}
   484  	end := offset + srclen
   485  	if end < offset {
   486  		// Overflow.
   487  		return 0, offset, linuxerr.EINVAL
   488  	}
   489  
   490  	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
   491  	if err != nil {
   492  		return 0, offset, err
   493  	}
   494  	src = src.TakeFirst64(srclen)
   495  
   496  	// Perform the write.
   497  	rw := getRegularFileReadWriter(f, offset, pgalloc.MemoryCgroupIDFromContext(ctx))
   498  	n, err := src.CopyInTo(ctx, rw)
   499  
   500  	f.inode.touchCMtimeLocked()
   501  	for {
   502  		old := f.inode.mode.Load()
   503  		new := vfs.ClearSUIDAndSGID(old)
   504  		if swapped := f.inode.mode.CompareAndSwap(old, new); swapped {
   505  			break
   506  		}
   507  	}
   508  	putRegularFileReadWriter(rw)
   509  	return n, n + offset, err
   510  }
   511  
   512  // Write implements vfs.FileDescriptionImpl.Write.
   513  func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   514  	fd.offMu.Lock()
   515  	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
   516  	fd.off = off
   517  	fd.offMu.Unlock()
   518  	return n, err
   519  }
   520  
   521  // Seek implements vfs.FileDescriptionImpl.Seek.
   522  func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   523  	fd.offMu.Lock()
   524  	defer fd.offMu.Unlock()
   525  	switch whence {
   526  	case linux.SEEK_SET:
   527  		// use offset as specified
   528  	case linux.SEEK_CUR:
   529  		offset += fd.off
   530  	case linux.SEEK_END:
   531  		offset += int64(fd.inode().impl.(*regularFile).size.Load())
   532  	default:
   533  		return 0, linuxerr.EINVAL
   534  	}
   535  	if offset < 0 {
   536  		return 0, linuxerr.EINVAL
   537  	}
   538  	fd.off = offset
   539  	return offset, nil
   540  }
   541  
   542  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
   543  func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
   544  	file := fd.inode().impl.(*regularFile)
   545  	opts.SentryOwnedContent = true
   546  	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
   547  }
   548  
   549  // offsetPageEnd returns the file offset rounded up to the nearest
   550  // page boundary. offsetPageEnd panics if rounding up causes overflow,
   551  // which shouldn't be possible given that offset is an int64.
   552  func offsetPageEnd(offset int64) uint64 {
   553  	end, ok := hostarch.Addr(offset).RoundUp()
   554  	if !ok {
   555  		panic("impossible overflow")
   556  	}
   557  	return uint64(end)
   558  }
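        // For example, assuming a 4 KiB page size, offsetPageEnd(0) == 0 and
        // offsetPageEnd(1) == 4096.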
   559  
   560  // regularFileReadWriter implements safemem.Reader and safemem.Writer.
   561  type regularFileReadWriter struct {
   562  	file *regularFile
   563  
   564  	// Offset into the file to read/write at. Note that this may be
   565  	// different from the FD offset if PRead/PWrite is used.
   566  	off uint64
   567  
   568  	// memCgID is the memory cgroup ID used for accounting the allocated
   569  	// pages.
   570  	memCgID uint32
   571  }
   572  
   573  var regularFileReadWriterPool = sync.Pool{
   574  	New: func() any {
   575  		return &regularFileReadWriter{}
   576  	},
   577  }
   578  
   579  func getRegularFileReadWriter(file *regularFile, offset int64, memCgID uint32) *regularFileReadWriter {
   580  	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
   581  	rw.file = file
   582  	rw.off = uint64(offset)
   583  	rw.memCgID = memCgID
   584  	return rw
   585  }
   586  
   587  func putRegularFileReadWriter(rw *regularFileReadWriter) {
   588  	rw.file = nil
   589  	regularFileReadWriterPool.Put(rw)
   590  }
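        // The pooled read/writer is always used in a get/use/put pattern, as in PRead
        // and pwrite above: get a rw, pass it to CopyOutFrom/CopyInTo, then return it
        // to the pool so that its *regularFile reference is dropped promptly.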
   591  
   592  // ReadToBlocks implements safemem.Reader.ReadToBlocks.
   593  func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
   594  	rw.file.dataMu.RLock()
   595  	defer rw.file.dataMu.RUnlock()
   596  	size := rw.file.size.RacyLoad()
   597  
   598  	// Compute the range to read (limited by file size and overflow-checked).
   599  	if rw.off >= size {
   600  		return 0, io.EOF
   601  	}
   602  	end := size
   603  	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
   604  		end = rend
   605  	}
   606  
   607  	var done uint64
   608  	seg, gap := rw.file.data.Find(uint64(rw.off))
   609  	for rw.off < end {
   610  		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
   611  		switch {
   612  		case seg.Ok():
   613  			// Get internal mappings.
   614  			ims, err := rw.file.inode.fs.mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
   615  			if err != nil {
   616  				return done, err
   617  			}
   618  
   619  			// Copy from internal mappings.
   620  			n, err := safemem.CopySeq(dsts, ims)
   621  			done += n
   622  			rw.off += uint64(n)
   623  			dsts = dsts.DropFirst64(n)
   624  			if err != nil {
   625  				return done, err
   626  			}
   627  
   628  			// Continue.
   629  			seg, gap = seg.NextNonEmpty()
   630  
   631  		case gap.Ok():
   632  			// Tmpfs holes are zero-filled.
   633  			gapmr := gap.Range().Intersect(mr)
   634  			dst := dsts.TakeFirst64(gapmr.Length())
   635  			n, err := safemem.ZeroSeq(dst)
   636  			done += n
   637  			rw.off += uint64(n)
   638  			dsts = dsts.DropFirst64(n)
   639  			if err != nil {
   640  				return done, err
   641  			}
   642  
   643  			// Continue.
   644  			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
   645  		}
   646  	}
   647  	return done, nil
   648  }
   649  
   650  // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
   651  //
   652  // Preconditions: rw.file.inode.mu must be held.
   653  func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
   654  	// Hold dataMu so we can modify size.
   655  	rw.file.dataMu.Lock()
   656  	defer rw.file.dataMu.Unlock()
   657  
   658  	// Compute the range to write (overflow-checked).
   659  	end := rw.off + srcs.NumBytes()
   660  	if end <= rw.off {
   661  		end = math.MaxInt64
   662  	}
   663  
   664  	// Check if seals prevent either file growth or all writes.
   665  	switch {
   666  	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
   667  		return 0, linuxerr.EPERM
   668  	case end > rw.file.size.RacyLoad() && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
   669  		// When growth is sealed, Linux effectively allows writes which would
   670  		// normally grow the file to partially succeed up to the current EOF,
   671  		// rounded down to the page boundary before the EOF.
   672  		//
   673  		// This happens because writes (and thus the growth check) for tmpfs
   674  		// files proceed page-by-page on Linux, and the final write to the page
   675  		// containing EOF fails, resulting in a partial write up to the start of
   676  		// that page.
   677  		//
   678  		// To emulate this behaviour, artificially truncate the write to the
   679  		// start of the page containing the current EOF.
   680  		//
   681  		// See Linux, mm/filemap.c:generic_perform_write() and
   682  		// mm/shmem.c:shmem_write_begin().
   683  		if pgstart := uint64(hostarch.Addr(rw.file.size.RacyLoad()).RoundDown()); end > pgstart {
   684  			end = pgstart
   685  		}
   686  		if end <= rw.off {
   687  			// Truncation would result in no data being written.
   688  			return 0, linuxerr.EPERM
   689  		}
   690  	}
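        	// For example, assuming a 4 KiB page size and a grow-sealed 6000-byte file:
        	// a 4000-byte write at offset 3000 is truncated at offset 4096 and writes
        	// 1096 bytes, while a write starting at offset 5000 fails with EPERM because
        	// the truncated range would be empty.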
   691  
   692  	// Page-aligned mr for when we need to allocate memory. RoundUp can't
   693  	// overflow since end is at most math.MaxInt64.
   694  	pgstartaddr := hostarch.Addr(rw.off).RoundDown()
   695  	pgendaddr, _ := hostarch.Addr(end).RoundUp()
   696  	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
   697  
   698  	var (
   699  		done   uint64
   700  		retErr error
   701  	)
   702  	seg, gap := rw.file.data.Find(uint64(rw.off))
   703  	for rw.off < end {
   704  		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
   705  		switch {
   706  		case seg.Ok():
   707  			n, err := rw.writeToMF(seg.FileRangeOf(seg.Range().Intersect(mr)), srcs)
   708  			done += n
   709  			rw.off += uint64(n)
   710  			srcs = srcs.DropFirst64(n)
   711  			if err != nil {
   712  				retErr = err
   713  				goto exitLoop
   714  			}
   715  
   716  			// Continue.
   717  			seg, gap = seg.NextNonEmpty()
   718  
   719  		case gap.Ok():
   720  			// Allocate memory for the write.
   721  			gapMR := gap.Range().Intersect(pgMR)
   722  			pagesToFill := gapMR.Length() / hostarch.PageSize
   723  			pagesReserved := rw.file.inode.fs.accountPagesPartial(pagesToFill)
   724  			if pagesReserved == 0 {
   725  				if done == 0 {
   726  					retErr = linuxerr.ENOSPC
   727  					goto exitLoop
   728  				}
   729  				retErr = nil
   730  				goto exitLoop
   731  			}
   732  			gapMR.End = gapMR.Start + (hostarch.PageSize * pagesReserved)
   733  			allocMode := pgalloc.AllocateAndWritePopulate
   734  			if rw.file.inode.fs.mf.IsDiskBacked() {
   735  				// Don't populate pages for disk-backed files. Benchmarking showed that
   736  				// disk-backed pages are likely to be written back to disk before we
   737  				// can write to them. The pages fault again on write anyway. In total,
   738  				// prepopulating disk-backed pages deteriorates performance as it fails
   739  				// to eliminate future page faults and we also additionally incur
   740  				// useless disk writebacks.
   741  				allocMode = pgalloc.AllocateOnly
   742  			}
   743  			fr, err := rw.file.inode.fs.mf.Allocate(gapMR.Length(), pgalloc.AllocOpts{
   744  				Kind:    rw.file.memoryUsageKind,
   745  				Mode:    allocMode,
   746  				MemCgID: rw.memCgID,
   747  			})
   748  			if err != nil {
   749  				retErr = err
   750  				rw.file.inode.fs.unaccountPages(pagesReserved)
   751  				goto exitLoop
   752  			}
   753  
   754  			// Write to that memory as usual.
   755  			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
   756  		default:
   757  			panic("unreachable")
   758  		}
   759  	}
   760  exitLoop:
   761  	// If the write ends beyond the file's previous size, it causes the
   762  	// file to grow.
   763  	if rw.off > rw.file.size.RacyLoad() {
   764  		rw.file.size.Store(rw.off)
   765  	}
   766  
   767  	return done, retErr
   768  }
   769  
   770  func (rw *regularFileReadWriter) writeToMF(fr memmap.FileRange, srcs safemem.BlockSeq) (uint64, error) {
   771  	if rw.file.inode.fs.mf.IsDiskBacked() {
   772  		// Disk-backed files are not prepopulated. The safemem.CopySeq() approach
   773  		// used below incurs a lot of page faults without page prepopulation, which
   774  		// causes a lot of context switching. Use the pwritev2(2) host syscall
   775  		// instead, which makes one context switch and faults all the pages that
   776  		// are touched during the write.
   777  		return hostfd.Pwritev2(
   778  			int32(rw.file.inode.fs.mf.FD()), // fd
   779  			srcs.TakeFirst64(fr.Length()),   // srcs
   780  			int64(fr.Start),                 // offset
   781  			0,                               // flags
   782  		)
   783  	}
   784  	// Get internal mappings.
   785  	ims, err := rw.file.inode.fs.mf.MapInternal(fr, hostarch.Write)
   786  	if err != nil {
   787  		return 0, err
   788  	}
   789  	// Copy to internal mappings.
   790  	return safemem.CopySeq(ims, srcs)
   791  }
   792  
   793  // GetSeals returns the current set of seals on a memfd inode.
   794  func GetSeals(fd *vfs.FileDescription) (uint32, error) {
   795  	f, ok := fd.Impl().(*regularFileFD)
   796  	if !ok {
   797  		return 0, linuxerr.EINVAL
   798  	}
   799  	rf := f.inode().impl.(*regularFile)
   800  	rf.dataMu.RLock()
   801  	defer rf.dataMu.RUnlock()
   802  	return rf.seals, nil
   803  }
   804  
   805  // AddSeals adds new file seals to a memfd inode.
   806  func AddSeals(fd *vfs.FileDescription, val uint32) error {
   807  	f, ok := fd.Impl().(*regularFileFD)
   808  	if !ok {
   809  		return linuxerr.EINVAL
   810  	}
   811  	rf := f.inode().impl.(*regularFile)
   812  	rf.mapsMu.Lock()
   813  	defer rf.mapsMu.Unlock()
   814  	rf.dataMu.Lock()
   815  	defer rf.dataMu.Unlock()
   816  
   817  	if rf.seals&linux.F_SEAL_SEAL != 0 {
   818  		// F_SEAL_SEAL is set, which prevents the addition of any new seals.
   819  		return linuxerr.EPERM
   820  	}
   821  
   822  	// F_SEAL_WRITE can only be added if there are no active writable maps.
   823  	if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
   824  		if rf.writableMappingPages > 0 {
   825  			return linuxerr.EBUSY
   826  		}
   827  	}
   828  
   829  	// Seals can only be added, never removed.
   830  	rf.seals |= val
   831  	return nil
   832  }